miniscan/src/SimpleScanner.m3


---------------------------------------------------------------------------
MODULE SimpleScanner;

IMPORT ASCII, Rd, TextSetDef, Text, Thread, Fmt;
IMPORT ScanToken, ScanTokenSeq, SMsg AS Msg;
---------------------------------------------------------------------------
CONST Backslash = '\\';
---------------------------------------------------------------------------
REVEAL
  T = Public BRANDED "SimpleScanner Type 0.0" OBJECT
    (* public variables
       skipComments    : BOOLEAN;         default TRUE
       nestingComments : BOOLEAN;         default TRUE
       oneLineComments : BOOLEAN;         default TRUE
       commentOpenSym  : TEXT;            default `/*'
       commentCloseSym : TEXT;            default `*/'
       lineCommentSym  : TEXT;            default '#'
       stringOpenSym   : TEXT;            default `"'
       stringCloseSym  : TEXT;            default `"'
       identSymbols    : ASCII.Set;       default ASCII.AlphaNumerics + {'_'}
       whiteSpace      : ASCII.Set;       default ASCII.Spaces
       specialSymbols  : ASCII.Set;       default DefaultSpecialChars
       compoundToken   : TextSet.T;       default {`:=', `<=', `>=', `->'}
       keywordToken    : TextSet.T;       default {}
    *)
    endOfFile : BOOLEAN;
    rd        : Rd.T;
    line, col : CARDINAL;
    actLen    : CARDINAL;
    actLine   : TEXT;
    actData   : TEXT;
    stack     : ScanTokenSeq.T;
  METHODS
    nextLine() RAISES {Error} := NextLine;
    nextChar() : CHAR RAISES {Error} := NextChar;
    actChar() : CHAR RAISES {Error} := ActChar;
    pos() : TEXT := Pos;
    errorState() : TEXT := ErrorState;
  OVERRIDES
    init := Init;
    nextToken := NextToken;
    pushBack := PushBack;
    eof := Eof;
  END;
---------------------------------------------------------------------------
PROCEDURE Init(self : T; inputStream : Rd.T) : T =
  BEGIN
    self.skipComments    := TRUE;
    self.nestingComments := TRUE;
    self.oneLineComments := TRUE;
    self.commentOpenSym  := "/*";
    self.commentCloseSym := "*/";
    self.lineCommentSym  := "#";
    self.stringOpenSym   := "\"";
    self.stringCloseSym  := "\"";
    self.identSymbols    := ASCII.AlphaNumerics + ASCII.Set{'_'};
    self.whiteSpace      := ASCII.Spaces;
    self.compoundToken   := NEW(TextSetDef.T).init();
    self.keywordToken    := NEW(TextSetDef.T).init();
    self.rd              := inputStream;
    self.specialSymbols  := DefaultSpecialChars;
    EVAL self.compoundToken.insert(":=");
    EVAL self.compoundToken.insert("<=");
    EVAL self.compoundToken.insert(">=");
    EVAL self.compoundToken.insert("->");
    self.line := 0;
    self.col  := 0;
    self.actLine := "";
    self.actLen  := 0;
    self.actData := "";
    self.stack := NIL; (* allocate stack only on need *)
    RETURN self;
  END Init;
---------------------------------------------------------------------------
PROCEDURE Pos(self : T) : TEXT =
  BEGIN
    RETURN "line " & Fmt.Int(self.line) & " column " & Fmt.Int(self.col);
  END Pos;
---------------------------------------------------------------------------
PROCEDURE ErrorState(self : T) : TEXT =
  VAR c : CHAR; cs : TEXT;
  BEGIN
    TRY
      c  := self.actChar();
      cs := Text.FromChar(c);
    EXCEPT ELSE
      cs := "undefined";
    END;
    RETURN "\n  current symbol:   `" & cs & "'" &
           "\n  current line:     `" & self.actLine & "'" &
           "\n  accumulated data: `" & self.actData & "'";
  END ErrorState;
---------------------------------------------------------------------------
PROCEDURE NextLine(self : T) RAISES {Error} =
  BEGIN
    IF self.col >= Text.Length(self.actLine) THEN
      TRY
        self.actLine := Rd.GetLine(self.rd) & "\n";
        self.col := 0;
        self.actLen := Text.Length(self.actLine);
        INC(self.line);
      EXCEPT
        Rd.EndOfFile   => self.endOfFile := TRUE;
      | Rd.Failure     => RAISE Error("error reading input stream");
      | Thread.Alerted => RAISE Error("interrupted while scanning");
      END;
    END;
  END NextLine;
---------------------------------------------------------------------------
PROCEDURE ActChar(self : T) : CHAR  RAISES {Error} =
  BEGIN
    IF self.col < self.actLen THEN
      RETURN Text.GetChar(self.actLine, self.col);
    ELSE
      RAISE Error(NIL);
    END;
  END ActChar;
---------------------------------------------------------------------------
PROCEDURE NextChar(self : T) : CHAR  RAISES {Error} =
  VAR c : CHAR;
  BEGIN
    WHILE (NOT self.endOfFile) AND (self.col >= self.actLen) DO
      NextLine(self);
    END;
    IF self.endOfFile THEN
      c := '?';
    ELSE
      c := Text.GetChar(self.actLine, self.col);
      INC(self.col);
    END;
    RETURN c;
  END NextChar;
---------------------------------------------------------------------------
PROCEDURE SkipWhiteSpace(self : T) RAISES {Error} =
  BEGIN
    WHILE (NOT self.endOfFile) AND (self.nextChar() IN self.whiteSpace) DO
      (* nothing *)
    END;
    IF self.col > 0 THEN
      DEC(self.col);
    END;
  END SkipWhiteSpace;
---------------------------------------------------------------------------
PROCEDURE ReadSymbols(self : T; set : ASCII.Set) RAISES {Error} =
  VAR c := self.nextChar();
  BEGIN
    WHILE (NOT self.endOfFile) AND (c IN set) DO
      self.actData := self.actData & Text.FromChar(c);
      c := self.nextChar();
    END;
    IF self.col > 0 THEN
      DEC(self.col);
    ELSE
      RAISE Error("ReadSymbols: cannot push back character");
    END;
  END ReadSymbols;
---------------------------------------------------------------------------
PROCEDURE Adjust(self : T; token : TEXT) RAISES {Error} =
  BEGIN
    WITH sub = Text.Length(self.actData) - Text.Length(token) DO
      IF self.col >= sub THEN
        DEC(self.col, sub);
      ELSE
        RAISE Error("Adjust: cannot push back characters");
      END;
    END;
  END Adjust;
---------------------------------------------------------------------------
PROCEDURE Matches(self : T; token : TEXT) : BOOLEAN =
  BEGIN
    WITH tlen = Text.Length(token) DO
      WITH dlen = Text.Length(self.actData) DO
        IF tlen > dlen THEN
          RETURN FALSE;
        END;
        RETURN Text.Equal(Text.Sub(self.actData, 0, tlen), token);
      END;
    END;
  END Matches;
---------------------------------------------------------------------------
PROCEDURE ReadString(self : T) RAISES {Error} =
  CONST OctalSet = ASCII.Set{'0'..'7'};
  VAR
    s := ASCII.All;
    t := ASCII.Set{};
    d :  TEXT;
  BEGIN
    WITH slen = Text.Length(self.stringOpenSym) DO
      self.actData := Text.Sub(self.actData, slen);
    END;
    (* string open symbol stripped off *)
    IF Text.Length(self.actData) > 0 THEN
      (* check if the string end has already been read *)
      (* lots of special symbols may have been accumulated in actData
         by ReadSymbols, eg. "\\" or "\\\" *)
      (* We just throw them away (to be read again), because there is no
         other way to interpret the escape sequences in strings *)
      Adjust(self, "");
      self.actData := "";
    END;
    FOR i := 0 TO Text.Length(self.stringCloseSym) - 1 DO
      t := t + ASCII.Set{Text.GetChar(self.stringCloseSym, i)};
    END;
    s := s - ASCII.Set{Backslash};
    s := s - t;
    LOOP
      ReadSymbols(self, s);
      IF self.actChar() = Backslash THEN
        EVAL self.nextChar();
        VAR n := self.nextChar(); BEGIN
          IF n = 'n' THEN
            n := ASCII.NL;
          ELSIF n = 'r' THEN
            n := ASCII.CR;
          ELSIF n = 't' THEN
            n := ASCII.HT;
          ELSIF n = 'h' THEN
            n := ASCII.BS;
          ELSIF n = 'g' THEN
            n := ASCII.BEL;
          ELSIF n = '[' THEN
            n := ASCII.ESC;
          ELSIF n IN OctalSet THEN
            VAR val := ORD(n) - ORD('0'); numOct := 1; BEGIN
              n := self.nextChar();
              WHILE n IN OctalSet AND numOct < 3 DO
                INC(numOct);
                val := val * 8 + ORD(n) - ORD('0');
                n := self.nextChar();
              END;
              DEC(self.col); (* last char must be read again *)
              val := val MOD 256;
              n := VAL(val, CHAR);
            END;
          END;
          self.actData := self.actData & Text.FromChar(n);
        END;
      ELSE
        d := self.actData;
        self.actData := "";
        ReadSymbols(self, t);
        IF Matches(self, self.stringCloseSym) THEN
          Adjust(self, self.stringCloseSym);
          self.actData := d;
          RETURN;
        ELSE
          self.actData := d & Text.Sub(self.actData, 0, 1);
          Adjust(self, "x");
        END;
      END;
    END;
  END ReadString;
---------------------------------------------------------------------------
PROCEDURE ReadOneLineComment(self : T) =
  BEGIN
    self.actData := Text.Sub(self.actLine, self.col);
    self.col := self.actLen + 1;
  END ReadOneLineComment;
---------------------------------------------------------------------------
PROCEDURE ReadMultiLineComment(self : T) RAISES {Error} =
  VAR
    s := ASCII.All;
    t := ASCII.Set{};
    d :  TEXT;
    level := 1;
  BEGIN
    WITH slen = Text.Length(self.commentOpenSym) DO
      self.actData := Text.Sub(self.actData, slen);
    END;
    FOR i := 0 TO Text.Length(self.commentOpenSym) - 1 DO
      t := t + ASCII.Set{Text.GetChar(self.commentOpenSym, i)};
    END;
    FOR i := 0 TO Text.Length(self.commentCloseSym) - 1 DO
      t := t + ASCII.Set{Text.GetChar(self.commentCloseSym, i)};
    END;
    s := s - ASCII.Set{Backslash};
    s := s - t;
    LOOP
      ReadSymbols(self, s);
      IF self.actChar() = Backslash THEN
        EVAL self.nextChar();
        self.actData := self.actData & Text.FromChar(self.nextChar());
      ELSE
        d := self.actData;
        self.actData := "";
        ReadSymbols(self, t);
        IF Matches(self, self.commentCloseSym) THEN
          DEC(level);
          IF NOT self.nestingComments OR level = 0 THEN
            Adjust(self, self.commentCloseSym);
            self.actData := d;
            RETURN;
          ELSE
            Adjust(self, "x");
            self.actData := d & Text.Sub(self.actData, 0, 1);
          END;
        ELSIF Matches(self, self.commentOpenSym) THEN
          INC(level);
          Adjust(self, self.commentOpenSym);
          self.actData := d & self.commentOpenSym;
        ELSE
          Adjust(self, "x");
          self.actData := d & Text.Sub(self.actData, 0, 1);
        END;
      END;
    END;
  END ReadMultiLineComment;
---------------------------------------------------------------------------
PROCEDURE NextToken(self : T) : Token RAISES {Error} =
  VAR token : Token;
  BEGIN
    Debug("scanner.nextToken");
    (* if there are tokens on the stack, return the top one *)
    IF self.stack # NIL AND self.stack.size() > 0 THEN
      token := self.stack.remlo();
      DebugToken(token);
      RETURN token;
    END;
    (* stack is empty *)
    token := NEW(Token);
    LOOP (* read until a complete syntactical element has been recognized *)
      SkipWhiteSpace(self);
      self.actData := "";
      token.line := self.line;
      token.col  := self.col;
      (* new token refers to correct position *)
      IF self.endOfFile THEN
        (* first check for end of file *)
        token.kind := ScanToken.Kind.EndOfFile;
        token.repr := "";
        EXIT;
      (* no end of file *)
      ELSIF self.actChar() IN self.identSymbols THEN
        (* keyword or identifier found *)
        ReadSymbols(self, self.identSymbols);
        token.repr := self.actData;
        IF self.keywordToken.member(token.repr) THEN
          token.kind := ScanToken.Kind.Keyword;
        ELSE
          token.kind := ScanToken.Kind.Ident;
        END;
        EXIT;
      (* no end of file, no keyword, no identifier *)
      ELSIF self.actChar() IN self.specialSymbols THEN
        (* last chance: anything consisting of special symbols *)
        ReadSymbols(self, self.specialSymbols);
        IF Matches(self, self.lineCommentSym) AND self.oneLineComments THEN
          (* one line comment found *)
          ReadOneLineComment(self);
          IF NOT self.skipComments THEN
            (* only return it if comments are not to be skipped *)
            token.kind := ScanToken.Kind.Comment;
            token.repr := self.actData;
            EXIT;
          END;
          (* continue at start of loop *)
        ELSIF Matches(self, self.commentOpenSym) THEN
          (* multi line comment found *)
          ReadMultiLineComment(self);
          IF NOT self.skipComments THEN
            (* only return it if comments are not to be skipped *)
            token.kind := ScanToken.Kind.Comment;
            token.repr := self.actData;
            EXIT;
          END;
          (* continue at start of loop *)
        ELSE
          (* check for compound symbol *)
          VAR iter  := self.compoundToken.iterate();
              tok   :  TEXT;
              found := FALSE;
          BEGIN
            WHILE iter.next(tok) DO
              IF Matches(self, tok) THEN
                token.repr := tok;
                token.kind := ScanToken.Kind.CompoundSymbol;
                Adjust(self, tok);
                found := TRUE;
                EXIT;
              END;
            END;
            IF found THEN EXIT END;
            (* no compound token but special symbol *)
            IF Matches(self, self.stringOpenSym) THEN
              (* string found *)
              ReadString(self);
              token.repr := self.actData;
              token.kind := ScanToken.Kind.String;
            ELSE
              (* single special character found *)
              token.repr := Text.Sub(self.actData, 0, 1);
              token.kind := ScanToken.Kind.Other;
              Adjust(self, token.repr);
            END;
            EXIT;
          END;
        END;
      ELSE
        (* what we found does not belong to any defined lexical class *)
        VAR err := "Unexpected symbol at " & self.pos() & ": " &
              self.errorState();
        BEGIN
          INC(self.col);
          RAISE Error(err);
        END;
      END;
    END;
    DebugToken(token);
    RETURN token;
  END NextToken;
---------------------------------------------------------------------------
PROCEDURE PushBack(self : T; t : Token) =
  BEGIN
    Debug("scanner.pushBack");
    DebugToken(t);
    IF self.stack = NIL THEN
      self.stack := NEW(ScanTokenSeq.T).init(5);
    END;
    self.stack.addlo(t);
  END PushBack;
---------------------------------------------------------------------------
PROCEDURE Eof(self : T) : BOOLEAN =
  BEGIN
    RETURN self.endOfFile;
  END Eof;
---------------------------------------------------------------------------
PROCEDURE Debug(msg : TEXT) =
  BEGIN
    IF debugScanner THEN
      IF Msg.dFlag THEN
        Msg.D(msg);
      ELSIF Msg.tFlag THEN
        Msg.T(msg);
      ELSE
        Msg.Debug(msg);
      END;
    END;
  END Debug;
---------------------------------------------------------------------------
PROCEDURE DebugToken(token : ScanToken.T) =
  BEGIN
    IF debugScanner THEN
      ScanToken.DebugToken(token);
    END;
  END DebugToken;
---------------------------------------------------------------------------
BEGIN
  debugScanner := FALSE;
END SimpleScanner.

interface ASCII is in: