import java.io.*;

/**
 * Location describes the position of a single character in a file.
 * There are several ways to do this:
 *
 *   + A single incrementing character index.
 *   + A line and column number.
 *   + Indices and column numbers can be zero- or one-based.
 *   + The index can point AT or BETWEEN bytes in the file.
 *
 * In particular the interaction between the last two conventions
 * can lead to great confusion and off-by-one errors.
 * Strictly speaking, the structure of this class doesn't impose
 * any convention.  But the behavior of the newline() method
 * (which resets col to 1) does imply one-based column numbers.
 */
class Location {
  public int num;   // Zero-based index of character in source text.
  public int line;  // One-based line number of character.
  public int col;   // One-based column number of character.

  /* ----------------------------------------------------------------------- */

  /**
   * @param n zero-based character index
   * @param l line number
   * @param c column number
   */
  public Location(int n, int l, int c) {
    num = n;
    line = l;
    col = c;
  }

  /* ----------------------------------------------------------------------- */

  /** Steps one character forward on the same line. */
  public void advance() {
    num++;
    col++;
  }

  /** Steps one character forward onto the first column of the next line. */
  public void newline() {
    num++;
    line++;
    col = 1;
  }

  /**
   * Steps one character backward on the same line.  The line number is
   * never adjusted, so this must not be used to step back over a line break.
   */
  public void retreat() {
    num--;
    col--;
  }
}

/* ========================================================================= */

/**
 * Scanner is responsible for slurping up a source file, iterating
 * through it and keeping track of the position of the current character.
 *
 * Each byte of the file is treated as one character (values 0..255).
 * A line break is a lone '\n', a lone '\r', or the DOS pair "\r\n", which
 * counts as a single line break.
 */
class Scanner {
  public char chr;                    // The current character.
  public Location loc;                // Location of current character.
  private byte[] sourceText;          // The whole input file plus a trailing EOT.
  public static final byte EOT = 4;   // ASCII eot, used to mark end of file.

  /* ----------------------------------------------------------------------- */

  /**
   * Reads the entire file into memory and positions the scanner on its
   * first character (or on the EOT sentinel if the file is empty).
   *
   * @param filename path of the file to scan
   * @throws FileNotFoundException if the file cannot be opened
   * @throws IOException if reading the file fails
   */
  public Scanner(String filename) throws FileNotFoundException, IOException {
    ByteArrayOutputStream contents = new ByteArrayOutputStream();
    FileInputStream fis = new FileInputStream(filename);
    try {
      // available() is only an estimate and a single read() may return
      // fewer bytes than requested, so keep reading until end of stream.
      byte[] chunk = new byte[4096];
      int count;
      while ((count = fis.read(chunk)) != -1)
        contents.write(chunk, 0, count);
    } finally {
      fis.close();                    // Close even when read() throws.
    }
    contents.write(EOT);              // Stuff a sentinel at end.
    sourceText = contents.toByteArray();

    loc = new Location(0, 1, 1);
    chr = charAt(0);
  }

  /* ----------------------------------------------------------------------- */

  // Returns the byte at index as a char in the range 0..255.  The mask
  // prevents bytes >= 0x80 from being sign-extended to U+FF80..U+FFFF.
  private char charAt(int index) {
    return (char) (sourceText[index] & 0xFF);
  }

  /* ----------------------------------------------------------------------- */

  /**
   * Moves to the next character, updating loc and chr.  Does nothing once
   * the scanner is sitting on the EOT sentinel at the end of the buffer.
   */
  public void advance() {
    if (loc.num >= sourceText.length - 1)
      return;                         // Already on the sentinel; stay put.

    char next = charAt(loc.num + 1);
    // A '\r' immediately followed by '\n' is the first half of a DOS line
    // ending; let the '\n' be the one that bumps the line number.
    if (chr == '\n' || (chr == '\r' && next != '\n'))
      loc.newline();
    else
      loc.advance();
    chr = next;
  }

  /**
   * Moves back one character on the current line.  Does nothing at the
   * start of the text.  Must not be used to step back over a line break,
   * because the line number is not adjusted.
   */
  public void retreat() {
    if (loc.num == 0)
      return;
    loc.retreat();
    chr = charAt(loc.num);
  }
}

/* ========================================================================= */

/**
 * Range uses two instances of Location to describe the position of
 * a range of characters in a file.  It uses one-based column numbers,
 * pointing AT the character (not to the position between them),
 * and is inclusive at both ends.
 *
 * The main utility of this class is its toString() method which
 * returns a nicely formatted description of the range.
 */
class Range {
  public Location first;
  public Location last;

  /* ----------------------------------------------------------------------- */

  /**
   * Creates a zero-length range starting at f.  Both ends are fresh
   * Location objects rather than references to f: Lexer.advance() passes
   * in the scanner's live location and Lexer.takeIt() mutates last.
   *
   * @param f location of the first character of the range
   */
  public Range(Location f) {
    first = new Location(f.num, f.line, f.col);
    last = new Location(f.num - 1, f.line, f.col - 1);  // Zero-length range.
  }

  /**
   * Sets the end of the range.  The given Location is stored by reference,
   * not copied.
   *
   * @param l location of the last character of the range
   */
  public void finish(Location l) {
    last = l;
  }

  /* ----------------------------------------------------------------------- */

  /** Returns a human-readable description of the range. */
  public String toString() {
    String start = "line " + first.line;
    int length = last.num - first.num + 1;

    if (length < 0)                   // We haven't yet set its last coordinates.
      return start + " starting at char " + first.col;
    if (length == 0)
      return start + " before char " + first.col;
    if (length == 1)
      return start + " char " + first.col;
    if (first.line == last.line)
      return start + " chars " + first.col + " to " + last.col;
    return start + " char " + first.col
        + " to " + "line " + last.line + " char " + last.col;
  }
}

/* ========================================================================= */

/**
 * Token is a "smart" token class.  It contains all the detailed knowledge
 * about the lexical structure of the language: what the type numbers are,
 * what the keywords and characters are.  When the Lexer is done loading in
 * the characters, it calls the Token's methods to decide what kind of
 * token it is and set its type field.
 */
class Token {
  public int type;
  public Range range;
  public StringBuffer spelling;

  /* ----------------------------------------------------------------------- */

  public static final int UNINIT    = 0;

  public static final int BOOLEAN   = 1;   // Reserved words, alphabetical order.
  public static final int CLASS     = 2;
  public static final int ELSE      = 3;
  public static final int IF        = 4;
  public static final int INT       = 5;
  public static final int RETURN    = 6;
  public static final int STATIC    = 7;
  public static final int WHILE     = 8;

  public static final int BOOL      = 9;   // Boolean literal.
  public static final int NUM       = 10;  // Integer literal.
  public static final int ID        = 11;  // Identifier.

  public static final int SEMICOLON = 12;  // Separators.
  public static final int COMMA     = 13;
  public static final int LPAREN    = 14;
  public static final int RPAREN    = 15;
  public static final int LBRACE    = 16;
  public static final int RBRACE    = 17;

  public static final int GETS      = 18;  // Single-char operators.
  public static final int PLUS      = 19;
  public static final int MINUS     = 20;
  public static final int TIMES     = 21;
  public static final int DIV       = 22;
  public static final int MOD       = 23;

  public static final int EQUALS    = 24;  // Multi-char operators.
  public static final int NOTEQUAL  = 25;
  public static final int OR        = 26;
  public static final int AND       = 27;

  public static final int EOT       = 28;  // Housekeeping.
  public static final int ERR       = 29;

  // These are for the Parser's error messages.  Indexed by token type.
  private static final String[] spellingOf = {
    "*uninit*",
    "boolean", "class", "else", "if", "int", "return", "static", "while",
    "bool", "number", "identifier",
    ";", ",", "(", ")", "{", "}",
    "=", "+", "-", "*", "/", "%",
    "==", "!=", "||", "&&",
    "end of file", "*err*"
  };

  /**
   * Returns the display spelling of a token type.  We use a method as
   * public accessor in order to present the same interface that the
   * JLex/JavaCUP version does.
   *
   * @param t one of the token type constants (UNINIT..ERR)
   */
  public static String spellingOf(int t) {
    return spellingOf[t];
  }

  /* ----------------------------------------------------------------------- */

  public static final int TOKEN_SIZE = 16;  // Longest spelling the Lexer accepts.

  /**
   * Creates an empty, untyped token whose range starts at f.
   *
   * @param f location of the token's first character
   */
  public Token(Location f) {
    type = UNINIT;
    range = new Range(f);
    spelling = new StringBuffer(TOKEN_SIZE);
  }

  /* ----------------------------------------------------------------------- */

  /** True if c can be the first character of a two-character operator. */
  public static boolean couldBeDoubleChar(char c) {
    return c == '=' || c == '!' || c == '|' || c == '&';
  }

  /** True if first followed by second spells a two-character operator. */
  public static boolean validSecondChar(char first, char second) {
    return first == '=' && second == '='
        || first == '!' && second == '='
        || first == '|' && second == '|'
        || first == '&' && second == '&';
  }

  /* ----------------------------------------------------------------------- */

  /**
   * Classifies the spelling as a separator or operator (SEMICOLON..AND),
   * or ERR if it matches none of them.
   */
  public void finishSymbol() {
    String s = spelling.toString();
    for (int tt = SEMICOLON; tt <= AND; tt++) {
      if (s.equals(spellingOf[tt])) {
        type = tt;
        return;
      }
    }
    type = ERR;                       // Couldn't find it.
  }

  /* ----------------------------------------------------------------------- */

  /** Marks this token as an integer literal. */
  public void finishNum() {
    type = NUM;
  }

  /** Marks this token as end of text. */
  public void finishEot() {
    type = EOT;
  }

  /* ----------------------------------------------------------------------- */

  /**
   * Classifies the spelling as a reserved word (BOOLEAN..WHILE), a boolean
   * literal ("true" or "false"), or otherwise an identifier.
   */
  public void finishWord() {
    String s = spelling.toString();
    for (int tt = BOOLEAN; tt <= WHILE; tt++) {
      if (s.equals(spellingOf[tt])) {
        type = tt;                    // The string matches a reserved word.
        return;
      }
    }
    if (s.equals("true") || s.equals("false"))
      type = BOOL;                    // The string matches a boolean literal.
    else
      type = ID;                      // The string is an identifier.
  }

  /* ----------------------------------------------------------------------- */

  /**
   * Returns the spelling followed by the range; the spelling is quoted
   * for numbers, identifiers and boolean literals.
   */
  public String toString() {
    if (type == NUM || type == ID || type == BOOL)
      return "\"" + spelling + "\" " + range;
    else
      return spelling + " " + range;
  }
}

/* ========================================================================= */

/** Thrown by the Lexer when the input cannot be split into tokens. */
class LexicalException extends Exception {
  public LexicalException(String s) {
    super(s);
  }
}

/* ========================================================================= */

/**
 * Lexer uses a Scanner to read characters from a text file, and loads them
 * into a Token.  It only knows that words start with letters, numbers start
 * with digits, and symbols are one character (or two, when Token says so).
 * It lets Token contain all the detailed knowledge about the language's
 * low-level structure.
 */
class Lexer {
  public Token token;                 // When advance() exits, the just-read token.
  private Scanner scanner;            // Feeds us the input file.
  public boolean debug = false;       // True => print out debug info.

  /* ----------------------------------------------------------------------- */

  /**
   * Creates a lexer over s and reads the first token into the token field.
   *
   * @param s the scanner supplying characters
   * @throws LexicalException if the first token is malformed
   */
  public Lexer(Scanner s) throws LexicalException {
    scanner = s;
    advance();
  }

  /* ----------------------------------------------------------------------- */

  // Grab another character from the Scanner and append it to the token.
  private void takeIt() throws LexicalException {
    if (token.spelling.length() == Token.TOKEN_SIZE)
      throw new LexicalException("Token too long; " + token.range + ".");

    token.range.last.advance();
    token.spelling.append(scanner.chr);
    scanner.advance();
  }

  /* ----------------------------------------------------------------------- */

  // Skip whitespace and comments.  A "//" comment runs to the end of the
  // line or the end of the text, whichever comes first.
  private void skipFluff() throws LexicalException {
    boolean done;
    do {
      done = true;

      while (Character.isWhitespace(scanner.chr))
        scanner.advance();

      if (scanner.chr == '/') {
        int startLine = scanner.loc.line;   // Remembered for error reporting.
        int startCol = scanner.loc.col;
        scanner.advance();

        if (scanner.chr == '/') {
          // Stop at end of text too, so a comment on the last line of a
          // file with no trailing newline doesn't run off the buffer.
          while (scanner.chr != '\n' && scanner.chr != '\r'
                 && scanner.chr != Scanner.EOT)
            scanner.advance();
          done = false;
        } else if (scanner.chr == '*') {
          skipBlockComment(startLine, startCol);
          done = false;
        } else {
          scanner.retreat();          // A lone '/' is the division operator.
        }
      }
    } while (!done);
  }

  // Skip the rest of a block comment.  On entry the scanner is on the '*'
  // of the opening "/*"; on exit it is just past the closing "*/".
  private void skipBlockComment(int startLine, int startCol)
      throws LexicalException {
    // Step past the opener's '*' first so that it cannot double as the
    // '*' of the closer (otherwise "/*/" would count as a whole comment).
    scanner.advance();

    boolean sawStar = false;
    while (true) {
      char c = scanner.chr;
      if (c == Scanner.EOT)
        throw new LexicalException("Unterminated comment starting at line "
                                   + startLine + " char " + startCol + ".");
      scanner.advance();
      if (sawStar && c == '/')
        return;
      sawStar = (c == '*');
    }
  }

  /* ----------------------------------------------------------------------- */

  /**
   * Our main interface to the world.  Fills the next token, then tells it
   * to identify itself.  Sets our field "token" which the world can read.
   *
   * @throws LexicalException if a token is too long or a block comment
   *         is never closed
   */
  public void advance() throws LexicalException {
    skipFluff();
    token = new Token(scanner.loc);

    if (Character.isDigit(scanner.chr)) {             // Integer literal.
      while (Character.isDigit(scanner.chr))
        takeIt();
      token.finishNum();
    } else if (Character.isLetter(scanner.chr)) {     // Identifier or reserved word.
      while (Character.isLetterOrDigit(scanner.chr))
        takeIt();
      token.finishWord();
    } else if (scanner.chr == Scanner.EOT) {          // End of text.
      token.finishEot();                              // ...but don't take it.
    } else if (Token.couldBeDoubleChar(scanner.chr)) {
      char first = scanner.chr;
      takeIt();
      if (Token.validSecondChar(first, scanner.chr))
        takeIt();
      token.finishSymbol();
    } else {                                          // Single character.
      takeIt();
      token.finishSymbol();
    }

    if (debug)
      System.out.println(token);
  }
}