/*
 * This is a port of tokenize.py by Ka-Ping Yee.
 *
 * Input is fed one line at a time through
 * Sk.Tokenizer.prototype.generateTokens(line); an empty or undefined line
 * signals the end of the input.
 *
 * callback is called for each token with 5 args:
 * 1. the token type
 * 2. the token string
 * 3. [ start_row, start_col ]
 * 4. [ end_row, end_col ]
 * 5. logical line where the token was found, including continuation lines
 *
 * callback can return true to abort.
 *
 */

/**
 * Holds the tokenizer state that has to survive between successive lines.
 *
 * filename    - stored and passed to the errors raised while tokenizing.
 * interactive - stored flag; when set, a blank line is tokenized further
 *               instead of only producing a T_NL token.
 * callback    - invoked once per token with the five arguments listed in the
 *               header comment; a truthy return value aborts tokenizing.
 *
 * @constructor
 */
Sk.Tokenizer = function (filename, interactive, callback) {
    this.filename = filename;
    this.callback = callback;
    // number of lines seen so far (incremented for every line fed in)
    this.lnum = 0;
    // nesting depth of (), [] and {}
    this.parenlev = 0;
    // true when the previous line ended with a backslash continuation
    this.continued = false;
    // characters that may start a name / a number
    this.namechars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
    this.numchars = "0123456789";
    // text of a string literal that is still open across lines ("" if none)
    this.contstr = "";
    // true when the open string is single-quoted and so requires a trailing
    // backslash on every line it spans
    this.needcont = false;
    // source lines accumulated for the open string
    this.contline = undefined;
    // stack of active indentation columns
    this.indents = [0];
    // regex that recognises the end of the currently open string
    this.endprog = /.*/;
    // [row, col] where the currently open string started
    this.strstart = [-1, -1];
    this.interactive = interactive;

    /**
     * Emits the end-of-input tokens: one T_DEDENT for every indentation
     * level above the first, followed by T_ENDMARKER.
     *
     * Returns "done" as soon as the callback returns a truthy value,
     * otherwise "failed".
     * NOTE(review): "failed" is returned even though every token was
     * delivered; presumably it tells the caller that the callback never
     * reported completion -- confirm against the caller.
     */
    this.doneFunc = function () {
        var i;
        // pop remaining indent levels
        for (i = 1; i < this.indents.length; ++i) {
            if (this.callback(Sk.Tokenizer.Tokens.T_DEDENT, "", [this.lnum, 0], [this.lnum, 0], "")) {
                return "done";
            }
        }
        if (this.callback(Sk.Tokenizer.Tokens.T_ENDMARKER, "", [this.lnum, 0], [this.lnum, 0], "")) {
            return "done";
        }

        return "failed";
    };
};

/**
 * Numeric token types passed as the first argument to the callback.
 *
 * @enum {number}
 */
Sk.Tokenizer.Tokens = {
    T_ENDMARKER       : 0,
    T_NAME            : 1,
    T_NUMBER          : 2,
    T_STRING          : 3,
    T_NEWLINE         : 4,
    T_INDENT          : 5,
    T_DEDENT          : 6,
    T_LPAR            : 7,
    T_RPAR            : 8,
    T_LSQB            : 9,
    T_RSQB            : 10,
    T_COLON           : 11,
    T_COMMA           : 12,
    T_SEMI            : 13,
    T_PLUS            : 14,
    T_MINUS           : 15,
    T_STAR            : 16,
    T_SLASH           : 17,
    T_VBAR            : 18,
    T_AMPER           : 19,
    T_LESS            : 20,
    T_GREATER         : 21,
    T_EQUAL           : 22,
    T_DOT             : 23,
    T_PERCENT         : 24,
    T_BACKQUOTE       : 25,
    T_LBRACE          : 26,
    T_RBRACE          : 27,
    T_EQEQUAL         : 28,
    T_NOTEQUAL        : 29,
    T_LESSEQUAL       : 30,
    T_GREATEREQUAL    : 31,
    T_TILDE           : 32,
    T_CIRCUMFLEX      : 33,
    T_LEFTSHIFT       : 34,
    T_RIGHTSHIFT      : 35,
    T_DOUBLESTAR      : 36,
    T_PLUSEQUAL       : 37,
    T_MINEQUAL        : 38,
    T_STAREQUAL       : 39,
    T_SLASHEQUAL      : 40,
    T_PERCENTEQUAL    : 41,
    T_AMPEREQUAL      : 42,
    T_VBAREQUAL       : 43,
    T_CIRCUMFLEXEQUAL : 44,
    T_LEFTSHIFTEQUAL  : 45,
    T_RIGHTSHIFTEQUAL : 46,
    T_DOUBLESTAREQUAL : 47,
    T_DOUBLESLASH     : 48,
    T_DOUBLESLASHEQUAL: 49,
    T_AT              : 50,
    T_OP              : 51,
    T_COMMENT         : 52,
    T_NL              : 53,
    T_RARROW          : 54,
    T_ERRORTOKEN      : 55,
    T_N_TOKENS        : 56,
    T_NT_OFFSET       : 256
};

/**
 * Joins the given pattern sources into one capturing alternation:
 * group("a", "b") gives "(a|b)".
 *
 * @param {...*} x
 */
function group (x) {
    var args = Array.prototype.slice.call(arguments);
    return "(" + args.join("|") + ")";
}

/**
 * Same alternation as group(), repeated zero or more times: "(a|b)*".
 *
 * @param {...*} x
 */
function any (x) {
    return group.apply(null, arguments) + "*";
}

/**
 * Same alternation as group(), made optional: "(a|b)?".
 *
 * @param {...*} x
 */
function maybe (x) {
    return group.apply(null, arguments) + "?";
}

/* we have to use string and ctor to be able to build patterns up. + on /.../
 * does something strange. */
var Whitespace = "[ \\f\\t]*";
var Comment_ = "#[^\\r\\n]*";
var Ident = "[a-zA-Z_]\\w*";

var Binnumber = "0[bB][01]*";
var Hexnumber = "0[xX][\\da-fA-F]*[lL]?";
var Octnumber = "0[oO]?[0-7]*[lL]?";
var Decnumber = "[1-9]\\d*[lL]?";
var Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber);

var Exponent = "[eE][-+]?\\d+";
var Pointfloat = group("\\d+\\.\\d*", "\\.\\d+") + maybe(Exponent);
var Expfloat = "\\d+" + Exponent;
var Floatnumber = group(Pointfloat, Expfloat);
var Imagnumber = group("\\d+[jJ]", Floatnumber + "[jJ]");
var Number_ = group(Imagnumber, Floatnumber, Intnumber);

// The four "tail end" patterns are applied to the text that follows an
// opening quote. All of them are anchored with "^": an unanchored search
// could start after an escaping backslash and accept an escaped quote
// sequence as the terminator.

// tail end of ' string
var Single = "^[^'\\\\]*(?:\\\\.[^'\\\\]*)*'";
// tail end of " string
var Double_ = '^[^"\\\\]*(?:\\\\.[^"\\\\]*)*"';
// tail end of ''' string
var Single3 = "^[^'\\\\]*(?:(?:\\\\.|'(?!''))[^'\\\\]*)*'''";
// tail end of """ string
var Double3 = '^[^"\\\\]*(?:(?:\\\\.|"(?!""))[^"\\\\]*)*"""';
var Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""');
var String_ = group("[uU]?[rR]?'[^\\n'\\\\]*(?:\\\\.[^\\n'\\\\]*)*'",
    '[uU]?[rR]?"[^\\n"\\\\]*(?:\\\\.[^\\n"\\\\]*)*"');

// Because of leftmost-then-longest match semantics, be sure to put the
// longest operators first (e.g., if = came before ==, == would get
// recognized as two instances of =).
var Operator = group(
    "\\*\\*=?",
    ">>=?",
    "<<=?",
    "<>",
    "!=",
    "//=?",
    "->",
    "[+\\-*/%&|^=<>]=?",
    "~"
);

var Bracket = "[\\][(){}]";
var Special = group("\\r?\\n", "[:;.,`@]");
var Funny = group(Operator, Bracket, Special);

// A single-line string literal that is either closed by its quote or ends in
// a backslash-newline continuation; one alternative per quote character.
var ContStr = group(
    "[uUbB]?[rR]?'[^\\n'\\\\]*(?:\\\\.[^\\n'\\\\]*)*" + group("'", "\\\\\\r?\\n"),
    '[uUbB]?[rR]?"[^\\n"\\\\]*(?:\\\\.[^\\n"\\\\]*)*' + group('"', "\\\\\\r?\\n")
);
var PseudoExtras = group("\\\\\\r?\\n", Comment_, Triple);

// The leading "^" restricts the match to whatever comes next in the input.
var PseudoToken = "^" + group(PseudoExtras, Number_, Funny, ContStr, Ident);

// Lookup tables keyed by every accepted string opener: an optional prefix
// followed by the quote sequence. Only key membership is consulted.
var triple_quoted = {};
var single_quoted = {};
(function () {
    var prefixes = [
        "",
        "r", "R",
        "u", "U",
        "b", "B",
        "ur", "Ur", "uR", "UR",
        "br", "Br", "bR", "BR"
    ];
    var idx;
    var prefix;
    for (idx = 0; idx < prefixes.length; ++idx) {
        prefix = prefixes[idx];
        triple_quoted[prefix + "'''"] = true;
        triple_quoted[prefix + '"""'] = true;
        single_quoted[prefix + "'"] = true;
        single_quoted[prefix + '"'] = true;
    }
}());

// hack to make closure keep those objects. not sure what a better way is.
(function () {
    // The loop bodies are intentionally empty: iterating is only a way of
    // referencing both tables.
    var k;
    for (k in triple_quoted) {
    }
    for (k in single_quoted) {
    }
}());

// Width of a tab stop when computing indentation columns.
var tabsize = 8;

/**
 * Returns true when obj is strictly equal (===) to some element of the
 * array a, false otherwise.
 */
function contains (a, obj) {
    var i = a.length;
    while (i--) {
        if (a[i] === obj) {
            return true;
        }
    }
    return false;
}

/**
 * Returns input with every trailing character that occurs in the string
 * `what` removed.
 */
function rstrip (input, what) {
    var i;
    for (i = input.length; i > 0; --i) {
        if (what.indexOf(input.charAt(i - 1)) === -1) {
            break;
        }
    }
    return input.substring(0, i);
}

/**
 * Tokenizes one line of input and reports every token through
 * this.callback(type, string, [startRow, startCol], [endRow, endCol], line).
 *
 * State that spans lines (open strings, bracket depth, backslash
 * continuation, indentation stack) is kept on the tokenizer instance.
 *
 * line - the next source line; an empty or undefined value signals the end
 *        of the input.
 *
 * Returns:
 *   'done'  - the callback returned a truthy value;
 *   false   - the line was consumed and the callback never returned truthy;
 *   the result of this.doneFunc() ("done" or "failed") once the end of the
 *   input is reached outside of a multi-line construct.
 *
 * Throws Sk.builtin.SyntaxError when the input ends inside a multi-line
 * string or statement, and Sk.builtin.IndentationError when a dedent does
 * not match any enclosing indentation level.
 */
Sk.Tokenizer.prototype.generateTokens = function (line) {
    var nl_pos;
    var newl;
    var initial;
    var token;
    var epos;
    var spos;
    var start;
    var pseudomatch;
    var capos;
    var comment_token;
    var endmatch, pos, column, end, max;

    // bnm - Move these definitions in this function otherwise test state is preserved between
    // calls on single3prog and double3prog causing weird errors with having multiple instances
    // of triple quoted strings in the same program.
    var pseudoprog = new RegExp(PseudoToken);
    var single3prog = new RegExp(Single3, "g");
    var double3prog = new RegExp(Double3, "g");
    // Maps a string opener to the regex that finds the end of that string.
    // The bare prefix letters map to null so that the lookup chain used for
    // single-quoted strings falls through to the quote character itself.
    var endprogs = {
        "'"     : new RegExp(Single, "g"),
        "\""    : new RegExp(Double_, "g"),
        "'''"   : single3prog,
        '"""'   : double3prog,
        "r'''"  : single3prog,
        'r"""'  : double3prog,
        "u'''"  : single3prog,
        'u"""'  : double3prog,
        "b'''"  : single3prog,
        'b"""'  : double3prog,
        "ur'''" : single3prog,
        'ur"""' : double3prog,
        "br'''" : single3prog,
        'br"""' : double3prog,
        "R'''"  : single3prog,
        'R"""'  : double3prog,
        "U'''"  : single3prog,
        'U"""'  : double3prog,
        "B'''"  : single3prog,
        'B"""'  : double3prog,
        "uR'''" : single3prog,
        'uR"""' : double3prog,
        "Ur'''" : single3prog,
        'Ur"""' : double3prog,
        "UR'''" : single3prog,
        'UR"""' : double3prog,
        "bR'''" : single3prog,
        'bR"""' : double3prog,
        "Br'''" : single3prog,
        'Br"""' : double3prog,
        "BR'''" : single3prog,
        'BR"""' : double3prog,
        'r'     : null,
        'R'     : null,
        'u'     : null,
        'U'     : null,
        'b'     : null,
        'B'     : null
    };

    if (!line) {
        line = '';
    }

    this.lnum += 1;
    pos = 0;
    max = line.length;

    if (this.contstr.length > 0) {
        // inside a string literal that was opened on an earlier line
        if (!line) {
            throw new Sk.builtin.SyntaxError("EOF in multi-line string", this.filename, this.strstart[0], this.strstart[1], this.contline);
        }
        // the end regexes carry the "g" flag, so test() records the end of
        // the match in lastIndex; reset it to search from the line start
        this.endprog.lastIndex = 0;
        endmatch = this.endprog.test(line);
        if (endmatch) {
            pos = end = this.endprog.lastIndex;
            if (this.callback(Sk.Tokenizer.Tokens.T_STRING, this.contstr + line.substring(0, end), this.strstart, [this.lnum, end], this.contline + line)) {
                return 'done';
            }
            this.contstr = '';
            this.needcont = false;
            this.contline = undefined;
        } else if (this.needcont && line.substring(line.length - 2) !== "\\\n" && line.substring(line.length - 3) !== "\\\r\n") {
            // a single-quoted string must end every line with a backslash
            if (this.callback(Sk.Tokenizer.Tokens.T_ERRORTOKEN, this.contstr + line, this.strstart, [this.lnum, line.length], this.contline)) {
                return 'done';
            }
            this.contstr = '';
            this.contline = undefined;
            return false;
        } else {
            this.contstr += line;
            this.contline = this.contline + line;
            return false;
        }
    } else if (this.parenlev === 0 && !this.continued) {
        // start of a new statement: measure the indentation
        if (!line) {
            return this.doneFunc();
        }
        column = 0;
        while (pos < max) {
            if (line.charAt(pos) === ' ') {
                column += 1;
            } else if (line.charAt(pos) === '\t') {
                // advance to the next tab stop; the division must be an
                // integer division, otherwise e.g. four spaces followed by
                // a tab would give column 12 instead of 8
                column = (Math.floor(column / tabsize) + 1) * tabsize;
            } else if (line.charAt(pos) === '\f') {
                column = 0;
            } else {
                break;
            }
            pos = pos + 1;
        }
        if (pos === max) {
            return this.doneFunc();
        }

        // skip comments or blank lines
        if ("#\r\n".indexOf(line.charAt(pos)) !== -1) {
            if (line.charAt(pos) === '#') {
                comment_token = rstrip(line.substring(pos), '\r\n');
                nl_pos = pos + comment_token.length;
                if (this.callback(Sk.Tokenizer.Tokens.T_COMMENT, comment_token, [this.lnum, pos], [this.lnum, pos + comment_token.length], line)) {
                    return 'done';
                }
                if (this.callback(Sk.Tokenizer.Tokens.T_NL, line.substring(nl_pos), [this.lnum, nl_pos], [this.lnum, line.length], line)) {
                    return 'done';
                }
                return false;
            } else {
                if (this.callback(Sk.Tokenizer.Tokens.T_NL, line.substring(pos), [this.lnum, pos], [this.lnum, line.length], line)) {
                    return 'done';
                }
                // in interactive mode the blank line is tokenized further
                if (!this.interactive) {
                    return false;
                }
            }
        }

        // count indents or dedents
        if (column > this.indents[this.indents.length - 1]) {
            this.indents.push(column);
            if (this.callback(Sk.Tokenizer.Tokens.T_INDENT, line.substring(0, pos), [this.lnum, 0], [this.lnum, pos], line)) {
                return 'done';
            }
        }
        while (column < this.indents[this.indents.length - 1]) {
            if (!contains(this.indents, column)) {
                throw new Sk.builtin.IndentationError("unindent does not match any outer indentation level", this.filename, this.lnum, pos, line);
            }
            this.indents.pop();
            if (this.callback(Sk.Tokenizer.Tokens.T_DEDENT, '', [this.lnum, pos], [this.lnum, pos], line)) {
                return 'done';
            }
        }
    } else {
        // continued statement
        if (!line) {
            throw new Sk.builtin.SyntaxError("EOF in multi-line statement", this.filename, this.lnum, 0, line);
        }
        this.continued = false;
    }

    while (pos < max) {
        // js regexes don't return any info about matches, other than the
        // content. we'd like to put a \w+ before pseudomatch, but then we
        // can't get any data
        capos = line.charAt(pos);
        while (capos === ' ' || capos === '\f' || capos === '\t') {
            pos += 1;
            capos = line.charAt(pos);
        }
        pseudoprog.lastIndex = 0;
        pseudomatch = pseudoprog.exec(line.substring(pos));
        if (pseudomatch) {
            start = pos;
            end = start + pseudomatch[1].length;
            spos = [this.lnum, start];
            epos = [this.lnum, end];
            pos = end;
            token = line.substring(start, end);
            initial = line.charAt(start);
            if (this.numchars.indexOf(initial) !== -1 || (initial === '.' && token !== '.')) {
                if (this.callback(Sk.Tokenizer.Tokens.T_NUMBER, token, spos, epos, line)) {
                    return 'done';
                }
            } else if (initial === '\r' || initial === '\n') {
                // a line break inside brackets does not end the statement
                newl = Sk.Tokenizer.Tokens.T_NEWLINE;
                if (this.parenlev > 0) {
                    newl = Sk.Tokenizer.Tokens.T_NL;
                }
                if (this.callback(newl, token, spos, epos, line)) {
                    return 'done';
                }
            } else if (initial === '#') {
                if (this.callback(Sk.Tokenizer.Tokens.T_COMMENT, token, spos, epos, line)) {
                    return 'done';
                }
            } else if (triple_quoted.hasOwnProperty(token)) {
                this.endprog = endprogs[token];
                this.endprog.lastIndex = 0;
                endmatch = this.endprog.test(line.substring(pos));
                if (endmatch) {
                    // the whole string is on this line
                    pos = this.endprog.lastIndex + pos;
                    token = line.substring(start, pos);
                    if (this.callback(Sk.Tokenizer.Tokens.T_STRING, token, spos, [this.lnum, pos], line)) {
                        return 'done';
                    }
                } else {
                    // multiple lines: remember the start and keep collecting
                    this.strstart = [this.lnum, start];
                    this.contstr = line.substring(start);
                    this.contline = line;
                    return false;
                }
            } else if (single_quoted.hasOwnProperty(initial) ||
                    single_quoted.hasOwnProperty(token.substring(0, 2)) ||
                    single_quoted.hasOwnProperty(token.substring(0, 3))) {
                if (token[token.length - 1] === '\n') {
                    // backslash-continued single-quoted string; the quote
                    // character follows zero, one or two prefix letters
                    this.strstart = [this.lnum, start];
                    this.endprog = endprogs[initial] || endprogs[token[1]] || endprogs[token[2]];
                    this.contstr = line.substring(start);
                    this.needcont = true;
                    this.contline = line;
                    return false;
                } else {
                    if (this.callback(Sk.Tokenizer.Tokens.T_STRING, token, spos, epos, line)) {
                        return 'done';
                    }
                }
            } else if (this.namechars.indexOf(initial) !== -1) {
                if (this.callback(Sk.Tokenizer.Tokens.T_NAME, token, spos, epos, line)) {
                    return 'done';
                }
            } else if (initial === '\\') {
                // backslash continuation: the statement goes on next line
                if (this.callback(Sk.Tokenizer.Tokens.T_NL, token, spos, [this.lnum, pos], line)) {
                    return 'done';
                }
                this.continued = true;
            } else {
                if ('([{'.indexOf(initial) !== -1) {
                    this.parenlev += 1;
                } else if (')]}'.indexOf(initial) !== -1) {
                    this.parenlev -= 1;
                }
                if (this.callback(Sk.Tokenizer.Tokens.T_OP, token, spos, epos, line)) {
                    return 'done';
                }
            }
        } else {
            // nothing matched: report the offending character and move on
            if (this.callback(Sk.Tokenizer.Tokens.T_ERRORTOKEN, line.charAt(pos), [this.lnum, pos], [this.lnum, pos + 1], line)) {
                return 'done';
            }
            pos += 1;
        }
    }

    return false;
};

// Reverse mapping from the numeric values of Sk.Tokenizer.Tokens to their names.
Sk.Tokenizer.tokenNames = {
    0  : 'T_ENDMARKER',
    1  : 'T_NAME',
    2  : 'T_NUMBER',
    3  : 'T_STRING',
    4  : 'T_NEWLINE',
    5  : 'T_INDENT',
    6  : 'T_DEDENT',
    7  : 'T_LPAR',
    8  : 'T_RPAR',
    9  : 'T_LSQB',
    10 : 'T_RSQB',
    11 : 'T_COLON',
    12 : 'T_COMMA',
    13 : 'T_SEMI',
    14 : 'T_PLUS',
    15 : 'T_MINUS',
    16 : 'T_STAR',
    17 : 'T_SLASH',
    18 : 'T_VBAR',
    19 : 'T_AMPER',
    20 : 'T_LESS',
    21 : 'T_GREATER',
    22 : 'T_EQUAL',
    23 : 'T_DOT',
    24 : 'T_PERCENT',
    25 : 'T_BACKQUOTE',
    26 : 'T_LBRACE',
    27 : 'T_RBRACE',
    28 : 'T_EQEQUAL',
    29 : 'T_NOTEQUAL',
    30 : 'T_LESSEQUAL',
    31 : 'T_GREATEREQUAL',
    32 : 'T_TILDE',
    33 : 'T_CIRCUMFLEX',
    34 : 'T_LEFTSHIFT',
    35 : 'T_RIGHTSHIFT',
    36 : 'T_DOUBLESTAR',
    37 : 'T_PLUSEQUAL',
    38 : 'T_MINEQUAL',
    39 : 'T_STAREQUAL',
    40 : 'T_SLASHEQUAL',
    41 : 'T_PERCENTEQUAL',
    42 : 'T_AMPEREQUAL',
    43 : 'T_VBAREQUAL',
    44 : 'T_CIRCUMFLEXEQUAL',
    45 : 'T_LEFTSHIFTEQUAL',
    46 : 'T_RIGHTSHIFTEQUAL',
    47 : 'T_DOUBLESTAREQUAL',
    48 : 'T_DOUBLESLASH',
    49 : 'T_DOUBLESLASHEQUAL',
    50 : 'T_AT',
    51 : 'T_OP',
    52 : 'T_COMMENT',
    53 : 'T_NL',
    54 : 'T_RARROW',
    55 : 'T_ERRORTOKEN',
    56 : 'T_N_TOKENS',
    256: 'T_NT_OFFSET'
};

goog.exportSymbol("Sk.Tokenizer", Sk.Tokenizer);
goog.exportSymbol("Sk.Tokenizer.prototype.generateTokens", Sk.Tokenizer.prototype.generateTokens);
goog.exportSymbol("Sk.Tokenizer.tokenNames", Sk.Tokenizer.tokenNames);