/*
 * This is a port of tokenize.py by Ka-Ping Yee.
 *
 * Each call to readline should return one line of input as a string, or
 * undefined if it's finished.
 *
 * callback is called for each token with 5 args:
 * 1. the token type
 * 2. the token string
 * 3. [ start_row, start_col ]
 * 4. [ end_row, end_col ]
 * 5. logical line where the token was found, including continuation lines
 *
 * callback can return true to abort.
 *
 */
/**
 * Holds the state of a line-at-a-time tokenizer. Lines are fed in through
 * generateTokens(); every token is reported through the callback.
 *
 * Arguments:
 *  - filename: stored and passed along when syntax/indentation errors are raised.
 *  - interactive: stored on the instance; generateTokens consults it after a
 *    blank line.
 *  - callback: invoked as callback(type, text, [row, col], [row, col], line);
 *    a truthy return value aborts tokenizing.
 *
 * @constructor
 */
Sk.Tokenizer = function (filename, interactive, callback) {
    this.filename = filename;
    this.callback = callback;
    // Number of lines seen so far (bumped once per generateTokens call).
    this.lnum = 0;
    // Nesting depth of open ( [ { brackets.
    this.parenlev = 0;
    // True right after a backslash line continuation.
    this.continued = false;
    // Characters that classify a token as a name / a number by its first char.
    this.namechars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
    this.numchars = "0123456789";
    // Text of a string literal that has not been closed yet ("" when none).
    this.contstr = "";
    // True when the open string must be continued with a trailing backslash.
    this.needcont = false;
    // Logical line accumulated while a string spans several lines.
    this.contline = undefined;
    // Stack of indentation columns; entry 0 is the base level.
    this.indents = [0];
    // Regex that recognises the tail end of the currently open string.
    this.endprog = /.*/;
    // [row, col] where the currently open string started.
    this.strstart = [-1, -1];
    this.interactive = interactive;

    /*
     * Called at end of input: reports one T_DEDENT per indentation level that
     * is still open, then T_ENDMARKER. Returns "done" as soon as the callback
     * returns a truthy value, "failed" otherwise.
     */
    this.doneFunc = function () {
        var level = 1;
        while (level < this.indents.length) {
            if (this.callback(Sk.Tokenizer.Tokens.T_DEDENT, "", [this.lnum, 0], [this.lnum, 0], "")) {
                return "done";
            }
            level += 1;
        }
        return this.callback(Sk.Tokenizer.Tokens.T_ENDMARKER, "", [this.lnum, 0], [this.lnum, 0], "")
            ? "done"
            : "failed";
    };
};
/**
 * Numeric codes for token types. generateTokens in this file only reports
 * T_ENDMARKER, T_NAME, T_NUMBER, T_STRING, T_NEWLINE, T_INDENT, T_DEDENT,
 * T_OP, T_COMMENT, T_NL and T_ERRORTOKEN; the remaining codes are defined
 * here but not emitted by it.
 *
 * @enum {number}
 */
Sk.Tokenizer.Tokens = {
    // structural tokens
    T_ENDMARKER: 0,
    T_NAME: 1,
    T_NUMBER: 2,
    T_STRING: 3,
    T_NEWLINE: 4,
    T_INDENT: 5,
    T_DEDENT: 6,

    // brackets and punctuation
    T_LPAR: 7,
    T_RPAR: 8,
    T_LSQB: 9,
    T_RSQB: 10,
    T_COLON: 11,
    T_COMMA: 12,
    T_SEMI: 13,

    // single-character operators
    T_PLUS: 14,
    T_MINUS: 15,
    T_STAR: 16,
    T_SLASH: 17,
    T_VBAR: 18,
    T_AMPER: 19,
    T_LESS: 20,
    T_GREATER: 21,
    T_EQUAL: 22,
    T_DOT: 23,
    T_PERCENT: 24,
    T_BACKQUOTE: 25,
    T_LBRACE: 26,
    T_RBRACE: 27,

    // comparison and multi-character operators
    T_EQEQUAL: 28,
    T_NOTEQUAL: 29,
    T_LESSEQUAL: 30,
    T_GREATEREQUAL: 31,
    T_TILDE: 32,
    T_CIRCUMFLEX: 33,
    T_LEFTSHIFT: 34,
    T_RIGHTSHIFT: 35,
    T_DOUBLESTAR: 36,

    // augmented assignment
    T_PLUSEQUAL: 37,
    T_MINEQUAL: 38,
    T_STAREQUAL: 39,
    T_SLASHEQUAL: 40,
    T_PERCENTEQUAL: 41,
    T_AMPEREQUAL: 42,
    T_VBAREQUAL: 43,
    T_CIRCUMFLEXEQUAL: 44,
    T_LEFTSHIFTEQUAL: 45,
    T_RIGHTSHIFTEQUAL: 46,
    T_DOUBLESTAREQUAL: 47,
    T_DOUBLESLASH: 48,
    T_DOUBLESLASHEQUAL: 49,

    // miscellaneous
    T_AT: 50,
    T_OP: 51,
    T_COMMENT: 52,
    T_NL: 53,
    T_RARROW: 54,
    T_ERRORTOKEN: 55,
    T_N_TOKENS: 56,
    T_NT_OFFSET: 256
};
/**
 * Join the given pattern fragments into one parenthesised alternation,
 * e.g. group("a", "b") gives "(a|b)".
 * @param {...*} x
 */
function group (x) {
    var args = Array.prototype.slice.call(arguments);
    return "(" + args.join("|") + ")";
}

/**
 * Like group(), repeated zero or more times: "(a|b)*".
 * @param {...*} x
 */
function any (x) {
    return group.apply(null, arguments) + "*";
}

/**
 * Like group(), made optional: "(a|b)?".
 * @param {...*} x
 */
function maybe (x) {
    return group.apply(null, arguments) + "?";
}

/* we have to use string and ctor to be able to build patterns up. + on /.../
 * does something strange. */

var Whitespace = "[ \\f\\t]*";
var Comment_ = "#[^\\r\\n]*";
var Ident = "[a-zA-Z_]\\w*";

var Binnumber = "0[bB][01]*";
var Hexnumber = "0[xX][\\da-fA-F]*[lL]?";
var Octnumber = "0[oO]?[0-7]*[lL]?";
var Decnumber = "[1-9]\\d*[lL]?";
var Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber);

var Exponent = "[eE][-+]?\\d+";
var Pointfloat = group("\\d+\\.\\d*", "\\.\\d+") + maybe(Exponent);
var Expfloat = "\\d+" + Exponent;
var Floatnumber = group(Pointfloat, Expfloat);
var Imagnumber = group("\\d+[jJ]", Floatnumber + "[jJ]");
var Number_ = group(Imagnumber, Floatnumber, Intnumber);

// The four "tail end" patterns below are applied with RegExp.test() to the
// text that follows an opening quote. They must all be anchored with "^":
// an unanchored pattern lets the regex engine skip ahead, so an escaped
// quote followed by two more quotes (\''') would be taken as the closing
// delimiter of a triple-quoted string.

// tail end of ' string
var Single = "^[^'\\\\]*(?:\\\\.[^'\\\\]*)*'";
// tail end of " string
var Double_ = '^[^"\\\\]*(?:\\\\.[^"\\\\]*)*"';
// tail end of ''' string
var Single3 = "^[^'\\\\]*(?:(?:\\\\.|'(?!''))[^'\\\\]*)*'''";
// tail end of """ string
var Double3 = '^[^"\\\\]*(?:(?:\\\\.|"(?!""))[^"\\\\]*)*"""';

// Opening delimiter of a triple-quoted string, with optional prefix letters.
var Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""');
// A complete single-line string literal.
var String_ = group("[uU]?[rR]?'[^\\n'\\\\]*(?:\\\\.[^\\n'\\\\]*)*'",
    '[uU]?[rR]?"[^\\n"\\\\]*(?:\\\\.[^\\n"\\\\]*)*"');

// Because of leftmost-then-longest match semantics, be sure to put the
// longest operators first (e.g., if = came before ==, == would get
// recognized as two instances of =).
var Operator = group("\\*\\*=?", ">>=?", "<<=?", "<>", "!=",
    "//=?", "->",
    "[+\\-*/%&|^=<>]=?",
    "~");

var Bracket = "[\\][(){}]";
var Special = group("\\r?\\n", "[:;.,`@]");
var Funny = group(Operator, Bracket, Special);

// First line of a (possibly continued) single-quoted string: it ends either
// with the closing quote or with a backslash followed by a newline.
var ContStr = group("[uUbB]?[rR]?'[^\\n'\\\\]*(?:\\\\.[^\\n'\\\\]*)*" +
    group("'", "\\\\\\r?\\n"),
    "[uUbB]?[rR]?\"[^\\n\"\\\\]*(?:\\\\.[^\\n\"\\\\]*)*" +
    group("\"", "\\\\\\r?\\n"));
var PseudoExtras = group("\\\\\\r?\\n", Comment_, Triple);
// Need to prefix with "^" as we only want to match what's next
var PseudoToken = "^" + group(PseudoExtras, Number_, Funny, ContStr, Ident);
// Lookup tables of every accepted string opener: an optional prefix
// (r, u, b, ur, br in any letter case) followed by the quote characters.
// triple_quoted holds the ''' and """ openers, single_quoted the ' and " ones.
var triple_quoted = {};
var single_quoted = {};

(function () {
    var prefixes = ["", "r", "R", "u", "U", "b", "B",
        "ur", "Ur", "uR", "UR", "br", "Br", "bR", "BR"];
    var quoteChars = ["'", '"'];
    var p, q, single;

    for (p = 0; p < prefixes.length; p += 1) {
        for (q = 0; q < quoteChars.length; q += 1) {
            single = quoteChars[q];
            triple_quoted[prefixes[p] + single + single + single] = true;
            single_quoted[prefixes[p] + single] = true;
        }
    }
}());

// hack to make closure keep those objects. not sure what a better way is.
(function () {
    var key;
    for (key in triple_quoted) {
    }
    for (key in single_quoted) {
    }
}());

// Width of a tab stop, in columns.
var tabsize = 8;
/*
 * Report whether `obj` occurs in the array-like `a`, comparing with ===.
 * The scan runs from the last index down to the first.
 */
function contains (a, obj) {
    var idx;
    for (idx = a.length - 1; idx >= 0; idx -= 1) {
        if (a[idx] === obj) {
            return true;
        }
    }
    return false;
}
/*
 * Return `input` with every trailing character that appears in `what`
 * removed. Stripping stops at the first character (from the right) that is
 * not listed in `what`.
 */
function rstrip (input, what) {
    var keep = input.length;
    while (keep > 0 && what.indexOf(input.charAt(keep - 1)) !== -1) {
        keep -= 1;
    }
    return input.substring(0, keep);
}
/*
 * Tokenize one line of input and report every token through
 * this.callback(type, text, [startRow, startCol], [endRow, endCol], line).
 *
 * line: the next line of source. A falsy value (undefined or "") marks the
 *       end of input.
 *
 * Returns:
 *   - 'done' as soon as the callback returns a truthy value (abort request);
 *   - false when the line has been consumed and more input is expected;
 *   - the result of this.doneFunc() ("done" or "failed") when the end of
 *     input is reached outside any string or continued statement, or when
 *     the line consists solely of spaces, tabs and form feeds without a
 *     line terminator.
 *
 * Throws:
 *   - Sk.builtin.SyntaxError when input ends inside a multi-line string or
 *     inside a continued statement (open bracket or backslash continuation);
 *   - Sk.builtin.IndentationError when a line dedents to a column that is
 *     not on the indentation stack.
 */
Sk.Tokenizer.prototype.generateTokens = function (line) {
    var nl_pos;
    var newl;
    var initial;
    var token;
    var epos;
    var spos;
    var start;
    var pseudomatch;
    var capos;
    var comment_token;
    var endmatch, pos, column, end, max;

    // bnm - Move these definitions in this function otherwise test state is preserved between
    // calls on single3prog and double3prog causing weird errors with having multiple instances
    // of triple quoted strings in the same program.
    var pseudoprog = new RegExp(PseudoToken);
    var single3prog = new RegExp(Single3, "g");
    var double3prog = new RegExp(Double3, "g");
    // Maps a string opener (or a lone prefix letter) to the regex that finds
    // the end of that kind of string. Prefix letters map to null so that the
    // lookup chain further down falls through to the quote character.
    var endprogs = { "'": new RegExp(Single, "g"), "\"": new RegExp(Double_, "g"),
        "'''" : single3prog, '"""': double3prog,
        "r'''" : single3prog, 'r"""': double3prog,
        "u'''" : single3prog, 'u"""': double3prog,
        "b'''" : single3prog, 'b"""': double3prog,
        "ur'''" : single3prog, 'ur"""': double3prog,
        "br'''" : single3prog, 'br"""': double3prog,
        "R'''" : single3prog, 'R"""': double3prog,
        "U'''" : single3prog, 'U"""': double3prog,
        "B'''" : single3prog, 'B"""': double3prog,
        "uR'''" : single3prog, 'uR"""': double3prog,
        "Ur'''" : single3prog, 'Ur"""': double3prog,
        "UR'''" : single3prog, 'UR"""': double3prog,
        "bR'''" : single3prog, 'bR"""': double3prog,
        "Br'''" : single3prog, 'Br"""': double3prog,
        "BR'''" : single3prog, 'BR"""': double3prog,
        'r' : null, 'R': null,
        'u' : null, 'U': null,
        'b' : null, 'B': null
    };

    if (!line) {
        line = '';
    }

    this.lnum += 1;
    pos = 0;
    max = line.length;

    if (this.contstr.length > 0) {
        // We are inside a string literal that started on an earlier line.
        if (!line) {
            throw new Sk.builtin.SyntaxError("EOF in multi-line string", this.filename, this.strstart[0], this.strstart[1], this.contline);
        }
        this.endprog.lastIndex = 0;
        endmatch = this.endprog.test(line);
        if (endmatch) {
            // The string closes on this line; lastIndex is the column just
            // past the closing quote. Tokenizing resumes from there below.
            pos = end = this.endprog.lastIndex;
            if (this.callback(Sk.Tokenizer.Tokens.T_STRING, this.contstr + line.substring(0, end),
                this.strstart, [this.lnum, end], this.contline + line)) {
                return 'done';
            }
            this.contstr = '';
            this.needcont = false;
            this.contline = undefined;
        }
        else if (this.needcont && line.substring(line.length - 2) !== "\\\n" && line.substring(line.length - 3) !== "\\\r\n") {
            // A single-quoted string was continued with a backslash but this
            // line neither closes it nor continues it again.
            if (this.callback(Sk.Tokenizer.Tokens.T_ERRORTOKEN, this.contstr + line,
                this.strstart, [this.lnum, line.length], this.contline)) {
                return 'done';
            }
            this.contstr = '';
            this.contline = undefined;
            return false;
        }
        else {
            // Still inside the string: accumulate and wait for the next line.
            this.contstr += line;
            this.contline = this.contline + line;
            return false;
        }
    }
    else if (this.parenlev === 0 && !this.continued) {
        // Start of a new statement: measure indentation.
        if (!line) {
            return this.doneFunc();
        }
        column = 0;
        while (pos < max) {
            if (line.charAt(pos) === ' ') {
                column += 1;
            }
            else if (line.charAt(pos) === '\t') {
                // Advance to the next tab stop. The quotient must be an
                // integer (tokenize.py uses column // tabsize); plain "/"
                // would produce fractional columns such as 12 for "    \t".
                column = (Math.floor(column / tabsize) + 1) * tabsize;
            }
            else if (line.charAt(pos) === '\f') {
                column = 0;
            }
            else {
                break;
            }
            pos = pos + 1;
        }
        if (pos === max) {
            // Nothing but indentation characters and no line terminator.
            return this.doneFunc();
        }

        if ("#\r\n".indexOf(line.charAt(pos)) !== -1) // skip comments or blank lines
        {
            if (line.charAt(pos) === '#') {
                comment_token = rstrip(line.substring(pos), '\r\n');
                nl_pos = pos + comment_token.length;
                if (this.callback(Sk.Tokenizer.Tokens.T_COMMENT, comment_token,
                    [this.lnum, pos], [this.lnum, pos + comment_token.length], line)) {
                    return 'done';
                }
                if (this.callback(Sk.Tokenizer.Tokens.T_NL, line.substring(nl_pos),
                    [this.lnum, nl_pos], [this.lnum, line.length], line)) {
                    return 'done';
                }
                return false;
            }
            else {
                if (this.callback(Sk.Tokenizer.Tokens.T_NL, line.substring(pos),
                    [this.lnum, pos], [this.lnum, line.length], line)) {
                    return 'done';
                }
                // In interactive mode a blank line falls through to the
                // indent/dedent handling and the token loop below.
                if (!this.interactive) {
                    return false;
                }
            }
        }

        if (column > this.indents[this.indents.length - 1]) // count indents or dedents
        {
            this.indents.push(column);
            if (this.callback(Sk.Tokenizer.Tokens.T_INDENT, line.substring(0, pos), [this.lnum, 0], [this.lnum, pos], line)) {
                return 'done';
            }
        }
        while (column < this.indents[this.indents.length - 1]) {
            if (!contains(this.indents, column)) {
                throw new Sk.builtin.IndentationError("unindent does not match any outer indentation level",
                    this.filename, this.lnum, pos, line);
            }
            this.indents.splice(this.indents.length - 1, 1);
            if (this.callback(Sk.Tokenizer.Tokens.T_DEDENT, '', [this.lnum, pos], [this.lnum, pos], line)) {
                return 'done';
            }
        }
    }
    else // continued statement
    {
        if (!line) {
            throw new Sk.builtin.SyntaxError("EOF in multi-line statement", this.filename, this.lnum, 0, line);
        }
        this.continued = false;
    }

    while (pos < max) {
        // js regexes don't return any info about matches, other than the
        // content. we'd like to put a \w+ before pseudomatch, but then we
        // can't get any data
        capos = line.charAt(pos);
        while (capos === ' ' || capos === '\f' || capos === '\t') {
            pos += 1;
            capos = line.charAt(pos);
        }
        // NOTE(review): if only whitespace remained, pos now equals max, the
        // match below fails and an ERRORTOKEN with empty text is reported.
        pseudoprog.lastIndex = 0;
        pseudomatch = pseudoprog.exec(line.substring(pos));
        if (pseudomatch) {
            start = pos;
            end = start + pseudomatch[1].length;
            spos = [this.lnum, start];
            epos = [this.lnum, end];
            pos = end;
            token = line.substring(start, end);
            initial = line.charAt(start);
            if (this.numchars.indexOf(initial) !== -1 || (initial === '.' && token !== '.')) {
                if (this.callback(Sk.Tokenizer.Tokens.T_NUMBER, token, spos, epos, line)) {
                    return 'done';
                }
            }
            else if (initial === '\r' || initial === '\n') {
                // Inside brackets a line break does not end the statement.
                newl = Sk.Tokenizer.Tokens.T_NEWLINE;
                if (this.parenlev > 0) {
                    newl = Sk.Tokenizer.Tokens.T_NL;
                }
                if (this.callback(newl, token, spos, epos, line)) {
                    return 'done';
                }
            }
            else if (initial === '#') {
                if (this.callback(Sk.Tokenizer.Tokens.T_COMMENT, token, spos, epos, line)) {
                    return 'done';
                }
            }
            else if (triple_quoted.hasOwnProperty(token)) {
                // token is only the opener; look for the closing delimiter
                // in the remainder of this line.
                this.endprog = endprogs[token];
                this.endprog.lastIndex = 0;
                endmatch = this.endprog.test(line.substring(pos));
                if (endmatch) {
                    pos = this.endprog.lastIndex + pos;
                    token = line.substring(start, pos);
                    if (this.callback(Sk.Tokenizer.Tokens.T_STRING, token, spos, [this.lnum, pos], line)) {
                        return 'done';
                    }
                }
                else {
                    // The string continues on following lines.
                    this.strstart = [this.lnum, start];
                    this.contstr = line.substring(start);
                    this.contline = line;
                    return false;
                }
            }
            else if (single_quoted.hasOwnProperty(initial) ||
                single_quoted.hasOwnProperty(token.substring(0, 2)) ||
                single_quoted.hasOwnProperty(token.substring(0, 3))) {
                if (token[token.length - 1] === '\n') {
                    // The literal ends in backslash-newline: it continues on
                    // the next line. Prefix letters map to null in endprogs,
                    // so this picks the regex for the actual quote character.
                    this.strstart = [this.lnum, start];
                    this.endprog = endprogs[initial] || endprogs[token[1]] || endprogs[token[2]];
                    this.contstr = line.substring(start);
                    this.needcont = true;
                    this.contline = line;
                    return false;
                }
                else {
                    if (this.callback(Sk.Tokenizer.Tokens.T_STRING, token, spos, epos, line)) {
                        return 'done';
                    }
                }
            }
            else if (this.namechars.indexOf(initial) !== -1) {
                if (this.callback(Sk.Tokenizer.Tokens.T_NAME, token, spos, epos, line)) {
                    return 'done';
                }
            }
            else if (initial === '\\') {
                // Backslash line continuation: the next line belongs to the
                // same statement.
                if (this.callback(Sk.Tokenizer.Tokens.T_NL, token, spos, [this.lnum, pos], line)) {
                    return 'done';
                }
                this.continued = true;
            }
            else {
                // Operator or bracket; track bracket nesting depth.
                if ('([{'.indexOf(initial) !== -1) {
                    this.parenlev += 1;
                }
                else if (')]}'.indexOf(initial) !== -1) {
                    this.parenlev -= 1;
                }
                if (this.callback(Sk.Tokenizer.Tokens.T_OP, token, spos, epos, line)) {
                    return 'done';
                }
            }
        }
        else {
            // Nothing recognisable here: report the character and move on.
            if (this.callback(Sk.Tokenizer.Tokens.T_ERRORTOKEN, line.charAt(pos),
                [this.lnum, pos], [this.lnum, pos + 1], line)) {
                return 'done';
            }
            pos += 1;
        }
    }

    return false;
};
/*
 * Reverse lookup for Sk.Tokenizer.Tokens: numeric token code -> name.
 */
Sk.Tokenizer.tokenNames = {
    0: 'T_ENDMARKER',
    1: 'T_NAME',
    2: 'T_NUMBER',
    3: 'T_STRING',
    4: 'T_NEWLINE',
    5: 'T_INDENT',
    6: 'T_DEDENT',
    7: 'T_LPAR',
    8: 'T_RPAR',
    9: 'T_LSQB',
    10: 'T_RSQB',
    11: 'T_COLON',
    12: 'T_COMMA',
    13: 'T_SEMI',
    14: 'T_PLUS',
    15: 'T_MINUS',
    16: 'T_STAR',
    17: 'T_SLASH',
    18: 'T_VBAR',
    19: 'T_AMPER',
    20: 'T_LESS',
    21: 'T_GREATER',
    22: 'T_EQUAL',
    23: 'T_DOT',
    24: 'T_PERCENT',
    25: 'T_BACKQUOTE',
    26: 'T_LBRACE',
    27: 'T_RBRACE',
    28: 'T_EQEQUAL',
    29: 'T_NOTEQUAL',
    30: 'T_LESSEQUAL',
    31: 'T_GREATEREQUAL',
    32: 'T_TILDE',
    33: 'T_CIRCUMFLEX',
    34: 'T_LEFTSHIFT',
    35: 'T_RIGHTSHIFT',
    36: 'T_DOUBLESTAR',
    37: 'T_PLUSEQUAL',
    38: 'T_MINEQUAL',
    39: 'T_STAREQUAL',
    40: 'T_SLASHEQUAL',
    41: 'T_PERCENTEQUAL',
    42: 'T_AMPEREQUAL',
    43: 'T_VBAREQUAL',
    44: 'T_CIRCUMFLEXEQUAL',
    45: 'T_LEFTSHIFTEQUAL',
    46: 'T_RIGHTSHIFTEQUAL',
    47: 'T_DOUBLESTAREQUAL',
    48: 'T_DOUBLESLASH',
    49: 'T_DOUBLESLASHEQUAL',
    50: 'T_AT',
    51: 'T_OP',
    52: 'T_COMMENT',
    53: 'T_NL',
    54: 'T_RARROW',
    55: 'T_ERRORTOKEN',
    56: 'T_N_TOKENS',
    256: 'T_NT_OFFSET'
};
// Register the constructor, its generateTokens method and the tokenNames
// table with goog.exportSymbol under their full dotted names.
goog.exportSymbol("Sk.Tokenizer", Sk.Tokenizer);
goog.exportSymbol("Sk.Tokenizer.prototype.generateTokens", Sk.Tokenizer.prototype.generateTokens);
goog.exportSymbol("Sk.Tokenizer.tokenNames", Sk.Tokenizer.tokenNames);