tokenize.js 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
/*
 * This is a port of tokenize.py by Ka-Ping Yee.
 *
 * Unlike the Python original there is no readline function: input is pushed
 * into Sk.Tokenizer.prototype.generateTokens one line at a time, as a string.
 * An empty or undefined line signals that the input is finished.
 *
 * callback is called for each token with 5 args:
 * 1. the token type
 * 2. the token string
 * 3. [ start_row, start_col ]
 * 4. [ end_row, end_col ]
 * 5. logical line where the token was found, including continuation lines
 *
 * callback can return true to abort.
 *
 */
  17. /**
  18. * @constructor
  19. */
  20. Sk.Tokenizer = function (filename, interactive, callback) {
  21. this.filename = filename;
  22. this.callback = callback;
  23. this.lnum = 0;
  24. this.parenlev = 0;
  25. this.continued = false;
  26. this.namechars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
  27. this.numchars = "0123456789";
  28. this.contstr = "";
  29. this.needcont = false;
  30. this.contline = undefined;
  31. this.indents = [0];
  32. this.endprog = /.*/;
  33. this.strstart = [-1, -1];
  34. this.interactive = interactive;
  35. this.doneFunc = function () {
  36. var i;
  37. for (i = 1; i < this.indents.length; ++i) // pop remaining indent levels
  38. {
  39. if (this.callback(Sk.Tokenizer.Tokens.T_DEDENT, "", [this.lnum, 0], [this.lnum, 0], "")) {
  40. return "done";
  41. }
  42. }
  43. if (this.callback(Sk.Tokenizer.Tokens.T_ENDMARKER, "", [this.lnum, 0], [this.lnum, 0], "")) {
  44. return "done";
  45. }
  46. return "failed";
  47. };
  48. };
  49. /**
  50. * @enum {number}
  51. */
  52. Sk.Tokenizer.Tokens = {
  53. T_ENDMARKER : 0,
  54. T_NAME : 1,
  55. T_NUMBER : 2,
  56. T_STRING : 3,
  57. T_NEWLINE : 4,
  58. T_INDENT : 5,
  59. T_DEDENT : 6,
  60. T_LPAR : 7,
  61. T_RPAR : 8,
  62. T_LSQB : 9,
  63. T_RSQB : 10,
  64. T_COLON : 11,
  65. T_COMMA : 12,
  66. T_SEMI : 13,
  67. T_PLUS : 14,
  68. T_MINUS : 15,
  69. T_STAR : 16,
  70. T_SLASH : 17,
  71. T_VBAR : 18,
  72. T_AMPER : 19,
  73. T_LESS : 20,
  74. T_GREATER : 21,
  75. T_EQUAL : 22,
  76. T_DOT : 23,
  77. T_PERCENT : 24,
  78. T_BACKQUOTE : 25,
  79. T_LBRACE : 26,
  80. T_RBRACE : 27,
  81. T_EQEQUAL : 28,
  82. T_NOTEQUAL : 29,
  83. T_LESSEQUAL : 30,
  84. T_GREATEREQUAL : 31,
  85. T_TILDE : 32,
  86. T_CIRCUMFLEX : 33,
  87. T_LEFTSHIFT : 34,
  88. T_RIGHTSHIFT : 35,
  89. T_DOUBLESTAR : 36,
  90. T_PLUSEQUAL : 37,
  91. T_MINEQUAL : 38,
  92. T_STAREQUAL : 39,
  93. T_SLASHEQUAL : 40,
  94. T_PERCENTEQUAL : 41,
  95. T_AMPEREQUAL : 42,
  96. T_VBAREQUAL : 43,
  97. T_CIRCUMFLEXEQUAL : 44,
  98. T_LEFTSHIFTEQUAL : 45,
  99. T_RIGHTSHIFTEQUAL : 46,
  100. T_DOUBLESTAREQUAL : 47,
  101. T_DOUBLESLASH : 48,
  102. T_DOUBLESLASHEQUAL: 49,
  103. T_AT : 50,
  104. T_OP : 51,
  105. T_COMMENT : 52,
  106. T_NL : 53,
  107. T_RARROW : 54,
  108. T_ERRORTOKEN : 55,
  109. T_N_TOKENS : 56,
  110. T_NT_OFFSET : 256
  111. };
  112. /** @param {...*} x */
  113. function group (x) {
  114. var args = Array.prototype.slice.call(arguments);
  115. return "(" + args.join("|") + ")";
  116. }
  117. /** @param {...*} x */
  118. function any (x) {
  119. return group.apply(null, arguments) + "*";
  120. }
  121. /** @param {...*} x */
  122. function maybe (x) {
  123. return group.apply(null, arguments) + "?";
  124. }
  125. /* we have to use string and ctor to be able to build patterns up. + on /.../
  126. * does something strange. */
  127. var Whitespace = "[ \\f\\t]*";
  128. var Comment_ = "#[^\\r\\n]*";
  129. var Ident = "[a-zA-Z_]\\w*";
  130. var Binnumber = "0[bB][01]*";
  131. var Hexnumber = "0[xX][\\da-fA-F]*[lL]?";
  132. var Octnumber = "0[oO]?[0-7]*[lL]?";
  133. var Decnumber = "[1-9]\\d*[lL]?";
  134. var Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber);
  135. var Exponent = "[eE][-+]?\\d+";
  136. var Pointfloat = group("\\d+\\.\\d*", "\\.\\d+") + maybe(Exponent);
  137. var Expfloat = "\\d+" + Exponent;
  138. var Floatnumber = group(Pointfloat, Expfloat);
  139. var Imagnumber = group("\\d+[jJ]", Floatnumber + "[jJ]");
  140. var Number_ = group(Imagnumber, Floatnumber, Intnumber);
  141. // tail end of ' string
  142. var Single = "^[^'\\\\]*(?:\\\\.[^'\\\\]*)*'";
  143. // tail end of " string
  144. var Double_ = '^[^"\\\\]*(?:\\\\.[^"\\\\]*)*"';
  145. // tail end of ''' string
  146. var Single3 = "[^'\\\\]*(?:(?:\\\\.|'(?!''))[^'\\\\]*)*'''";
  147. // tail end of """ string
  148. var Double3 = '[^"\\\\]*(?:(?:\\\\.|"(?!""))[^"\\\\]*)*"""';
  149. var Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""');
  150. var String_ = group("[uU]?[rR]?'[^\\n'\\\\]*(?:\\\\.[^\\n'\\\\]*)*'",
  151. '[uU]?[rR]?"[^\\n"\\\\]*(?:\\\\.[^\\n"\\\\]*)*"');
  152. // Because of leftmost-then-longest match semantics, be sure to put the
  153. // longest operators first (e.g., if = came before ==, == would get
  154. // recognized as two instances of =).
  155. var Operator = group("\\*\\*=?", ">>=?", "<<=?", "<>", "!=",
  156. "//=?", "->",
  157. "[+\\-*/%&|^=<>]=?",
  158. "~");
  159. var Bracket = "[\\][(){}]";
  160. var Special = group("\\r?\\n", "[:;.,`@]");
  161. var Funny = group(Operator, Bracket, Special);
  162. var ContStr = group("[uUbB]?[rR]?'[^\\n'\\\\]*(?:\\\\.[^\\n'\\\\]*)*" +
  163. group("'", "\\\\\\r?\\n"),
  164. "[uUbB]?[rR]?\"[^\\n\"\\\\]*(?:\\\\.[^\\n\"\\\\]*)*" +
  165. group("\"", "\\\\\\r?\\n"));
  166. var PseudoExtras = group("\\\\\\r?\\n", Comment_, Triple);
  167. // Need to prefix with "^" as we only want to match what's next
  168. var PseudoToken = "^" + group(PseudoExtras, Number_, Funny, ContStr, Ident);
  169. var triple_quoted = {
  170. "'''" : true, '"""': true,
  171. "r'''" : true, 'r"""': true, "R'''": true, 'R"""': true,
  172. "u'''" : true, 'u"""': true, "U'''": true, 'U"""': true,
  173. "b'''" : true, 'b"""': true, "B'''": true, 'B"""': true,
  174. "ur'''": true, 'ur"""': true, "Ur'''": true, 'Ur"""': true,
  175. "uR'''": true, 'uR"""': true, "UR'''": true, 'UR"""': true,
  176. "br'''": true, 'br"""': true, "Br'''": true, 'Br"""': true,
  177. "bR'''": true, 'bR"""': true, "BR'''": true, 'BR"""': true
  178. };
  179. var single_quoted = {
  180. "'" : true, '"': true,
  181. "r'" : true, 'r"': true, "R'": true, 'R"': true,
  182. "u'" : true, 'u"': true, "U'": true, 'U"': true,
  183. "b'" : true, 'b"': true, "B'": true, 'B"': true,
  184. "ur'": true, 'ur"': true, "Ur'": true, 'Ur"': true,
  185. "uR'": true, 'uR"': true, "UR'": true, 'UR"': true,
  186. "br'": true, 'br"': true, "Br'": true, 'Br"': true,
  187. "bR'": true, 'bR"': true, "BR'": true, 'BR"': true
  188. };
  189. // hack to make closure keep those objects. not sure what a better way is.
  190. (function () {
  191. var k;
  192. for (k in triple_quoted) {
  193. }
  194. for (k in single_quoted) {
  195. }
  196. }());
  197. var tabsize = 8;
  198. function contains (a, obj) {
  199. var i = a.length;
  200. while (i--) {
  201. if (a[i] === obj) {
  202. return true;
  203. }
  204. }
  205. return false;
  206. }
  207. function rstrip (input, what) {
  208. var i;
  209. for (i = input.length; i > 0; --i) {
  210. if (what.indexOf(input.charAt(i - 1)) === -1) {
  211. break;
  212. }
  213. }
  214. return input.substring(0, i);
  215. }
  216. Sk.Tokenizer.prototype.generateTokens = function (line) {
  217. var nl_pos;
  218. var newl;
  219. var initial;
  220. var token;
  221. var epos;
  222. var spos;
  223. var start;
  224. var pseudomatch;
  225. var capos;
  226. var comment_token;
  227. var endmatch, pos, column, end, max;
  228. // bnm - Move these definitions in this function otherwise test state is preserved between
  229. // calls on single3prog and double3prog causing weird errors with having multiple instances
  230. // of triple quoted strings in the same program.
  231. var pseudoprog = new RegExp(PseudoToken);
  232. var single3prog = new RegExp(Single3, "g");
  233. var double3prog = new RegExp(Double3, "g");
  234. var endprogs = { "'": new RegExp(Single, "g"), "\"": new RegExp(Double_, "g"),
  235. "'''" : single3prog, '"""': double3prog,
  236. "r'''" : single3prog, 'r"""': double3prog,
  237. "u'''" : single3prog, 'u"""': double3prog,
  238. "b'''" : single3prog, 'b"""': double3prog,
  239. "ur'''" : single3prog, 'ur"""': double3prog,
  240. "br'''" : single3prog, 'br"""': double3prog,
  241. "R'''" : single3prog, 'R"""': double3prog,
  242. "U'''" : single3prog, 'U"""': double3prog,
  243. "B'''" : single3prog, 'B"""': double3prog,
  244. "uR'''" : single3prog, 'uR"""': double3prog,
  245. "Ur'''" : single3prog, 'Ur"""': double3prog,
  246. "UR'''" : single3prog, 'UR"""': double3prog,
  247. "bR'''" : single3prog, 'bR"""': double3prog,
  248. "Br'''" : single3prog, 'Br"""': double3prog,
  249. "BR'''" : single3prog, 'BR"""': double3prog,
  250. 'r' : null, 'R': null,
  251. 'u' : null, 'U': null,
  252. 'b' : null, 'B': null
  253. };
  254. if (!line) {
  255. line = '';
  256. }
  257. //print("LINE:'"+line+"'");
  258. this.lnum += 1;
  259. pos = 0;
  260. max = line.length;
  261. if (this.contstr.length > 0) {
  262. if (!line) {
  263. throw new Sk.builtin.SyntaxError("EOF in multi-line string", this.filename, this.strstart[0], this.strstart[1], this.contline);
  264. }
  265. this.endprog.lastIndex = 0;
  266. endmatch = this.endprog.test(line);
  267. if (endmatch) {
  268. pos = end = this.endprog.lastIndex;
  269. if (this.callback(Sk.Tokenizer.Tokens.T_STRING, this.contstr + line.substring(0, end),
  270. this.strstart, [this.lnum, end], this.contline + line)) {
  271. return 'done';
  272. }
  273. this.contstr = '';
  274. this.needcont = false;
  275. this.contline = undefined;
  276. }
  277. else if (this.needcont && line.substring(line.length - 2) !== "\\\n" && line.substring(line.length - 3) !== "\\\r\n") {
  278. if (this.callback(Sk.Tokenizer.Tokens.T_ERRORTOKEN, this.contstr + line,
  279. this.strstart, [this.lnum, line.length], this.contline)) {
  280. return 'done';
  281. }
  282. this.contstr = '';
  283. this.contline = undefined;
  284. return false;
  285. }
  286. else {
  287. this.contstr += line;
  288. this.contline = this.contline + line;
  289. return false;
  290. }
  291. }
  292. else if (this.parenlev === 0 && !this.continued) {
  293. if (!line) {
  294. return this.doneFunc();
  295. }
  296. column = 0;
  297. while (pos < max) {
  298. if (line.charAt(pos) === ' ') {
  299. column += 1;
  300. }
  301. else if (line.charAt(pos) === '\t') {
  302. column = (column / tabsize + 1) * tabsize;
  303. }
  304. else if (line.charAt(pos) === '\f') {
  305. column = 0;
  306. }
  307. else {
  308. break;
  309. }
  310. pos = pos + 1;
  311. }
  312. if (pos === max) {
  313. return this.doneFunc();
  314. }
  315. if ("#\r\n".indexOf(line.charAt(pos)) !== -1) // skip comments or blank lines
  316. {
  317. if (line.charAt(pos) === '#') {
  318. comment_token = rstrip(line.substring(pos), '\r\n');
  319. nl_pos = pos + comment_token.length;
  320. if (this.callback(Sk.Tokenizer.Tokens.T_COMMENT, comment_token,
  321. [this.lnum, pos], [this.lnum, pos + comment_token.length], line)) {
  322. return 'done';
  323. }
  324. //print("HERE:1");
  325. if (this.callback(Sk.Tokenizer.Tokens.T_NL, line.substring(nl_pos),
  326. [this.lnum, nl_pos], [this.lnum, line.length], line)) {
  327. return 'done';
  328. }
  329. return false;
  330. }
  331. else {
  332. //print("HERE:2");
  333. if (this.callback(Sk.Tokenizer.Tokens.T_NL, line.substring(pos),
  334. [this.lnum, pos], [this.lnum, line.length], line)) {
  335. return 'done';
  336. }
  337. if (!this.interactive) {
  338. return false;
  339. }
  340. }
  341. }
  342. if (column > this.indents[this.indents.length - 1]) // count indents or dedents
  343. {
  344. this.indents.push(column);
  345. if (this.callback(Sk.Tokenizer.Tokens.T_INDENT, line.substring(0, pos), [this.lnum, 0], [this.lnum, pos], line)) {
  346. return 'done';
  347. }
  348. }
  349. while (column < this.indents[this.indents.length - 1]) {
  350. if (!contains(this.indents, column)) {
  351. throw new Sk.builtin.IndentationError("unindent does not match any outer indentation level",
  352. this.filename, this.lnum, pos, line);
  353. }
  354. this.indents.splice(this.indents.length - 1, 1);
  355. //print("dedent here");
  356. if (this.callback(Sk.Tokenizer.Tokens.T_DEDENT, '', [this.lnum, pos], [this.lnum, pos], line)) {
  357. return 'done';
  358. }
  359. }
  360. }
  361. else // continued statement
  362. {
  363. if (!line) {
  364. throw new Sk.builtin.SyntaxError("EOF in multi-line statement", this.filename, this.lnum, 0, line);
  365. }
  366. this.continued = false;
  367. }
  368. while (pos < max) {
  369. //print("pos:"+pos+":"+max);
  370. // js regexes don't return any info about matches, other than the
  371. // content. we'd like to put a \w+ before pseudomatch, but then we
  372. // can't get any data
  373. capos = line.charAt(pos);
  374. while (capos === ' ' || capos === '\f' || capos === '\t') {
  375. pos += 1;
  376. capos = line.charAt(pos);
  377. }
  378. pseudoprog.lastIndex = 0;
  379. pseudomatch = pseudoprog.exec(line.substring(pos));
  380. if (pseudomatch) {
  381. start = pos;
  382. end = start + pseudomatch[1].length;
  383. spos = [this.lnum, start];
  384. epos = [this.lnum, end];
  385. pos = end;
  386. token = line.substring(start, end);
  387. initial = line.charAt(start);
  388. //Sk.debugout("token:",token, "initial:",initial, start, end);
  389. if (this.numchars.indexOf(initial) !== -1 || (initial === '.' && token !== '.')) {
  390. if (this.callback(Sk.Tokenizer.Tokens.T_NUMBER, token, spos, epos, line)) {
  391. return 'done';
  392. }
  393. }
  394. else if (initial === '\r' || initial === '\n') {
  395. newl = Sk.Tokenizer.Tokens.T_NEWLINE;
  396. //print("HERE:3");
  397. if (this.parenlev > 0) {
  398. newl = Sk.Tokenizer.Tokens.T_NL;
  399. }
  400. if (this.callback(newl, token, spos, epos, line)) {
  401. return 'done';
  402. }
  403. }
  404. else if (initial === '#') {
  405. if (this.callback(Sk.Tokenizer.Tokens.T_COMMENT, token, spos, epos, line)) {
  406. return 'done';
  407. }
  408. }
  409. else if (triple_quoted.hasOwnProperty(token)) {
  410. this.endprog = endprogs[token];
  411. this.endprog.lastIndex = 0;
  412. endmatch = this.endprog.test(line.substring(pos));
  413. if (endmatch) {
  414. pos = this.endprog.lastIndex + pos;
  415. token = line.substring(start, pos);
  416. if (this.callback(Sk.Tokenizer.Tokens.T_STRING, token, spos, [this.lnum, pos], line)) {
  417. return 'done';
  418. }
  419. }
  420. else {
  421. this.strstart = [this.lnum, start];
  422. this.contstr = line.substring(start);
  423. this.contline = line;
  424. return false;
  425. }
  426. }
  427. else if (single_quoted.hasOwnProperty(initial) ||
  428. single_quoted.hasOwnProperty(token.substring(0, 2)) ||
  429. single_quoted.hasOwnProperty(token.substring(0, 3))) {
  430. if (token[token.length - 1] === '\n') {
  431. this.strstart = [this.lnum, start];
  432. this.endprog = endprogs[initial] || endprogs[token[1]] || endprogs[token[2]];
  433. this.contstr = line.substring(start);
  434. this.needcont = true;
  435. this.contline = line;
  436. //print("i, t1, t2", initial, token[1], token[2]);
  437. //print("ep, cs", this.endprog, this.contstr);
  438. return false;
  439. }
  440. else {
  441. if (this.callback(Sk.Tokenizer.Tokens.T_STRING, token, spos, epos, line)) {
  442. return 'done';
  443. }
  444. }
  445. }
  446. else if (this.namechars.indexOf(initial) !== -1) {
  447. if (this.callback(Sk.Tokenizer.Tokens.T_NAME, token, spos, epos, line)) {
  448. return 'done';
  449. }
  450. }
  451. else if (initial === '\\') {
  452. //print("HERE:4");
  453. if (this.callback(Sk.Tokenizer.Tokens.T_NL, token, spos, [this.lnum, pos], line)) {
  454. return 'done';
  455. }
  456. this.continued = true;
  457. }
  458. else {
  459. if ('([{'.indexOf(initial) !== -1) {
  460. this.parenlev += 1;
  461. }
  462. else if (')]}'.indexOf(initial) !== -1) {
  463. this.parenlev -= 1;
  464. }
  465. if (this.callback(Sk.Tokenizer.Tokens.T_OP, token, spos, epos, line)) {
  466. return 'done';
  467. }
  468. }
  469. }
  470. else {
  471. if (this.callback(Sk.Tokenizer.Tokens.T_ERRORTOKEN, line.charAt(pos),
  472. [this.lnum, pos], [this.lnum, pos + 1], line)) {
  473. return 'done';
  474. }
  475. pos += 1;
  476. }
  477. }
  478. return false;
  479. };
  480. Sk.Tokenizer.tokenNames = {
  481. 0 : 'T_ENDMARKER', 1: 'T_NAME', 2: 'T_NUMBER', 3: 'T_STRING', 4: 'T_NEWLINE',
  482. 5 : 'T_INDENT', 6: 'T_DEDENT', 7: 'T_LPAR', 8: 'T_RPAR', 9: 'T_LSQB',
  483. 10 : 'T_RSQB', 11: 'T_COLON', 12: 'T_COMMA', 13: 'T_SEMI', 14: 'T_PLUS',
  484. 15 : 'T_MINUS', 16: 'T_STAR', 17: 'T_SLASH', 18: 'T_VBAR', 19: 'T_AMPER',
  485. 20 : 'T_LESS', 21: 'T_GREATER', 22: 'T_EQUAL', 23: 'T_DOT', 24: 'T_PERCENT',
  486. 25 : 'T_BACKQUOTE', 26: 'T_LBRACE', 27: 'T_RBRACE', 28: 'T_EQEQUAL', 29: 'T_NOTEQUAL',
  487. 30 : 'T_LESSEQUAL', 31: 'T_GREATEREQUAL', 32: 'T_TILDE', 33: 'T_CIRCUMFLEX', 34: 'T_LEFTSHIFT',
  488. 35 : 'T_RIGHTSHIFT', 36: 'T_DOUBLESTAR', 37: 'T_PLUSEQUAL', 38: 'T_MINEQUAL', 39: 'T_STAREQUAL',
  489. 40 : 'T_SLASHEQUAL', 41: 'T_PERCENTEQUAL', 42: 'T_AMPEREQUAL', 43: 'T_VBAREQUAL', 44: 'T_CIRCUMFLEXEQUAL',
  490. 45 : 'T_LEFTSHIFTEQUAL', 46: 'T_RIGHTSHIFTEQUAL', 47: 'T_DOUBLESTAREQUAL', 48: 'T_DOUBLESLASH', 49: 'T_DOUBLESLASHEQUAL',
  491. 50 : 'T_AT', 51: 'T_OP', 52: 'T_COMMENT', 53: 'T_NL', 54: 'T_RARROW',
  492. 55 : 'T_ERRORTOKEN', 56: 'T_N_TOKENS',
  493. 256: 'T_NT_OFFSET'
  494. };
  495. goog.exportSymbol("Sk.Tokenizer", Sk.Tokenizer);
  496. goog.exportSymbol("Sk.Tokenizer.prototype.generateTokens", Sk.Tokenizer.prototype.generateTokens);
  497. goog.exportSymbol("Sk.Tokenizer.tokenNames", Sk.Tokenizer.tokenNames);