import { Tokenizer } from './Tokenizer.js';
import { defaults } from './defaults.js';
import { block, inline } from './rules.js';
import { repeatString } from './helpers.js';

/**
 * smartypants text replacement
 */
function smartypants(text) {
  return text
    // em-dashes
    .replace(/---/g, '\u2014')
    // en-dashes
    .replace(/--/g, '\u2013')
    // opening singles
    .replace(/(^|[-\u2014/(\[{"\s])'/g, '$1\u2018')
    // closing singles & apostrophes
    .replace(/'/g, '\u2019')
    // opening doubles
    .replace(/(^|[-\u2014/(\[{\u2018\s])"/g, '$1\u201c')
    // closing doubles
    .replace(/"/g, '\u201d')
    // ellipses
    .replace(/\.{3}/g, '\u2026');
}
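
// Illustrative example (added for clarity, not part of the original file):
// plain ASCII punctuation is upgraded to typographic characters, with the
// longer patterns replaced first so '---' wins over '--':
//   smartypants('"Hello" -- it\'s... done---now')
//   // => '\u201cHello\u201d \u2013 it\u2019s\u2026 done\u2014now'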

/**
 * mangle email addresses
 */
function mangle(text) {
  let out = '',
    i,
    ch;
  const l = text.length;
  for (i = 0; i < l; i++) {
    ch = text.charCodeAt(i);
    if (Math.random() > 0.5) {
      ch = 'x' + ch.toString(16);
    }
    out += '&#' + ch + ';';
  }
  return out;
}
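
// Illustrative example (added for clarity, not part of the original file):
// every character becomes a decimal or hex HTML entity, chosen at random per
// character, so mangle('a@b') may yield '&#97;&#x40;&#98;' on one call and
// '&#x61;&#64;&#x62;' on another.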

/**
 * Block Lexer
 */
export class Lexer {
  constructor(options) {
    this.tokens = [];
    this.tokens.links = Object.create(null);
    this.options = options || defaults;
    this.options.tokenizer = this.options.tokenizer || new Tokenizer();
    this.tokenizer = this.options.tokenizer;
    this.tokenizer.options = this.options;
    this.tokenizer.lexer = this;
    this.inlineQueue = [];
    this.state = {
      inLink: false,
      inRawBlock: false,
      top: true
    };

    const rules = {
      block: block.normal,
      inline: inline.normal
    };
    if (this.options.pedantic) {
      rules.block = block.pedantic;
      rules.inline = inline.pedantic;
    } else if (this.options.gfm) {
      rules.block = block.gfm;
      if (this.options.breaks) {
        rules.inline = inline.breaks;
      } else {
        rules.inline = inline.gfm;
      }
    }
    this.tokenizer.rules = rules;
  }
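
  // Note (added for clarity): rule selection above is pedantic > gfm > normal,
  // and within gfm the `breaks` option only swaps the inline rule set. For
  // example, new Lexer({ ...defaults, gfm: true, breaks: true }) tokenizes
  // with block.gfm and inline.breaks.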

  /**
   * Expose Rules
   */
  static get rules() {
    return {
      block,
      inline
    };
  }

  /**
   * Static Lex Method
   */
  static lex(src, options) {
    const lexer = new Lexer(options);
    return lexer.lex(src);
  }

  /**
   * Static Lex Inline Method
   */
  static lexInline(src, options) {
    const lexer = new Lexer(options);
    return lexer.inlineTokens(src);
  }
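
  // Illustrative usage (added for clarity, not part of the original file):
  //   const tokens = Lexer.lex('# Hi\n\nSome *text*.');
  // returns the block token array with link definitions collected on
  // `tokens.links`, while Lexer.lexInline('Some *text*.') returns inline
  // tokens only.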

  /**
   * Preprocessing
   */
  lex(src) {
    src = src
      .replace(/\r\n|\r/g, '\n')
      .replace(/\t/g, '    ');

    this.blockTokens(src, this.tokens);

    let next;
    while (next = this.inlineQueue.shift()) {
      this.inlineTokens(next.src, next.tokens);
    }

    return this.tokens;
  }
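
  // Note (added for clarity): lexing is two-pass. blockTokens runs first and
  // only queues inline work (via inline() below); the queue is drained
  // afterwards, so every link definition found during the block pass is
  // already in this.tokens.links when inline reflinks are resolved.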

  /**
   * Lexing
   */
  blockTokens(src, tokens = []) {
    if (this.options.pedantic) {
      src = src.replace(/^ +$/gm, '');
    }
    let token, lastToken, cutSrc, lastParagraphClipped;

    while (src) {
      if (this.options.extensions
        && this.options.extensions.block
        && this.options.extensions.block.some((extTokenizer) => {
          if (token = extTokenizer.call({ lexer: this }, src, tokens)) {
            src = src.substring(token.raw.length);
            tokens.push(token);
            return true;
          }
          return false;
        })) {
        continue;
      }

      // newline
      if (token = this.tokenizer.space(src)) {
        src = src.substring(token.raw.length);
        if (token.raw.length === 1 && tokens.length > 0) {
          // if there's a single \n as a spacer, it's terminating the last line,
          // so move it there so that we don't get unnecessary paragraph tags
          tokens[tokens.length - 1].raw += '\n';
        } else {
          tokens.push(token);
        }
        continue;
      }

      // code
      if (token = this.tokenizer.code(src)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        // An indented code block cannot interrupt a paragraph.
        if (lastToken && (lastToken.type === 'paragraph' || lastToken.type === 'text')) {
          lastToken.raw += '\n' + token.raw;
          lastToken.text += '\n' + token.text;
          this.inlineQueue[this.inlineQueue.length - 1].src = lastToken.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      // fences
      if (token = this.tokenizer.fences(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // heading
      if (token = this.tokenizer.heading(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // hr
      if (token = this.tokenizer.hr(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // blockquote
      if (token = this.tokenizer.blockquote(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // list
      if (token = this.tokenizer.list(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // html
      if (token = this.tokenizer.html(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // def
      if (token = this.tokenizer.def(src)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        if (lastToken && (lastToken.type === 'paragraph' || lastToken.type === 'text')) {
          lastToken.raw += '\n' + token.raw;
          lastToken.text += '\n' + token.raw;
          this.inlineQueue[this.inlineQueue.length - 1].src = lastToken.text;
        } else if (!this.tokens.links[token.tag]) {
          this.tokens.links[token.tag] = {
            href: token.href,
            title: token.title
          };
        }
        continue;
      }

      // table (gfm)
      if (token = this.tokenizer.table(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // lheading
      if (token = this.tokenizer.lheading(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // top-level paragraph
      // prevent paragraph consuming extensions by clipping 'src' to extension start
      cutSrc = src;
      if (this.options.extensions && this.options.extensions.startBlock) {
        let startIndex = Infinity;
        const tempSrc = src.slice(1);
        let tempStart;
        // arrow function so `this` is the lexer when building the hook context
        this.options.extensions.startBlock.forEach((getStartIndex) => {
          tempStart = getStartIndex.call({ lexer: this }, tempSrc);
          if (typeof tempStart === 'number' && tempStart >= 0) { startIndex = Math.min(startIndex, tempStart); }
        });
        if (startIndex < Infinity && startIndex >= 0) {
          cutSrc = src.substring(0, startIndex + 1);
        }
      }
      if (this.state.top && (token = this.tokenizer.paragraph(cutSrc))) {
        lastToken = tokens[tokens.length - 1];
        // if the previous paragraph was clipped short for an extension that
        // then didn't match, merge this continuation back into it
        if (lastParagraphClipped && lastToken.type === 'paragraph') {
          lastToken.raw += '\n' + token.raw;
          lastToken.text += '\n' + token.text;
          this.inlineQueue.pop();
          this.inlineQueue[this.inlineQueue.length - 1].src = lastToken.text;
        } else {
          tokens.push(token);
        }
        lastParagraphClipped = (cutSrc.length !== src.length);
        src = src.substring(token.raw.length);
        continue;
      }

      // text
      if (token = this.tokenizer.text(src)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        if (lastToken && lastToken.type === 'text') {
          lastToken.raw += '\n' + token.raw;
          lastToken.text += '\n' + token.text;
          this.inlineQueue.pop();
          this.inlineQueue[this.inlineQueue.length - 1].src = lastToken.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      if (src) {
        const errMsg = 'Infinite loop on byte: ' + src.charCodeAt(0);
        if (this.options.silent) {
          console.error(errMsg);
          break;
        } else {
          throw new Error(errMsg);
        }
      }
    }

    this.state.top = true;
    return tokens;
  }
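
  // A minimal sketch (hypothetical, added for clarity, not part of this file)
  // of the block extension shape the loop above consumes: a tokenizer that
  // either returns a token whose `raw` covers the consumed source or returns
  // undefined, plus an optional `start` hook that feeds startBlock for the
  // paragraph clipping above. The 'center' name and regexes are invented:
  //
  //   marked.use({
  //     extensions: [{
  //       name: 'center',
  //       level: 'block',
  //       start(src) { return src.indexOf('->'); },
  //       tokenizer(src) {
  //         const match = /^->(.+)<-/.exec(src);
  //         if (match) {
  //           return { type: 'center', raw: match[0], text: match[1] };
  //         }
  //       }
  //     }]
  //   });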

  inline(src, tokens) {
    this.inlineQueue.push({ src, tokens });
  }

  /**
   * Lexing/Compiling
   */
  inlineTokens(src, tokens = []) {
    let token, lastToken, cutSrc;

    // String with links masked to avoid interference with em and strong
    let maskedSrc = src;
    let match;
    let keepPrevChar, prevChar;

    // Mask out reflinks
    if (this.tokens.links) {
      const links = Object.keys(this.tokens.links);
      if (links.length > 0) {
        while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) != null) {
          if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) {
            maskedSrc = maskedSrc.slice(0, match.index) + '[' + repeatString('a', match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex);
          }
        }
      }
    }

    // Mask out other blocks
    while ((match = this.tokenizer.rules.inline.blockSkip.exec(maskedSrc)) != null) {
      maskedSrc = maskedSrc.slice(0, match.index) + '[' + repeatString('a', match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex);
    }

    // Mask out escaped em & strong delimiters
    while ((match = this.tokenizer.rules.inline.escapedEmSt.exec(maskedSrc)) != null) {
      maskedSrc = maskedSrc.slice(0, match.index) + '++' + maskedSrc.slice(this.tokenizer.rules.inline.escapedEmSt.lastIndex);
    }
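
    // Illustrative example (added for clarity, not part of the original
    // file): masking preserves string length but blanks out spans emStrong
    // must not look inside. With a link definition for "ref",
    //   'see [ref] and _em_'
    // is scanned as
    //   'see [aaa] and _em_'
    // and an escaped delimiter like '\_' is scanned as '++'.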

    while (src) {
      if (!keepPrevChar) {
        prevChar = '';
      }
      keepPrevChar = false;

      // extensions
      if (this.options.extensions
        && this.options.extensions.inline
        && this.options.extensions.inline.some((extTokenizer) => {
          if (token = extTokenizer.call({ lexer: this }, src, tokens)) {
            src = src.substring(token.raw.length);
            tokens.push(token);
            return true;
          }
          return false;
        })) {
        continue;
      }

      // escape
      if (token = this.tokenizer.escape(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // tag
      if (token = this.tokenizer.tag(src)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        if (lastToken && token.type === 'text' && lastToken.type === 'text') {
          lastToken.raw += token.raw;
          lastToken.text += token.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      // link
      if (token = this.tokenizer.link(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // reflink, nolink
      if (token = this.tokenizer.reflink(src, this.tokens.links)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        if (lastToken && token.type === 'text' && lastToken.type === 'text') {
          lastToken.raw += token.raw;
          lastToken.text += token.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      // em & strong
      if (token = this.tokenizer.emStrong(src, maskedSrc, prevChar)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // code
      if (token = this.tokenizer.codespan(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // br
      if (token = this.tokenizer.br(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // del (gfm)
      if (token = this.tokenizer.del(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // autolink
      if (token = this.tokenizer.autolink(src, mangle)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // url (gfm)
      if (!this.state.inLink && (token = this.tokenizer.url(src, mangle))) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // text
      // prevent inlineText consuming extensions by clipping 'src' to extension start
      cutSrc = src;
      if (this.options.extensions && this.options.extensions.startInline) {
        let startIndex = Infinity;
        const tempSrc = src.slice(1);
        let tempStart;
        // arrow function so `this` is the lexer when building the hook context
        this.options.extensions.startInline.forEach((getStartIndex) => {
          tempStart = getStartIndex.call({ lexer: this }, tempSrc);
          if (typeof tempStart === 'number' && tempStart >= 0) { startIndex = Math.min(startIndex, tempStart); }
        });
        if (startIndex < Infinity && startIndex >= 0) {
          cutSrc = src.substring(0, startIndex + 1);
        }
      }

      if (token = this.tokenizer.inlineText(cutSrc, smartypants)) {
        src = src.substring(token.raw.length);
        if (token.raw.slice(-1) !== '_') { // Track prevChar before string of ____ started
          prevChar = token.raw.slice(-1);
        }
        keepPrevChar = true;
        lastToken = tokens[tokens.length - 1];
        if (lastToken && lastToken.type === 'text') {
          lastToken.raw += token.raw;
          lastToken.text += token.text;
        } else {
          tokens.push(token);
        }
        continue;
      }
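
      // Note (added for clarity): prevChar/keepPrevChar carry the character
      // that preceded a run of delimiters into the next iteration, so
      // emStrong can apply its flanking rules, e.g. treating the underscores
      // in foo_bar_ as literal text rather than emphasis inside a word.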

      if (src) {
        const errMsg = 'Infinite loop on byte: ' + src.charCodeAt(0);
        if (this.options.silent) {
          console.error(errMsg);
          break;
        } else {
          throw new Error(errMsg);
        }
      }
    }

    return tokens;
  }
}
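
// Illustrative usage (added for clarity, hypothetical snippet, not part of
// the original file): driving the lexer directly and inspecting the result.
// The single blank-line spacer is folded into the heading's raw text, so two
// block tokens come back, and the paragraph's inline tokens are filled in by
// the queued inline pass.
//
//   import { Lexer } from './Lexer.js';
//
//   const tokens = Lexer.lex('# Title\n\nHello *world*.\n');
//   console.log(tokens[0].type);            // 'heading'
//   console.log(tokens[1].tokens[1].type);  // 'em'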