// Tokenizer.js
  1. import { htmlDecodeTree, xmlDecodeTree, BinTrieFlags, determineBranch, replaceCodePoint, } from "entities/lib/decode.js";
  /**
   * Character codes the tokenizer compares against.
   *
   * The object maps in both directions: `CharCodes.Tab === 9` and
   * `CharCodes[9] === "Tab"`.
   */
  const CharCodes = {};
  for (const [name, code] of [
      ["Tab", 9],
      ["NewLine", 10],
      ["FormFeed", 12],
      ["CarriageReturn", 13],
      ["Space", 32],
      ["ExclamationMark", 33],
      ["Num", 35],
      ["Amp", 38],
      ["SingleQuote", 39],
      ["DoubleQuote", 34],
      ["Dash", 45],
      ["Slash", 47],
      ["Zero", 48],
      ["Nine", 57],
      ["Semi", 59],
      ["Lt", 60],
      ["Eq", 61],
      ["Gt", 62],
      ["Questionmark", 63],
      ["UpperA", 65],
      ["LowerA", 97],
      ["UpperF", 70],
      ["LowerF", 102],
      ["UpperZ", 90],
      ["LowerZ", 122],
      ["LowerX", 120],
      ["OpeningSquareBracket", 91],
  ]) {
      // Forward (name -> code) and reverse (code -> name) entries.
      CharCodes[name] = code;
      CharCodes[code] = name;
  }
  /**
   * All the states the tokenizer can be in.
   *
   * States are numbered from 1 in the order listed below, and the object maps
   * in both directions (`State.Text === 1`, `State[1] === "Text"`).
   */
  const State = {};
  [
      "Text",
      "BeforeTagName",
      "InTagName",
      "InSelfClosingTag",
      "BeforeClosingTagName",
      "InClosingTagName",
      "AfterClosingTagName",
      // Attributes
      "BeforeAttributeName",
      "InAttributeName",
      "AfterAttributeName",
      "BeforeAttributeValue",
      "InAttributeValueDq",
      "InAttributeValueSq",
      "InAttributeValueNq",
      // Declarations
      "BeforeDeclaration",
      "InDeclaration",
      // Processing instructions
      "InProcessingInstruction",
      // Comments & CDATA
      "BeforeComment",
      "CDATASequence",
      "InSpecialComment",
      "InCommentLike",
      // Special tags
      "BeforeSpecialS",
      "SpecialStartSequence",
      "InSpecialTag",
      // Entities
      "BeforeEntity",
      "BeforeNumericEntity",
      "InNamedEntity",
      "InNumericEntity",
      "InHexEntity",
  ].forEach((name, position) => {
      const value = position + 1;
      State[name] = value;
      State[value] = name;
  });
  /**
   * Tells whether a character code is one of the whitespace characters the
   * tokenizer recognises: space, line feed, tab, form feed or carriage return.
   *
   * @param c Character code to test.
   * @returns `true` for whitespace, `false` otherwise.
   */
  function isWhitespace(c) {
      switch (c) {
          case CharCodes.Space:
          case CharCodes.NewLine:
          case CharCodes.Tab:
          case CharCodes.FormFeed:
          case CharCodes.CarriageReturn:
              return true;
          default:
              return false;
      }
  }
  /**
   * Tells whether a character code terminates a tag name or attribute name:
   * `/`, `>` or any whitespace character.
   *
   * @param c Character code to test.
   * @returns `true` when `c` ends the current section of a tag.
   */
  function isEndOfTagSection(c) {
      if (c === CharCodes.Gt || c === CharCodes.Slash) {
          return true;
      }
      return isWhitespace(c);
  }
  /**
   * Tells whether a character code is an ASCII decimal digit (`0`-`9`).
   *
   * @param c Character code to test.
   * @returns `true` for a decimal digit.
   */
  function isNumber(c) {
      const atLeastZero = c >= CharCodes.Zero;
      const atMostNine = c <= CharCodes.Nine;
      return atLeastZero && atMostNine;
  }
  /**
   * Tells whether a character code is an ASCII letter (`a`-`z` or `A`-`Z`).
   *
   * @param c Character code to test.
   * @returns `true` for an ASCII letter.
   */
  function isASCIIAlpha(c) {
      const isLowerCase = c >= CharCodes.LowerA && c <= CharCodes.LowerZ;
      if (isLowerCase) {
          return true;
      }
      return c >= CharCodes.UpperA && c <= CharCodes.UpperZ;
  }
  /**
   * Tells whether a character code is a hexadecimal *letter* (`a`-`f` or
   * `A`-`F`).
   *
   * Decimal digits are not matched by this function; test those with
   * `isNumber`.
   *
   * @param c Character code to test.
   * @returns `true` for a hexadecimal letter.
   */
  function isHexDigit(c) {
      const isUpperHex = c >= CharCodes.UpperA && c <= CharCodes.UpperF;
      if (isUpperHex) {
          return true;
      }
      return c >= CharCodes.LowerA && c <= CharCodes.LowerF;
  }
  /**
   * How an attribute value was quoted, as passed to the `onattribend` callback.
   *
   * Maps in both directions (`QuoteType.Double === 3`,
   * `QuoteType[3] === "Double"`).
   */
  export const QuoteType = {};
  ["NoValue", "Unquoted", "Single", "Double"].forEach((name, value) => {
      QuoteType[name] = value;
      QuoteType[value] = name;
  });
  /**
   * Sequences used to match longer strings, stored as arrays of character codes.
   *
   * We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End
   * sequences with an increased offset.
   */
  const Sequences = {};
  for (const [name, text] of [
      ["Cdata", "CDATA["],
      ["CdataEnd", "]]>"],
      ["CommentEnd", "-->"],
      ["ScriptEnd", "</script"],
      ["StyleEnd", "</style"],
      ["TitleEnd", "</title"],
  ]) {
      // Every character is ASCII, so each one becomes exactly one byte.
      Sequences[name] = Uint8Array.from(text, (ch) => ch.charCodeAt(0));
  }
  /**
   * Streaming tokenizer for HTML and XML markup.
   *
   * Input is supplied in chunks through `write()`. The tokenizer does not slice
   * the input itself: every token is reported to the `cbs` callback object as
   * start/end indices that are positions within all of the data written so far.
   *
   * Callbacks invoked on `cbs`: `ontext`, `ontextentity`, `onopentagname`,
   * `onopentagend`, `onselfclosingtag`, `onclosetag`, `onattribname`,
   * `onattribdata`, `onattribentity`, `onattribend`, `ondeclaration`,
   * `onprocessinginstruction`, `oncomment`, `oncdata` and `onend`.
   */
  export default class Tokenizer {
      /**
       * @param options Tokenizer options; may be omitted.
       * @param options.xmlMode When `true`, disables the special handling of
       *   `<script>`, `<style>` and `<title>`, accepts any non-terminating
       *   character at the start of a tag name, uses the XML entity table and
       *   disallows entities without a trailing `;`. Defaults to `false`.
       * @param options.decodeEntities Whether entities in text and attribute
       *   values are decoded. Defaults to `true`.
       * @param cbs Object whose methods receive the tokens.
       */
      constructor({ xmlMode = false, decodeEntities = true } = {}, cbs) {
          this.cbs = cbs;
          /** The current state the tokenizer is in. */
          this.state = State.Text;
          /** The read buffer. */
          this.buffer = "";
          /** The beginning of the section that is currently being read. */
          this.sectionStart = 0;
          /** The index within the buffer that we are currently looking at. */
          this.index = 0;
          /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
          this.baseState = State.Text;
          /** For special parsing behavior inside of script and style tags. */
          this.isSpecial = false;
          /** Indicates whether the tokenizer has been paused. */
          this.running = true;
          /** The offset of the current buffer. */
          this.offset = 0;
          /** The sequence currently being matched; set once a special tag, comment or CDATA section starts. */
          this.currentSequence = undefined;
          /** Position within `currentSequence` (or within `Sequences.Cdata`) matched so far. */
          this.sequenceIndex = 0;
          this.trieIndex = 0;
          this.trieCurrent = 0;
          /** For named entities, the index of the value. For numeric entities, the code point. */
          this.entityResult = 0;
          /** Number of characters consumed for the entity currently being read. */
          this.entityExcess = 0;
          this.xmlMode = xmlMode;
          this.decodeEntities = decodeEntities;
          this.entityTrie = xmlMode ? xmlDecodeTree : htmlDecodeTree;
      }
      /**
       * Returns the tokenizer to its initial state so that a new document can be
       * written.
       *
       * The special-tag flag and the sequence/entity bookkeeping are cleared as
       * well. Otherwise a reset issued inside `<script>`, `<style>` or `<title>`
       * would leave `isSpecial` set while `currentSequence` is `undefined`, and
       * the next opening tag would enter `InSpecialTag` and fail on
       * `currentSequence.length`.
       */
      reset() {
          this.state = State.Text;
          this.buffer = "";
          this.sectionStart = 0;
          this.index = 0;
          this.baseState = State.Text;
          this.isSpecial = false;
          this.currentSequence = undefined;
          this.sequenceIndex = 0;
          this.trieIndex = 0;
          this.trieCurrent = 0;
          this.entityResult = 0;
          this.entityExcess = 0;
          this.running = true;
          this.offset = 0;
      }
      /**
       * Replaces the buffer with `chunk` and tokenizes it.
       *
       * @param chunk The next piece of input.
       */
      write(chunk) {
          // Indices keep counting across chunks; remember how much came before.
          this.offset += this.buffer.length;
          this.buffer = chunk;
          this.parse();
      }
      /** Signals the end of the input. Does nothing while the tokenizer is paused. */
      end() {
          if (this.running)
              this.finish();
      }
      /** Stops tokenizing after the character currently being processed. */
      pause() {
          this.running = false;
      }
      /** Continues after `pause()`, processing whatever is left in the buffer. */
      resume() {
          this.running = true;
          if (this.index < this.buffer.length + this.offset) {
              this.parse();
          }
      }
      /**
       * The current index within all of the written data.
       */
      getIndex() {
          return this.index;
      }
      /**
       * The start of the current section.
       */
      getSectionStart() {
          return this.sectionStart;
      }
      /** Plain text: look for the start of a tag or, when decoding, an entity. */
      stateText(c) {
          if (c === CharCodes.Lt ||
              (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))) {
              if (this.index > this.sectionStart) {
                  this.cbs.ontext(this.sectionStart, this.index);
              }
              this.state = State.BeforeTagName;
              this.sectionStart = this.index;
          }
          else if (this.decodeEntities && c === CharCodes.Amp) {
              this.state = State.BeforeEntity;
          }
      }
      /** Checks whether an opening tag name is really `script`, `style` or `title`. */
      stateSpecialStartSequence(c) {
          const isEnd = this.sequenceIndex === this.currentSequence.length;
          const isMatch = isEnd
              ? // If we are at the end of the sequence, make sure the tag name has ended
                  isEndOfTagSection(c)
              : // Otherwise, do a case-insensitive comparison
                  (c | 0x20) === this.currentSequence[this.sequenceIndex];
          if (!isMatch) {
              this.isSpecial = false;
          }
          else if (!isEnd) {
              this.sequenceIndex++;
              return;
          }
          this.sequenceIndex = 0;
          this.state = State.InTagName;
          this.stateInTagName(c);
      }
      /** Look for an end tag. For <title> tags, also decode entities. */
      stateInSpecialTag(c) {
          if (this.sequenceIndex === this.currentSequence.length) {
              if (c === CharCodes.Gt || isWhitespace(c)) {
                  const endOfText = this.index - this.currentSequence.length;
                  if (this.sectionStart < endOfText) {
                      // Spoof the index so that reported locations match up.
                      const actualIndex = this.index;
                      this.index = endOfText;
                      this.cbs.ontext(this.sectionStart, endOfText);
                      this.index = actualIndex;
                  }
                  this.isSpecial = false;
                  this.sectionStart = endOfText + 2; // Skip over the `</`
                  this.stateInClosingTagName(c);
                  return; // We are done; skip the rest of the function.
              }
              this.sequenceIndex = 0;
          }
          if ((c | 0x20) === this.currentSequence[this.sequenceIndex]) {
              this.sequenceIndex += 1;
          }
          else if (this.sequenceIndex === 0) {
              if (this.currentSequence === Sequences.TitleEnd) {
                  // We have to parse entities in <title> tags.
                  if (this.decodeEntities && c === CharCodes.Amp) {
                      this.state = State.BeforeEntity;
                  }
              }
              else if (this.fastForwardTo(CharCodes.Lt)) {
                  // Outside of <title> tags, we can fast-forward.
                  this.sequenceIndex = 1;
              }
          }
          else {
              // If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`.
              this.sequenceIndex = Number(c === CharCodes.Lt);
          }
      }
      /** Matches the `CDATA[` that follows `<![`; anything else is a declaration. */
      stateCDATASequence(c) {
          if (c === Sequences.Cdata[this.sequenceIndex]) {
              if (++this.sequenceIndex === Sequences.Cdata.length) {
                  this.state = State.InCommentLike;
                  this.currentSequence = Sequences.CdataEnd;
                  this.sequenceIndex = 0;
                  this.sectionStart = this.index + 1;
              }
          }
          else {
              this.sequenceIndex = 0;
              this.state = State.InDeclaration;
              this.stateInDeclaration(c); // Reconsume the character
          }
      }
      /**
       * When we wait for one specific character, we can speed things up
       * by skipping through the buffer until we find it.
       *
       * @returns Whether the character was found.
       */
      fastForwardTo(c) {
          while (++this.index < this.buffer.length + this.offset) {
              if (this.buffer.charCodeAt(this.index - this.offset) === c) {
                  return true;
              }
          }
          /*
           * We increment the index at the end of the `parse` loop,
           * so set it to `buffer.length - 1` here.
           *
           * TODO: Refactor `parse` to increment index before calling states.
           */
          this.index = this.buffer.length + this.offset - 1;
          return false;
      }
      /**
       * Comments and CDATA end with `-->` and `]]>`.
       *
       * Their common qualities are:
       * - Their end sequences have a distinct character they start with.
       * - That character is then repeated, so we have to check multiple repeats.
       * - All characters but the start character of the sequence can be skipped.
       */
      stateInCommentLike(c) {
          if (c === this.currentSequence[this.sequenceIndex]) {
              if (++this.sequenceIndex === this.currentSequence.length) {
                  if (this.currentSequence === Sequences.CdataEnd) {
                      this.cbs.oncdata(this.sectionStart, this.index, 2);
                  }
                  else {
                      this.cbs.oncomment(this.sectionStart, this.index, 2);
                  }
                  this.sequenceIndex = 0;
                  this.sectionStart = this.index + 1;
                  this.state = State.Text;
              }
          }
          else if (this.sequenceIndex === 0) {
              // Fast-forward to the first character of the sequence
              if (this.fastForwardTo(this.currentSequence[0])) {
                  this.sequenceIndex = 1;
              }
          }
          else if (c !== this.currentSequence[this.sequenceIndex - 1]) {
              // Allow long sequences, eg. --->, ]]]>
              this.sequenceIndex = 0;
          }
      }
      /**
       * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
       *
       * XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar).
       * We allow anything that wouldn't end the tag.
       */
      isTagStartChar(c) {
          return this.xmlMode ? !isEndOfTagSection(c) : isASCIIAlpha(c);
      }
      /**
       * Starts matching a special tag name.
       *
       * @param sequence One of the `*End` sequences.
       * @param offset Index within `sequence` at which matching continues.
       */
      startSpecial(sequence, offset) {
          this.isSpecial = true;
          this.currentSequence = sequence;
          this.sequenceIndex = offset;
          this.state = State.SpecialStartSequence;
      }
      /** Directly after `<`: decide which kind of markup follows. */
      stateBeforeTagName(c) {
          if (c === CharCodes.ExclamationMark) {
              this.state = State.BeforeDeclaration;
              this.sectionStart = this.index + 1;
          }
          else if (c === CharCodes.Questionmark) {
              this.state = State.InProcessingInstruction;
              this.sectionStart = this.index + 1;
          }
          else if (this.isTagStartChar(c)) {
              const lower = c | 0x20;
              this.sectionStart = this.index;
              if (!this.xmlMode && lower === Sequences.TitleEnd[2]) {
                  // Tag name starts with `t`; continue after `</t` in `</title`.
                  this.startSpecial(Sequences.TitleEnd, 3);
              }
              else {
                  // A leading `s` might begin `script` or `style`.
                  this.state =
                      !this.xmlMode && lower === Sequences.ScriptEnd[2]
                          ? State.BeforeSpecialS
                          : State.InTagName;
              }
          }
          else if (c === CharCodes.Slash) {
              this.state = State.BeforeClosingTagName;
          }
          else {
              this.state = State.Text;
              this.stateText(c);
          }
      }
      /** Inside an opening tag name: wait for its end and report it. */
      stateInTagName(c) {
          if (isEndOfTagSection(c)) {
              this.cbs.onopentagname(this.sectionStart, this.index);
              this.sectionStart = -1;
              this.state = State.BeforeAttributeName;
              this.stateBeforeAttributeName(c);
          }
      }
      /** Directly after `</`. */
      stateBeforeClosingTagName(c) {
          if (isWhitespace(c)) {
              // Ignore
          }
          else if (c === CharCodes.Gt) {
              this.state = State.Text;
          }
          else {
              this.state = this.isTagStartChar(c)
                  ? State.InClosingTagName
                  : State.InSpecialComment;
              this.sectionStart = this.index;
          }
      }
      /** Inside a closing tag name: wait for its end and report it. */
      stateInClosingTagName(c) {
          if (c === CharCodes.Gt || isWhitespace(c)) {
              this.cbs.onclosetag(this.sectionStart, this.index);
              this.sectionStart = -1;
              this.state = State.AfterClosingTagName;
              this.stateAfterClosingTagName(c);
          }
      }
      /** After a closing tag name: discard everything up to `>`. */
      stateAfterClosingTagName(c) {
          // Skip everything until ">"
          if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
              this.state = State.Text;
              this.sectionStart = this.index + 1;
          }
      }
      /** Between attributes: the tag ends, self-closes, or another attribute starts. */
      stateBeforeAttributeName(c) {
          if (c === CharCodes.Gt) {
              this.cbs.onopentagend(this.index);
              if (this.isSpecial) {
                  this.state = State.InSpecialTag;
                  this.sequenceIndex = 0;
              }
              else {
                  this.state = State.Text;
              }
              this.baseState = this.state;
              this.sectionStart = this.index + 1;
          }
          else if (c === CharCodes.Slash) {
              this.state = State.InSelfClosingTag;
          }
          else if (!isWhitespace(c)) {
              this.state = State.InAttributeName;
              this.sectionStart = this.index;
          }
      }
      /** After a `/` inside an opening tag. */
      stateInSelfClosingTag(c) {
          if (c === CharCodes.Gt) {
              this.cbs.onselfclosingtag(this.index);
              this.state = State.Text;
              this.baseState = State.Text;
              this.sectionStart = this.index + 1;
              this.isSpecial = false; // Reset special state, in case of self-closing special tags
          }
          else if (!isWhitespace(c)) {
              this.state = State.BeforeAttributeName;
              this.stateBeforeAttributeName(c);
          }
      }
      /** Inside an attribute name: wait for its end and report it. */
      stateInAttributeName(c) {
          if (c === CharCodes.Eq || isEndOfTagSection(c)) {
              this.cbs.onattribname(this.sectionStart, this.index);
              this.sectionStart = -1;
              this.state = State.AfterAttributeName;
              this.stateAfterAttributeName(c);
          }
      }
      /** After an attribute name: a value follows, or the attribute has none. */
      stateAfterAttributeName(c) {
          if (c === CharCodes.Eq) {
              this.state = State.BeforeAttributeValue;
          }
          else if (c === CharCodes.Slash || c === CharCodes.Gt) {
              this.cbs.onattribend(QuoteType.NoValue, this.index);
              this.state = State.BeforeAttributeName;
              this.stateBeforeAttributeName(c);
          }
          else if (!isWhitespace(c)) {
              this.cbs.onattribend(QuoteType.NoValue, this.index);
              this.state = State.InAttributeName;
              this.sectionStart = this.index;
          }
      }
      /** After `=`: determine how the value is quoted. */
      stateBeforeAttributeValue(c) {
          if (c === CharCodes.DoubleQuote) {
              this.state = State.InAttributeValueDq;
              this.sectionStart = this.index + 1;
          }
          else if (c === CharCodes.SingleQuote) {
              this.state = State.InAttributeValueSq;
              this.sectionStart = this.index + 1;
          }
          else if (!isWhitespace(c)) {
              this.sectionStart = this.index;
              this.state = State.InAttributeValueNq;
              this.stateInAttributeValueNoQuotes(c); // Reconsume token
          }
      }
      /**
       * Shared handling of quoted attribute values.
       *
       * @param c Current character code.
       * @param quote Character code of the quote that closes the value.
       */
      handleInAttributeValue(c, quote) {
          if (c === quote ||
              (!this.decodeEntities && this.fastForwardTo(quote))) {
              this.cbs.onattribdata(this.sectionStart, this.index);
              this.sectionStart = -1;
              this.cbs.onattribend(quote === CharCodes.DoubleQuote
                  ? QuoteType.Double
                  : QuoteType.Single, this.index);
              this.state = State.BeforeAttributeName;
          }
          else if (this.decodeEntities && c === CharCodes.Amp) {
              // Remember where to come back to once the entity is done.
              this.baseState = this.state;
              this.state = State.BeforeEntity;
          }
      }
      stateInAttributeValueDoubleQuotes(c) {
          this.handleInAttributeValue(c, CharCodes.DoubleQuote);
      }
      stateInAttributeValueSingleQuotes(c) {
          this.handleInAttributeValue(c, CharCodes.SingleQuote);
      }
      /** Unquoted attribute value: ends at whitespace or `>`. */
      stateInAttributeValueNoQuotes(c) {
          if (isWhitespace(c) || c === CharCodes.Gt) {
              this.cbs.onattribdata(this.sectionStart, this.index);
              this.sectionStart = -1;
              this.cbs.onattribend(QuoteType.Unquoted, this.index);
              this.state = State.BeforeAttributeName;
              this.stateBeforeAttributeName(c);
          }
          else if (this.decodeEntities && c === CharCodes.Amp) {
              this.baseState = this.state;
              this.state = State.BeforeEntity;
          }
      }
      /** Directly after `<!`. */
      stateBeforeDeclaration(c) {
          if (c === CharCodes.OpeningSquareBracket) {
              this.state = State.CDATASequence;
              this.sequenceIndex = 0;
          }
          else {
              this.state =
                  c === CharCodes.Dash
                      ? State.BeforeComment
                      : State.InDeclaration;
          }
      }
      /** Inside a declaration: report it once `>` is reached. */
      stateInDeclaration(c) {
          if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
              this.cbs.ondeclaration(this.sectionStart, this.index);
              this.state = State.Text;
              this.sectionStart = this.index + 1;
          }
      }
      /** Inside a processing instruction: report it once `>` is reached. */
      stateInProcessingInstruction(c) {
          if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
              this.cbs.onprocessinginstruction(this.sectionStart, this.index);
              this.state = State.Text;
              this.sectionStart = this.index + 1;
          }
      }
      /** After `<!-`: a second dash starts a comment, anything else is a declaration. */
      stateBeforeComment(c) {
          if (c === CharCodes.Dash) {
              this.state = State.InCommentLike;
              this.currentSequence = Sequences.CommentEnd;
              // Allow short comments (eg. <!-->)
              this.sequenceIndex = 2;
              this.sectionStart = this.index + 1;
          }
          else {
              this.state = State.InDeclaration;
              /*
               * Reconsume the character: if it is `>` (as in `<!->`), the
               * declaration ends here instead of swallowing input up to the
               * next `>`.
               */
              this.stateInDeclaration(c);
          }
      }
      /** Closing tag that does not start with a valid name character; reported as a comment. */
      stateInSpecialComment(c) {
          if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
              this.cbs.oncomment(this.sectionStart, this.index, 0);
              this.state = State.Text;
              this.sectionStart = this.index + 1;
          }
      }
      /** Second character of a tag name starting with `s`: `script` or `style`? */
      stateBeforeSpecialS(c) {
          const lower = c | 0x20;
          if (lower === Sequences.ScriptEnd[3]) {
              this.startSpecial(Sequences.ScriptEnd, 4);
          }
          else if (lower === Sequences.StyleEnd[3]) {
              this.startSpecial(Sequences.StyleEnd, 4);
          }
          else {
              this.state = State.InTagName;
              this.stateInTagName(c); // Consume the token again
          }
      }
      /** Directly after `&`. */
      stateBeforeEntity(c) {
          // Start excess with 1 to include the '&'
          this.entityExcess = 1;
          this.entityResult = 0;
          if (c === CharCodes.Num) {
              this.state = State.BeforeNumericEntity;
          }
          else if (c === CharCodes.Amp) {
              // We have two `&` characters in a row. Stay in the current state.
          }
          else {
              this.trieIndex = 0;
              this.trieCurrent = this.entityTrie[0];
              this.state = State.InNamedEntity;
              this.stateInNamedEntity(c);
          }
      }
      /** Walks the entity trie one character at a time. */
      stateInNamedEntity(c) {
          this.entityExcess += 1;
          this.trieIndex = determineBranch(this.entityTrie, this.trieCurrent, this.trieIndex + 1, c);
          if (this.trieIndex < 0) {
              this.emitNamedEntity();
              // Step back so that `c` is processed again in the restored state.
              this.index--;
              return;
          }
          this.trieCurrent = this.entityTrie[this.trieIndex];
          const masked = this.trieCurrent & BinTrieFlags.VALUE_LENGTH;
          // If the branch is a value, store it and continue
          if (masked) {
              // The mask is the number of bytes of the value, including the current byte.
              const valueLength = (masked >> 14) - 1;
              // If we have a legacy entity while parsing strictly, just skip the number of bytes
              if (!this.allowLegacyEntity() && c !== CharCodes.Semi) {
                  this.trieIndex += valueLength;
              }
              else {
                  // Add 1 as we have already incremented the excess
                  const entityStart = this.index - this.entityExcess + 1;
                  if (entityStart > this.sectionStart) {
                      this.emitPartial(this.sectionStart, entityStart);
                  }
                  // If this is a surrogate pair, consume the next two bytes
                  this.entityResult = this.trieIndex;
                  this.trieIndex += valueLength;
                  this.entityExcess = 0;
                  this.sectionStart = this.index + 1;
                  if (valueLength === 0) {
                      this.emitNamedEntity();
                  }
              }
          }
      }
      /** Returns to the base state and emits the last matched named entity, if any. */
      emitNamedEntity() {
          this.state = this.baseState;
          if (this.entityResult === 0) {
              return;
          }
          const valueLength = (this.entityTrie[this.entityResult] & BinTrieFlags.VALUE_LENGTH) >>
              14;
          switch (valueLength) {
              case 1:
                  this.emitCodePoint(this.entityTrie[this.entityResult] &
                      ~BinTrieFlags.VALUE_LENGTH);
                  break;
              case 2:
                  this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
                  break;
              case 3: {
                  this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
                  this.emitCodePoint(this.entityTrie[this.entityResult + 2]);
              }
          }
      }
      /** Directly after `&#`: hexadecimal (`x`/`X`) or decimal. */
      stateBeforeNumericEntity(c) {
          if ((c | 0x20) === CharCodes.LowerX) {
              this.entityExcess++;
              this.state = State.InHexEntity;
          }
          else {
              this.state = State.InNumericEntity;
              this.stateInNumericEntity(c);
          }
      }
      /**
       * Emits the numeric entity read so far, provided it has at least one digit,
       * then returns to the base state.
       *
       * @param strict Whether the entity was terminated by `;`.
       */
      emitNumericEntity(strict) {
          const entityStart = this.index - this.entityExcess - 1;
          const numberStart = entityStart + 2 + Number(this.state === State.InHexEntity);
          if (numberStart !== this.index) {
              // Emit leading data if any
              if (entityStart > this.sectionStart) {
                  this.emitPartial(this.sectionStart, entityStart);
              }
              this.sectionStart = this.index + Number(strict);
              this.emitCodePoint(replaceCodePoint(this.entityResult));
          }
          this.state = this.baseState;
      }
      /** Decimal numeric entity: accumulate digits until `;` or another character. */
      stateInNumericEntity(c) {
          if (c === CharCodes.Semi) {
              this.emitNumericEntity(true);
          }
          else if (isNumber(c)) {
              this.entityResult = this.entityResult * 10 + (c - CharCodes.Zero);
              this.entityExcess++;
          }
          else {
              if (this.allowLegacyEntity()) {
                  this.emitNumericEntity(false);
              }
              else {
                  this.state = this.baseState;
              }
              // Step back so that `c` is processed again in the restored state.
              this.index--;
          }
      }
      /** Hexadecimal numeric entity: accumulate digits until `;` or another character. */
      stateInHexEntity(c) {
          if (c === CharCodes.Semi) {
              this.emitNumericEntity(true);
          }
          else if (isNumber(c)) {
              this.entityResult = this.entityResult * 16 + (c - CharCodes.Zero);
              this.entityExcess++;
          }
          else if (isHexDigit(c)) {
              this.entityResult =
                  this.entityResult * 16 + ((c | 0x20) - CharCodes.LowerA + 10);
              this.entityExcess++;
          }
          else {
              if (this.allowLegacyEntity()) {
                  this.emitNumericEntity(false);
              }
              else {
                  this.state = this.baseState;
              }
              // Step back so that `c` is processed again in the restored state.
              this.index--;
          }
      }
      /** Entities without a trailing `;` are accepted only in non-XML text content. */
      allowLegacyEntity() {
          return (!this.xmlMode &&
              (this.baseState === State.Text ||
                  this.baseState === State.InSpecialTag));
      }
      /**
       * Flushes pending text or attribute data once the buffer has been read, so
       * that no section spans two chunks. The buffer itself is left unchanged.
       */
      cleanup() {
          // If we are inside of text or attributes, emit what we already have.
          if (this.running && this.sectionStart !== this.index) {
              if (this.state === State.Text ||
                  (this.state === State.InSpecialTag && this.sequenceIndex === 0)) {
                  this.cbs.ontext(this.sectionStart, this.index);
                  this.sectionStart = this.index;
              }
              else if (this.state === State.InAttributeValueDq ||
                  this.state === State.InAttributeValueSq ||
                  this.state === State.InAttributeValueNq) {
                  this.cbs.onattribdata(this.sectionStart, this.index);
                  this.sectionStart = this.index;
              }
          }
      }
      /** Whether there is unread input and the tokenizer is not paused. */
      shouldContinue() {
          return this.index < this.buffer.length + this.offset && this.running;
      }
      /**
       * Iterates through the buffer, calling the function corresponding to the current state.
       */
      parse() {
          while (this.shouldContinue()) {
              const c = this.buffer.charCodeAt(this.index - this.offset);
              switch (this.state) {
                  case State.Text:
                      this.stateText(c);
                      break;
                  case State.SpecialStartSequence:
                      this.stateSpecialStartSequence(c);
                      break;
                  case State.InSpecialTag:
                      this.stateInSpecialTag(c);
                      break;
                  case State.CDATASequence:
                      this.stateCDATASequence(c);
                      break;
                  case State.InAttributeValueDq:
                      this.stateInAttributeValueDoubleQuotes(c);
                      break;
                  case State.InAttributeName:
                      this.stateInAttributeName(c);
                      break;
                  case State.InCommentLike:
                      this.stateInCommentLike(c);
                      break;
                  case State.InSpecialComment:
                      this.stateInSpecialComment(c);
                      break;
                  case State.BeforeAttributeName:
                      this.stateBeforeAttributeName(c);
                      break;
                  case State.InTagName:
                      this.stateInTagName(c);
                      break;
                  case State.InClosingTagName:
                      this.stateInClosingTagName(c);
                      break;
                  case State.BeforeTagName:
                      this.stateBeforeTagName(c);
                      break;
                  case State.AfterAttributeName:
                      this.stateAfterAttributeName(c);
                      break;
                  case State.InAttributeValueSq:
                      this.stateInAttributeValueSingleQuotes(c);
                      break;
                  case State.BeforeAttributeValue:
                      this.stateBeforeAttributeValue(c);
                      break;
                  case State.BeforeClosingTagName:
                      this.stateBeforeClosingTagName(c);
                      break;
                  case State.AfterClosingTagName:
                      this.stateAfterClosingTagName(c);
                      break;
                  case State.BeforeSpecialS:
                      this.stateBeforeSpecialS(c);
                      break;
                  case State.InAttributeValueNq:
                      this.stateInAttributeValueNoQuotes(c);
                      break;
                  case State.InSelfClosingTag:
                      this.stateInSelfClosingTag(c);
                      break;
                  case State.InDeclaration:
                      this.stateInDeclaration(c);
                      break;
                  case State.BeforeDeclaration:
                      this.stateBeforeDeclaration(c);
                      break;
                  case State.BeforeComment:
                      this.stateBeforeComment(c);
                      break;
                  case State.InProcessingInstruction:
                      this.stateInProcessingInstruction(c);
                      break;
                  case State.InNamedEntity:
                      this.stateInNamedEntity(c);
                      break;
                  case State.BeforeEntity:
                      this.stateBeforeEntity(c);
                      break;
                  case State.InHexEntity:
                      this.stateInHexEntity(c);
                      break;
                  case State.InNumericEntity:
                      this.stateInNumericEntity(c);
                      break;
                  default:
                      // `this.state === State.BeforeNumericEntity`
                      this.stateBeforeNumericEntity(c);
              }
              this.index++;
          }
          this.cleanup();
      }
      /** Emits whatever is still pending and then calls `onend`. */
      finish() {
          if (this.state === State.InNamedEntity) {
              this.emitNamedEntity();
          }
          // If there is remaining data, emit it in a reasonable way
          if (this.sectionStart < this.index) {
              this.handleTrailingData();
          }
          this.cbs.onend();
      }
      /** Handle any trailing data. */
      handleTrailingData() {
          const endIndex = this.buffer.length + this.offset;
          if (this.state === State.InCommentLike) {
              if (this.currentSequence === Sequences.CdataEnd) {
                  this.cbs.oncdata(this.sectionStart, endIndex, 0);
              }
              else {
                  this.cbs.oncomment(this.sectionStart, endIndex, 0);
              }
          }
          else if ((this.state === State.InNumericEntity ||
              this.state === State.InHexEntity) &&
              this.allowLegacyEntity()) {
              this.emitNumericEntity(false);
              // All trailing data will have been consumed
          }
          else if (this.state === State.InTagName ||
              this.state === State.BeforeAttributeName ||
              this.state === State.BeforeAttributeValue ||
              this.state === State.AfterAttributeName ||
              this.state === State.InAttributeName ||
              this.state === State.InAttributeValueSq ||
              this.state === State.InAttributeValueDq ||
              this.state === State.InAttributeValueNq ||
              this.state === State.InClosingTagName) {
              /*
               * If we are currently in an opening or closing tag, us not calling the
               * respective callback signals that the tag should be ignored.
               */
          }
          else {
              this.cbs.ontext(this.sectionStart, endIndex);
          }
      }
      /** Reports raw data preceding an entity as text or as attribute data, depending on the base state. */
      emitPartial(start, endIndex) {
          if (this.baseState !== State.Text &&
              this.baseState !== State.InSpecialTag) {
              this.cbs.onattribdata(start, endIndex);
          }
          else {
              this.cbs.ontext(start, endIndex);
          }
      }
      /** Reports a decoded code point as a text or as an attribute entity, depending on the base state. */
      emitCodePoint(cp) {
          if (this.baseState !== State.Text &&
              this.baseState !== State.InSpecialTag) {
              this.cbs.onattribentity(cp);
          }
          else {
              this.cbs.ontextentity(cp);
          }
      }
  }
  898. //# sourceMappingURL=Tokenizer.js.map