// Tokenizer.d.ts (5.0 KB)
  /**
   * How an attribute's value was quoted. A value of this type is passed as the
   * `quote` argument of `Callbacks.onattribend`.
   *
   * The numeric initializers are spelled out on purpose: in an ambient
   * (`declare`) non-const enum, a member without an initializer is treated as
   * computed rather than constant.
   *
   * NOTE(review): the per-member descriptions below are inferred from the
   * member names; this declaration file does not state them — confirm against
   * the implementation.
   */
  export declare enum QuoteType {
    /** Presumably: the attribute was given without any value. */
    NoValue = 0,
    /** Presumably: the value was written without surrounding quotes. */
    Unquoted = 1,
    /** Presumably: the value was wrapped in single quotes. */
    Single = 2,
    /** Presumably: the value was wrapped in double quotes. */
    Double = 3
  }
  /**
   * Event handlers supplied to the tokenizer: an object of this shape is the
   * second constructor argument (`cbs`) of `Tokenizer`.
   *
   * Every handler returns `void`. The `start` / `endIndex` / `endOffset`
   * parameters are plain numbers.
   *
   * NOTE(review): they look like positions within the written input (compare
   * `Tokenizer.getIndex` and `Tokenizer.getSectionStart`), but this declaration
   * file does not say whether bounds are inclusive or exclusive, nor what
   * `endOffset` is measured from — confirm against the implementation. The
   * per-handler notes are derived from the handler and parameter names only.
   */
  export interface Callbacks {
    /** A run of attribute-value data between `start` and `endIndex`. */
    onattribdata(start: number, endIndex: number): void;
    /** An entity met inside an attribute value, reported as a code point. */
    onattribentity(codepoint: number): void;
    /** End of an attribute; `quote` reports how its value was quoted. */
    onattribend(quote: QuoteType, endIndex: number): void;
    /** An attribute name between `start` and `endIndex`. */
    onattribname(start: number, endIndex: number): void;
    /** A CDATA section. */
    oncdata(start: number, endIndex: number, endOffset: number): void;
    /** A closing tag. */
    onclosetag(start: number, endIndex: number): void;
    /** A comment. */
    oncomment(start: number, endIndex: number, endOffset: number): void;
    /** A declaration. */
    ondeclaration(start: number, endIndex: number): void;
    /** Signals the end of the input; takes no arguments. */
    onend(): void;
    /** End of an opening tag. */
    onopentagend(endIndex: number): void;
    /** The name of an opening tag. */
    onopentagname(start: number, endIndex: number): void;
    /** A processing instruction. */
    onprocessinginstruction(start: number, endIndex: number): void;
    /** End of a self-closing tag. */
    onselfclosingtag(endIndex: number): void;
    /** A run of text between `start` and `endIndex`. */
    ontext(start: number, endIndex: number): void;
    /** An entity met inside text, reported as a code point. */
    ontextentity(codepoint: number): void;
  }
  /**
   * Type declaration of the tokenizer class (default export).
   *
   * The public surface is the constructor, `reset`, `write`, `end`, `pause`,
   * `resume`, `getIndex`, `getSectionStart` and the `running` flag. Results are
   * reported through the `Callbacks` object given to the constructor.
   *
   * Private members are listed without types, as is usual for generated
   * declaration files; they are named here only so that subclasses cannot
   * redeclare them.
   */
  export default class Tokenizer {
    private readonly cbs;
    /** The current state the tokenizer is in. */
    private state;
    /** The read buffer. */
    private buffer;
    /** The beginning of the section that is currently being read. */
    private sectionStart;
    /** The index within the buffer that we are currently looking at. */
    private index;
    /**
     * Some behavior, e.g. when decoding entities, is done while we are in
     * another state. This keeps track of the other state type.
     */
    private baseState;
    /** For special parsing behavior inside of script and style tags. */
    private isSpecial;
    /**
     * Indicates whether the tokenizer has been paused.
     *
     * NOTE(review): given the name, this is presumably `false` while paused —
     * confirm against the implementation.
     */
    running: boolean;
    /** The offset of the current buffer. */
    private offset;
    private readonly xmlMode;
    private readonly decodeEntities;
    private readonly entityTrie;
    /**
     * @param options - Optional `xmlMode` and `decodeEntities` flags. Both may
     *   be omitted; their defaults are not visible in this declaration.
     * @param cbs - Handlers that receive the tokenizer's events.
     */
    constructor({ xmlMode, decodeEntities, }: {
      xmlMode?: boolean;
      decodeEntities?: boolean;
    }, cbs: Callbacks);
    /** Resets the tokenizer (presumably to its initial state — confirm). */
    reset(): void;
    /**
     * Hands a chunk of input to the tokenizer.
     *
     * @param chunk - The next piece of input text.
     */
    write(chunk: string): void;
    /** Signals that no more input will be written. */
    end(): void;
    /** Pauses the tokenizer; see `running` and `resume`. */
    pause(): void;
    /** Resumes a paused tokenizer; see `running` and `pause`. */
    resume(): void;
    /**
     * The current index within all of the written data.
     */
    getIndex(): number;
    /**
     * The start of the current section.
     */
    getSectionStart(): number;
    private stateText;
    private currentSequence;
    private sequenceIndex;
    private stateSpecialStartSequence;
    /** Look for an end tag. For <title> tags, also decode entities. */
    private stateInSpecialTag;
    private stateCDATASequence;
    /**
     * When we wait for one specific character, we can speed things up
     * by skipping through the buffer until we find it.
     *
     * @returns Whether the character was found.
     */
    private fastForwardTo;
    /**
     * Comments and CDATA end with `-->` and `]]>`.
     *
     * Their common qualities are:
     * - Their end sequences have a distinct character they start with.
     * - That character is then repeated, so we have to check multiple repeats.
     * - All characters but the start character of the sequence can be skipped.
     */
    private stateInCommentLike;
    /**
     * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
     *
     * XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar).
     * We allow anything that wouldn't end the tag.
     */
    private isTagStartChar;
    private startSpecial;
    private stateBeforeTagName;
    private stateInTagName;
    private stateBeforeClosingTagName;
    private stateInClosingTagName;
    private stateAfterClosingTagName;
    private stateBeforeAttributeName;
    private stateInSelfClosingTag;
    private stateInAttributeName;
    private stateAfterAttributeName;
    private stateBeforeAttributeValue;
    private handleInAttributeValue;
    private stateInAttributeValueDoubleQuotes;
    private stateInAttributeValueSingleQuotes;
    private stateInAttributeValueNoQuotes;
    private stateBeforeDeclaration;
    private stateInDeclaration;
    private stateInProcessingInstruction;
    private stateBeforeComment;
    private stateInSpecialComment;
    private stateBeforeSpecialS;
    private trieIndex;
    private trieCurrent;
    /** For named entities, the index of the value. For numeric entities, the code point. */
    private entityResult;
    private entityExcess;
    private stateBeforeEntity;
    private stateInNamedEntity;
    private emitNamedEntity;
    private stateBeforeNumericEntity;
    private emitNumericEntity;
    private stateInNumericEntity;
    private stateInHexEntity;
    private allowLegacyEntity;
    /**
     * Remove data that has already been consumed from the buffer.
     */
    private cleanup;
    private shouldContinue;
    /**
     * Iterates through the buffer, calling the function corresponding to the current state.
     *
     * States that are more likely to be hit are higher up, as a performance improvement.
     */
    private parse;
    private finish;
    /** Handle any trailing data. */
    private handleTrailingData;
    private emitPartial;
    private emitCodePoint;
  }
  //# sourceMappingURL=Tokenizer.d.ts.map