index.js 78 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196
  1. 'use strict';
  2. const Preprocessor = require('./preprocessor');
  3. const unicode = require('../common/unicode');
  4. const neTree = require('./named-entity-data');
  5. const ERR = require('../common/error-codes');
  6. //Aliases
  7. const $ = unicode.CODE_POINTS;
  8. const $$ = unicode.CODE_POINT_SEQUENCES;
  //C1 Unicode control character reference replacements
  //Lookup table keyed by a code point in the C1 control range (0x80-0x9f); the value is the
  //code point to substitute for it.
  //NOTE: 0x81, 0x8d, 0x8f, 0x90 and 0x9d have no entry, so a lookup for them yields undefined.
  //NOTE(review): the table is not read anywhere in the visible part of this file; presumably it is
  //consulted when a numeric character reference is resolved - confirm against the later states.
  const C1_CONTROLS_REFERENCE_REPLACEMENTS = {
  0x80: 0x20ac, // EURO SIGN
  0x82: 0x201a, // SINGLE LOW-9 QUOTATION MARK
  0x83: 0x0192, // LATIN SMALL LETTER F WITH HOOK
  0x84: 0x201e, // DOUBLE LOW-9 QUOTATION MARK
  0x85: 0x2026, // HORIZONTAL ELLIPSIS
  0x86: 0x2020, // DAGGER
  0x87: 0x2021, // DOUBLE DAGGER
  0x88: 0x02c6, // MODIFIER LETTER CIRCUMFLEX ACCENT
  0x89: 0x2030, // PER MILLE SIGN
  0x8a: 0x0160, // LATIN CAPITAL LETTER S WITH CARON
  0x8b: 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  0x8c: 0x0152, // LATIN CAPITAL LIGATURE OE
  0x8e: 0x017d, // LATIN CAPITAL LETTER Z WITH CARON
  0x91: 0x2018, // LEFT SINGLE QUOTATION MARK
  0x92: 0x2019, // RIGHT SINGLE QUOTATION MARK
  0x93: 0x201c, // LEFT DOUBLE QUOTATION MARK
  0x94: 0x201d, // RIGHT DOUBLE QUOTATION MARK
  0x95: 0x2022, // BULLET
  0x96: 0x2013, // EN DASH
  0x97: 0x2014, // EM DASH
  0x98: 0x02dc, // SMALL TILDE
  0x99: 0x2122, // TRADE MARK SIGN
  0x9a: 0x0161, // LATIN SMALL LETTER S WITH CARON
  0x9b: 0x203a, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  0x9c: 0x0153, // LATIN SMALL LIGATURE OE
  0x9e: 0x017e, // LATIN SMALL LETTER Z WITH CARON
  0x9f: 0x0178 // LATIN CAPITAL LETTER Y WITH DIAERESIS
  };
  // Named entity tree flags
  // Bit flags stored in the marker cell of a named-entity tree node (see neTree).
  const HAS_DATA_FLAG = 0b001; // the node carries replacement code point(s)
  const DATA_DUPLET_FLAG = 0b010; // the replacement is two code points rather than one
  const HAS_BRANCHES_FLAG = 0b100; // the node is followed by a branch table
  // All three flags combined. _matchNamedCharacterReference treats a tree cell whose value is
  // below this limit as a node marker and any other cell as a plain code point.
  const MAX_BRANCH_MARKER_VALUE = HAS_BRANCHES_FLAG | DATA_DUPLET_FLAG | HAS_DATA_FLAG;
  //States
  //Each constant holds the string name of one tokenizer state. The Tokenizer class defines one
  //method per state under a computed key of the same name (e.g. `[DATA_STATE](cp)`), and
  //getNextToken() dispatches to the current one with `this[this.state](cp)`.

  //Content states
  const DATA_STATE = 'DATA_STATE';
  const RCDATA_STATE = 'RCDATA_STATE';
  const RAWTEXT_STATE = 'RAWTEXT_STATE';
  const SCRIPT_DATA_STATE = 'SCRIPT_DATA_STATE';
  const PLAINTEXT_STATE = 'PLAINTEXT_STATE';
  //Tag states
  const TAG_OPEN_STATE = 'TAG_OPEN_STATE';
  const END_TAG_OPEN_STATE = 'END_TAG_OPEN_STATE';
  const TAG_NAME_STATE = 'TAG_NAME_STATE';
  //RCDATA end tag detection
  const RCDATA_LESS_THAN_SIGN_STATE = 'RCDATA_LESS_THAN_SIGN_STATE';
  const RCDATA_END_TAG_OPEN_STATE = 'RCDATA_END_TAG_OPEN_STATE';
  const RCDATA_END_TAG_NAME_STATE = 'RCDATA_END_TAG_NAME_STATE';
  //RAWTEXT end tag detection
  const RAWTEXT_LESS_THAN_SIGN_STATE = 'RAWTEXT_LESS_THAN_SIGN_STATE';
  const RAWTEXT_END_TAG_OPEN_STATE = 'RAWTEXT_END_TAG_OPEN_STATE';
  const RAWTEXT_END_TAG_NAME_STATE = 'RAWTEXT_END_TAG_NAME_STATE';
  //Script data, including the escaped and double escaped variants
  const SCRIPT_DATA_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_LESS_THAN_SIGN_STATE';
  const SCRIPT_DATA_END_TAG_OPEN_STATE = 'SCRIPT_DATA_END_TAG_OPEN_STATE';
  const SCRIPT_DATA_END_TAG_NAME_STATE = 'SCRIPT_DATA_END_TAG_NAME_STATE';
  const SCRIPT_DATA_ESCAPE_START_STATE = 'SCRIPT_DATA_ESCAPE_START_STATE';
  const SCRIPT_DATA_ESCAPE_START_DASH_STATE = 'SCRIPT_DATA_ESCAPE_START_DASH_STATE';
  const SCRIPT_DATA_ESCAPED_STATE = 'SCRIPT_DATA_ESCAPED_STATE';
  const SCRIPT_DATA_ESCAPED_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_STATE';
  const SCRIPT_DATA_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_DASH_STATE';
  const SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE';
  const SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE';
  const SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE';
  const SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE';
  const SCRIPT_DATA_DOUBLE_ESCAPED_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_STATE';
  const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE';
  const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE';
  const SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE';
  const SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE';
  //Attributes
  const BEFORE_ATTRIBUTE_NAME_STATE = 'BEFORE_ATTRIBUTE_NAME_STATE';
  const ATTRIBUTE_NAME_STATE = 'ATTRIBUTE_NAME_STATE';
  const AFTER_ATTRIBUTE_NAME_STATE = 'AFTER_ATTRIBUTE_NAME_STATE';
  const BEFORE_ATTRIBUTE_VALUE_STATE = 'BEFORE_ATTRIBUTE_VALUE_STATE';
  const ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE';
  const ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE';
  const ATTRIBUTE_VALUE_UNQUOTED_STATE = 'ATTRIBUTE_VALUE_UNQUOTED_STATE';
  const AFTER_ATTRIBUTE_VALUE_QUOTED_STATE = 'AFTER_ATTRIBUTE_VALUE_QUOTED_STATE';
  const SELF_CLOSING_START_TAG_STATE = 'SELF_CLOSING_START_TAG_STATE';
  //Comments and markup declarations
  const BOGUS_COMMENT_STATE = 'BOGUS_COMMENT_STATE';
  const MARKUP_DECLARATION_OPEN_STATE = 'MARKUP_DECLARATION_OPEN_STATE';
  const COMMENT_START_STATE = 'COMMENT_START_STATE';
  const COMMENT_START_DASH_STATE = 'COMMENT_START_DASH_STATE';
  const COMMENT_STATE = 'COMMENT_STATE';
  const COMMENT_LESS_THAN_SIGN_STATE = 'COMMENT_LESS_THAN_SIGN_STATE';
  const COMMENT_LESS_THAN_SIGN_BANG_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_STATE';
  const COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE';
  const COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE';
  const COMMENT_END_DASH_STATE = 'COMMENT_END_DASH_STATE';
  const COMMENT_END_STATE = 'COMMENT_END_STATE';
  const COMMENT_END_BANG_STATE = 'COMMENT_END_BANG_STATE';
  //DOCTYPE
  const DOCTYPE_STATE = 'DOCTYPE_STATE';
  const BEFORE_DOCTYPE_NAME_STATE = 'BEFORE_DOCTYPE_NAME_STATE';
  const DOCTYPE_NAME_STATE = 'DOCTYPE_NAME_STATE';
  const AFTER_DOCTYPE_NAME_STATE = 'AFTER_DOCTYPE_NAME_STATE';
  const AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE = 'AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE';
  const BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE';
  const DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE';
  const DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE';
  const AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE';
  const BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 'BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE';
  const AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE = 'AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE';
  const BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE';
  const DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE';
  const DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE';
  const AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE';
  const BOGUS_DOCTYPE_STATE = 'BOGUS_DOCTYPE_STATE';
  //CDATA sections
  const CDATA_SECTION_STATE = 'CDATA_SECTION_STATE';
  const CDATA_SECTION_BRACKET_STATE = 'CDATA_SECTION_BRACKET_STATE';
  const CDATA_SECTION_END_STATE = 'CDATA_SECTION_END_STATE';
  //Character references
  const CHARACTER_REFERENCE_STATE = 'CHARACTER_REFERENCE_STATE';
  const NAMED_CHARACTER_REFERENCE_STATE = 'NAMED_CHARACTER_REFERENCE_STATE';
  //NOTE: the string value below is spelled 'AMBIGUOS' (no second 'U'), unlike the identifier.
  //It is a runtime string, so it is left untouched here.
  const AMBIGUOUS_AMPERSAND_STATE = 'AMBIGUOS_AMPERSAND_STATE';
  const NUMERIC_CHARACTER_REFERENCE_STATE = 'NUMERIC_CHARACTER_REFERENCE_STATE';
  //NOTE(review): 'HEXADEMICAL' is presumably a misspelling of 'hexadecimal'; the identifiers are
  //kept as they are because code outside this chunk may refer to them.
  const HEXADEMICAL_CHARACTER_REFERENCE_START_STATE = 'HEXADEMICAL_CHARACTER_REFERENCE_START_STATE';
  const DECIMAL_CHARACTER_REFERENCE_START_STATE = 'DECIMAL_CHARACTER_REFERENCE_START_STATE';
  const HEXADEMICAL_CHARACTER_REFERENCE_STATE = 'HEXADEMICAL_CHARACTER_REFERENCE_STATE';
  const DECIMAL_CHARACTER_REFERENCE_STATE = 'DECIMAL_CHARACTER_REFERENCE_STATE';
  const NUMERIC_CHARACTER_REFERENCE_END_STATE = 'NUMERIC_CHARACTER_REFERENCE_END_STATE';
  125. //Utils
  //OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline
  //these functions if they are located in another module, because of the context switch.
  //Always perform an inlining check before modifying these functions ('node --trace-inlining').
  // Tells whether `cp` is one of the four code points this tokenizer treats as whitespace:
  // space, line feed, tabulation or form feed.
  function isWhitespace(cp) {
    switch (cp) {
      case $.SPACE:
      case $.LINE_FEED:
      case $.TABULATION:
      case $.FORM_FEED:
        return true;
      default:
        return false;
    }
  }
  // Tells whether `cp` lies in the inclusive range $.DIGIT_0..$.DIGIT_9.
  function isAsciiDigit(cp) {
    const atLeastZero = $.DIGIT_0 <= cp;

    return atLeastZero && $.DIGIT_9 >= cp;
  }
  // Tells whether `cp` lies in the inclusive range $.LATIN_CAPITAL_A..$.LATIN_CAPITAL_Z.
  function isAsciiUpper(cp) {
    const atLeastA = $.LATIN_CAPITAL_A <= cp;

    return atLeastA && $.LATIN_CAPITAL_Z >= cp;
  }
  // Tells whether `cp` lies in the inclusive range $.LATIN_SMALL_A..$.LATIN_SMALL_Z.
  function isAsciiLower(cp) {
    const atLeastA = $.LATIN_SMALL_A <= cp;

    return atLeastA && $.LATIN_SMALL_Z >= cp;
  }
  // Tells whether `cp` is accepted by isAsciiLower() or by isAsciiUpper().
  function isAsciiLetter(cp) {
    if (isAsciiLower(cp)) {
      return true;
    }

    return isAsciiUpper(cp);
  }
  // Tells whether `cp` is accepted by isAsciiLetter() or by isAsciiDigit().
  function isAsciiAlphaNumeric(cp) {
    if (isAsciiLetter(cp)) {
      return true;
    }

    return isAsciiDigit(cp);
  }
  // Tells whether `cp` lies in the inclusive range $.LATIN_CAPITAL_A..$.LATIN_CAPITAL_F.
  function isAsciiUpperHexDigit(cp) {
    const atLeastA = $.LATIN_CAPITAL_A <= cp;

    return atLeastA && $.LATIN_CAPITAL_F >= cp;
  }
  // Tells whether `cp` lies in the inclusive range $.LATIN_SMALL_A..$.LATIN_SMALL_F.
  function isAsciiLowerHexDigit(cp) {
    const atLeastA = $.LATIN_SMALL_A <= cp;

    return atLeastA && $.LATIN_SMALL_F >= cp;
  }
  // Tells whether `cp` is accepted by isAsciiDigit(), isAsciiUpperHexDigit() or
  // isAsciiLowerHexDigit(); the checks run in that order.
  function isAsciiHexDigit(cp) {
    if (isAsciiDigit(cp)) {
      return true;
    }

    if (isAsciiUpperHexDigit(cp)) {
      return true;
    }

    return isAsciiLowerHexDigit(cp);
  }
  // Adds 0x20 to `cp`, which turns an ASCII upper-case letter into its lower-case counterpart.
  // No range check is done here: any other input is shifted by 0x20 just the same.
  function toAsciiLowerCodePoint(cp) {
    const caseOffset = 0x20;

    return cp + caseOffset;
  }
  //NOTE: String.fromCharCode() treats every argument as a single UTF-16 code unit, so a code point
  //above 0xffff has to be split into a surrogate pair by hand.
  //(see: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromCharCode)
  // Converts the code point `cp` to a string of one or two UTF-16 code units.
  function toChar(cp) {
    const fitsInOneUnit = cp <= 0xffff;

    if (!fitsInOneUnit) {
      const offset = cp - 0x10000;
      const highSurrogate = 0xd800 | ((offset >>> 10) & 0x3ff);
      const lowSurrogate = 0xdc00 | (offset & 0x3ff);

      return String.fromCharCode(highSurrogate, lowSurrogate);
    }

    return String.fromCharCode(cp);
  }
  // Returns the one-character string for `cp` after it went through toAsciiLowerCodePoint().
  function toAsciiLowerChar(cp) {
    const loweredCp = toAsciiLowerCodePoint(cp);

    return String.fromCharCode(loweredCp);
  }
  // Binary-searches the branch table of the tree node at `nodeIx` for the code point `cp`.
  // Layout read from neTree, relative to the node marker at `nodeIx`:
  //   [nodeIx + 1]            number of branches
  //   next `branchCount` cells    branch code points (the search relies on ascending order)
  //   next `branchCount` cells    one value per branch, in the same order
  // Returns the value paired with `cp`, or -1 when no branch matches.
  function findNamedEntityTreeBranch(nodeIx, cp) {
    const branchCount = neTree[nodeIx + 1];
    const firstKeyIx = nodeIx + 2;
    let left = firstKeyIx;
    let right = firstKeyIx + branchCount - 1;

    while (left <= right) {
      const probeIx = (left + right) >>> 1;
      const probeCp = neTree[probeIx];

      if (probeCp > cp) {
        right = probeIx - 1;
      } else if (probeCp < cp) {
        left = probeIx + 1;
      } else {
        // The paired value sits `branchCount` cells after its key.
        return neTree[probeIx + branchCount];
      }
    }

    return -1;
  }
  189. //Tokenizer
  190. class Tokenizer {
  191. constructor() {
  192. this.preprocessor = new Preprocessor();
  193. this.tokenQueue = [];
  194. this.allowCDATA = false;
  195. this.state = DATA_STATE;
  196. this.returnState = '';
  197. this.charRefCode = -1;
  198. this.tempBuff = [];
  199. this.lastStartTagName = '';
  200. this.consumedAfterSnapshot = -1;
  201. this.active = false;
  202. this.currentCharacterToken = null;
  203. this.currentToken = null;
  204. this.currentAttr = null;
  205. }
  206. //Errors
  207. _err() {
  208. // NOTE: err reporting is noop by default. Enabled by mixin.
  209. }
  210. _errOnNextCodePoint(err) {
  211. this._consume();
  212. this._err(err);
  213. this._unconsume();
  214. }
  215. //API
  216. getNextToken() {
  217. while (!this.tokenQueue.length && this.active) {
  218. this.consumedAfterSnapshot = 0;
  219. const cp = this._consume();
  220. if (!this._ensureHibernation()) {
  221. this[this.state](cp);
  222. }
  223. }
  224. return this.tokenQueue.shift();
  225. }
  226. write(chunk, isLastChunk) {
  227. this.active = true;
  228. this.preprocessor.write(chunk, isLastChunk);
  229. }
  230. insertHtmlAtCurrentPos(chunk) {
  231. this.active = true;
  232. this.preprocessor.insertHtmlAtCurrentPos(chunk);
  233. }
  234. //Hibernation
  235. _ensureHibernation() {
  236. if (this.preprocessor.endOfChunkHit) {
  237. for (; this.consumedAfterSnapshot > 0; this.consumedAfterSnapshot--) {
  238. this.preprocessor.retreat();
  239. }
  240. this.active = false;
  241. this.tokenQueue.push({ type: Tokenizer.HIBERNATION_TOKEN });
  242. return true;
  243. }
  244. return false;
  245. }
  246. //Consumption
  247. _consume() {
  248. this.consumedAfterSnapshot++;
  249. return this.preprocessor.advance();
  250. }
  251. _unconsume() {
  252. this.consumedAfterSnapshot--;
  253. this.preprocessor.retreat();
  254. }
  255. _reconsumeInState(state) {
  256. this.state = state;
  257. this._unconsume();
  258. }
  259. _consumeSequenceIfMatch(pattern, startCp, caseSensitive) {
  260. let consumedCount = 0;
  261. let isMatch = true;
  262. const patternLength = pattern.length;
  263. let patternPos = 0;
  264. let cp = startCp;
  265. let patternCp = void 0;
  266. for (; patternPos < patternLength; patternPos++) {
  267. if (patternPos > 0) {
  268. cp = this._consume();
  269. consumedCount++;
  270. }
  271. if (cp === $.EOF) {
  272. isMatch = false;
  273. break;
  274. }
  275. patternCp = pattern[patternPos];
  276. if (cp !== patternCp && (caseSensitive || cp !== toAsciiLowerCodePoint(patternCp))) {
  277. isMatch = false;
  278. break;
  279. }
  280. }
  281. if (!isMatch) {
  282. while (consumedCount--) {
  283. this._unconsume();
  284. }
  285. }
  286. return isMatch;
  287. }
  288. //Temp buffer
  289. _isTempBufferEqualToScriptString() {
  290. if (this.tempBuff.length !== $$.SCRIPT_STRING.length) {
  291. return false;
  292. }
  293. for (let i = 0; i < this.tempBuff.length; i++) {
  294. if (this.tempBuff[i] !== $$.SCRIPT_STRING[i]) {
  295. return false;
  296. }
  297. }
  298. return true;
  299. }
  300. //Token creation
  301. _createStartTagToken() {
  302. this.currentToken = {
  303. type: Tokenizer.START_TAG_TOKEN,
  304. tagName: '',
  305. selfClosing: false,
  306. ackSelfClosing: false,
  307. attrs: []
  308. };
  309. }
  310. _createEndTagToken() {
  311. this.currentToken = {
  312. type: Tokenizer.END_TAG_TOKEN,
  313. tagName: '',
  314. selfClosing: false,
  315. attrs: []
  316. };
  317. }
  318. _createCommentToken() {
  319. this.currentToken = {
  320. type: Tokenizer.COMMENT_TOKEN,
  321. data: ''
  322. };
  323. }
  324. _createDoctypeToken(initialName) {
  325. this.currentToken = {
  326. type: Tokenizer.DOCTYPE_TOKEN,
  327. name: initialName,
  328. forceQuirks: false,
  329. publicId: null,
  330. systemId: null
  331. };
  332. }
  333. _createCharacterToken(type, ch) {
  334. this.currentCharacterToken = {
  335. type: type,
  336. chars: ch
  337. };
  338. }
  339. _createEOFToken() {
  340. this.currentToken = { type: Tokenizer.EOF_TOKEN };
  341. }
  342. //Tag attributes
  343. _createAttr(attrNameFirstCh) {
  344. this.currentAttr = {
  345. name: attrNameFirstCh,
  346. value: ''
  347. };
  348. }
  349. _leaveAttrName(toState) {
  350. if (Tokenizer.getTokenAttr(this.currentToken, this.currentAttr.name) === null) {
  351. this.currentToken.attrs.push(this.currentAttr);
  352. } else {
  353. this._err(ERR.duplicateAttribute);
  354. }
  355. this.state = toState;
  356. }
  357. _leaveAttrValue(toState) {
  358. this.state = toState;
  359. }
  360. //Token emission
  361. _emitCurrentToken() {
  362. this._emitCurrentCharacterToken();
  363. const ct = this.currentToken;
  364. this.currentToken = null;
  365. //NOTE: store emited start tag's tagName to determine is the following end tag token is appropriate.
  366. if (ct.type === Tokenizer.START_TAG_TOKEN) {
  367. this.lastStartTagName = ct.tagName;
  368. } else if (ct.type === Tokenizer.END_TAG_TOKEN) {
  369. if (ct.attrs.length > 0) {
  370. this._err(ERR.endTagWithAttributes);
  371. }
  372. if (ct.selfClosing) {
  373. this._err(ERR.endTagWithTrailingSolidus);
  374. }
  375. }
  376. this.tokenQueue.push(ct);
  377. }
  378. _emitCurrentCharacterToken() {
  379. if (this.currentCharacterToken) {
  380. this.tokenQueue.push(this.currentCharacterToken);
  381. this.currentCharacterToken = null;
  382. }
  383. }
  384. _emitEOFToken() {
  385. this._createEOFToken();
  386. this._emitCurrentToken();
  387. }
  388. //Characters emission
  //OPTIMIZATION: the specification uses only one type of character token (one token per character).
  //This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters.
  //If we have a sequence of characters that belong to the same group, the parser can process it
  //as a single solid character token.
  //So, there are 3 types of character tokens in parse5:
  //1)NULL_CHARACTER_TOKEN - \u0000-character sequences (e.g. '\u0000\u0000\u0000')
  //2)WHITESPACE_CHARACTER_TOKEN - any whitespace/new-line character sequences (e.g. '\n \r\t \f')
  //3)CHARACTER_TOKEN - any character sequence which doesn't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
  397. _appendCharToCurrentCharacterToken(type, ch) {
  398. if (this.currentCharacterToken && this.currentCharacterToken.type !== type) {
  399. this._emitCurrentCharacterToken();
  400. }
  401. if (this.currentCharacterToken) {
  402. this.currentCharacterToken.chars += ch;
  403. } else {
  404. this._createCharacterToken(type, ch);
  405. }
  406. }
  407. _emitCodePoint(cp) {
  408. let type = Tokenizer.CHARACTER_TOKEN;
  409. if (isWhitespace(cp)) {
  410. type = Tokenizer.WHITESPACE_CHARACTER_TOKEN;
  411. } else if (cp === $.NULL) {
  412. type = Tokenizer.NULL_CHARACTER_TOKEN;
  413. }
  414. this._appendCharToCurrentCharacterToken(type, toChar(cp));
  415. }
  416. _emitSeveralCodePoints(codePoints) {
  417. for (let i = 0; i < codePoints.length; i++) {
  418. this._emitCodePoint(codePoints[i]);
  419. }
  420. }
  //NOTE: used when we emit a character explicitly. This is always a non-whitespace and a non-null character.
  //So we can avoid additional checks here.
  423. _emitChars(ch) {
  424. this._appendCharToCurrentCharacterToken(Tokenizer.CHARACTER_TOKEN, ch);
  425. }
  426. // Character reference helpers
  427. _matchNamedCharacterReference(startCp) {
  428. let result = null;
  429. let excess = 1;
  430. let i = findNamedEntityTreeBranch(0, startCp);
  431. this.tempBuff.push(startCp);
  432. while (i > -1) {
  433. const current = neTree[i];
  434. const inNode = current < MAX_BRANCH_MARKER_VALUE;
  435. const nodeWithData = inNode && current & HAS_DATA_FLAG;
  436. if (nodeWithData) {
  437. //NOTE: we use greedy search, so we continue lookup at this point
  438. result = current & DATA_DUPLET_FLAG ? [neTree[++i], neTree[++i]] : [neTree[++i]];
  439. excess = 0;
  440. }
  441. const cp = this._consume();
  442. this.tempBuff.push(cp);
  443. excess++;
  444. if (cp === $.EOF) {
  445. break;
  446. }
  447. if (inNode) {
  448. i = current & HAS_BRANCHES_FLAG ? findNamedEntityTreeBranch(i, cp) : -1;
  449. } else {
  450. i = cp === current ? ++i : -1;
  451. }
  452. }
  453. while (excess--) {
  454. this.tempBuff.pop();
  455. this._unconsume();
  456. }
  457. return result;
  458. }
  459. _isCharacterReferenceInAttribute() {
  460. return (
  461. this.returnState === ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ||
  462. this.returnState === ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ||
  463. this.returnState === ATTRIBUTE_VALUE_UNQUOTED_STATE
  464. );
  465. }
  466. _isCharacterReferenceAttributeQuirk(withSemicolon) {
  467. if (!withSemicolon && this._isCharacterReferenceInAttribute()) {
  468. const nextCp = this._consume();
  469. this._unconsume();
  470. return nextCp === $.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp);
  471. }
  472. return false;
  473. }
  474. _flushCodePointsConsumedAsCharacterReference() {
  475. if (this._isCharacterReferenceInAttribute()) {
  476. for (let i = 0; i < this.tempBuff.length; i++) {
  477. this.currentAttr.value += toChar(this.tempBuff[i]);
  478. }
  479. } else {
  480. this._emitSeveralCodePoints(this.tempBuff);
  481. }
  482. this.tempBuff = [];
  483. }
  484. // State machine
  485. // Data state
  486. //------------------------------------------------------------------
  487. [DATA_STATE](cp) {
  488. this.preprocessor.dropParsedChunk();
  489. if (cp === $.LESS_THAN_SIGN) {
  490. this.state = TAG_OPEN_STATE;
  491. } else if (cp === $.AMPERSAND) {
  492. this.returnState = DATA_STATE;
  493. this.state = CHARACTER_REFERENCE_STATE;
  494. } else if (cp === $.NULL) {
  495. this._err(ERR.unexpectedNullCharacter);
  496. this._emitCodePoint(cp);
  497. } else if (cp === $.EOF) {
  498. this._emitEOFToken();
  499. } else {
  500. this._emitCodePoint(cp);
  501. }
  502. }
  503. // RCDATA state
  504. //------------------------------------------------------------------
  505. [RCDATA_STATE](cp) {
  506. this.preprocessor.dropParsedChunk();
  507. if (cp === $.AMPERSAND) {
  508. this.returnState = RCDATA_STATE;
  509. this.state = CHARACTER_REFERENCE_STATE;
  510. } else if (cp === $.LESS_THAN_SIGN) {
  511. this.state = RCDATA_LESS_THAN_SIGN_STATE;
  512. } else if (cp === $.NULL) {
  513. this._err(ERR.unexpectedNullCharacter);
  514. this._emitChars(unicode.REPLACEMENT_CHARACTER);
  515. } else if (cp === $.EOF) {
  516. this._emitEOFToken();
  517. } else {
  518. this._emitCodePoint(cp);
  519. }
  520. }
  521. // RAWTEXT state
  522. //------------------------------------------------------------------
  523. [RAWTEXT_STATE](cp) {
  524. this.preprocessor.dropParsedChunk();
  525. if (cp === $.LESS_THAN_SIGN) {
  526. this.state = RAWTEXT_LESS_THAN_SIGN_STATE;
  527. } else if (cp === $.NULL) {
  528. this._err(ERR.unexpectedNullCharacter);
  529. this._emitChars(unicode.REPLACEMENT_CHARACTER);
  530. } else if (cp === $.EOF) {
  531. this._emitEOFToken();
  532. } else {
  533. this._emitCodePoint(cp);
  534. }
  535. }
  536. // Script data state
  537. //------------------------------------------------------------------
  538. [SCRIPT_DATA_STATE](cp) {
  539. this.preprocessor.dropParsedChunk();
  540. if (cp === $.LESS_THAN_SIGN) {
  541. this.state = SCRIPT_DATA_LESS_THAN_SIGN_STATE;
  542. } else if (cp === $.NULL) {
  543. this._err(ERR.unexpectedNullCharacter);
  544. this._emitChars(unicode.REPLACEMENT_CHARACTER);
  545. } else if (cp === $.EOF) {
  546. this._emitEOFToken();
  547. } else {
  548. this._emitCodePoint(cp);
  549. }
  550. }
  551. // PLAINTEXT state
  552. //------------------------------------------------------------------
  553. [PLAINTEXT_STATE](cp) {
  554. this.preprocessor.dropParsedChunk();
  555. if (cp === $.NULL) {
  556. this._err(ERR.unexpectedNullCharacter);
  557. this._emitChars(unicode.REPLACEMENT_CHARACTER);
  558. } else if (cp === $.EOF) {
  559. this._emitEOFToken();
  560. } else {
  561. this._emitCodePoint(cp);
  562. }
  563. }
  564. // Tag open state
  565. //------------------------------------------------------------------
  566. [TAG_OPEN_STATE](cp) {
  567. if (cp === $.EXCLAMATION_MARK) {
  568. this.state = MARKUP_DECLARATION_OPEN_STATE;
  569. } else if (cp === $.SOLIDUS) {
  570. this.state = END_TAG_OPEN_STATE;
  571. } else if (isAsciiLetter(cp)) {
  572. this._createStartTagToken();
  573. this._reconsumeInState(TAG_NAME_STATE);
  574. } else if (cp === $.QUESTION_MARK) {
  575. this._err(ERR.unexpectedQuestionMarkInsteadOfTagName);
  576. this._createCommentToken();
  577. this._reconsumeInState(BOGUS_COMMENT_STATE);
  578. } else if (cp === $.EOF) {
  579. this._err(ERR.eofBeforeTagName);
  580. this._emitChars('<');
  581. this._emitEOFToken();
  582. } else {
  583. this._err(ERR.invalidFirstCharacterOfTagName);
  584. this._emitChars('<');
  585. this._reconsumeInState(DATA_STATE);
  586. }
  587. }
  588. // End tag open state
  589. //------------------------------------------------------------------
  590. [END_TAG_OPEN_STATE](cp) {
  591. if (isAsciiLetter(cp)) {
  592. this._createEndTagToken();
  593. this._reconsumeInState(TAG_NAME_STATE);
  594. } else if (cp === $.GREATER_THAN_SIGN) {
  595. this._err(ERR.missingEndTagName);
  596. this.state = DATA_STATE;
  597. } else if (cp === $.EOF) {
  598. this._err(ERR.eofBeforeTagName);
  599. this._emitChars('</');
  600. this._emitEOFToken();
  601. } else {
  602. this._err(ERR.invalidFirstCharacterOfTagName);
  603. this._createCommentToken();
  604. this._reconsumeInState(BOGUS_COMMENT_STATE);
  605. }
  606. }
  607. // Tag name state
  608. //------------------------------------------------------------------
  609. [TAG_NAME_STATE](cp) {
  610. if (isWhitespace(cp)) {
  611. this.state = BEFORE_ATTRIBUTE_NAME_STATE;
  612. } else if (cp === $.SOLIDUS) {
  613. this.state = SELF_CLOSING_START_TAG_STATE;
  614. } else if (cp === $.GREATER_THAN_SIGN) {
  615. this.state = DATA_STATE;
  616. this._emitCurrentToken();
  617. } else if (isAsciiUpper(cp)) {
  618. this.currentToken.tagName += toAsciiLowerChar(cp);
  619. } else if (cp === $.NULL) {
  620. this._err(ERR.unexpectedNullCharacter);
  621. this.currentToken.tagName += unicode.REPLACEMENT_CHARACTER;
  622. } else if (cp === $.EOF) {
  623. this._err(ERR.eofInTag);
  624. this._emitEOFToken();
  625. } else {
  626. this.currentToken.tagName += toChar(cp);
  627. }
  628. }
  629. // RCDATA less-than sign state
  630. //------------------------------------------------------------------
  631. [RCDATA_LESS_THAN_SIGN_STATE](cp) {
  632. if (cp === $.SOLIDUS) {
  633. this.tempBuff = [];
  634. this.state = RCDATA_END_TAG_OPEN_STATE;
  635. } else {
  636. this._emitChars('<');
  637. this._reconsumeInState(RCDATA_STATE);
  638. }
  639. }
  640. // RCDATA end tag open state
  641. //------------------------------------------------------------------
  642. [RCDATA_END_TAG_OPEN_STATE](cp) {
  643. if (isAsciiLetter(cp)) {
  644. this._createEndTagToken();
  645. this._reconsumeInState(RCDATA_END_TAG_NAME_STATE);
  646. } else {
  647. this._emitChars('</');
  648. this._reconsumeInState(RCDATA_STATE);
  649. }
  650. }
  651. // RCDATA end tag name state
  652. //------------------------------------------------------------------
  653. [RCDATA_END_TAG_NAME_STATE](cp) {
  654. if (isAsciiUpper(cp)) {
  655. this.currentToken.tagName += toAsciiLowerChar(cp);
  656. this.tempBuff.push(cp);
  657. } else if (isAsciiLower(cp)) {
  658. this.currentToken.tagName += toChar(cp);
  659. this.tempBuff.push(cp);
  660. } else {
  661. if (this.lastStartTagName === this.currentToken.tagName) {
  662. if (isWhitespace(cp)) {
  663. this.state = BEFORE_ATTRIBUTE_NAME_STATE;
  664. return;
  665. }
  666. if (cp === $.SOLIDUS) {
  667. this.state = SELF_CLOSING_START_TAG_STATE;
  668. return;
  669. }
  670. if (cp === $.GREATER_THAN_SIGN) {
  671. this.state = DATA_STATE;
  672. this._emitCurrentToken();
  673. return;
  674. }
  675. }
  676. this._emitChars('</');
  677. this._emitSeveralCodePoints(this.tempBuff);
  678. this._reconsumeInState(RCDATA_STATE);
  679. }
  680. }
  681. // RAWTEXT less-than sign state
  682. //------------------------------------------------------------------
  683. [RAWTEXT_LESS_THAN_SIGN_STATE](cp) {
  684. if (cp === $.SOLIDUS) {
  685. this.tempBuff = [];
  686. this.state = RAWTEXT_END_TAG_OPEN_STATE;
  687. } else {
  688. this._emitChars('<');
  689. this._reconsumeInState(RAWTEXT_STATE);
  690. }
  691. }
  692. // RAWTEXT end tag open state
  693. //------------------------------------------------------------------
  694. [RAWTEXT_END_TAG_OPEN_STATE](cp) {
  695. if (isAsciiLetter(cp)) {
  696. this._createEndTagToken();
  697. this._reconsumeInState(RAWTEXT_END_TAG_NAME_STATE);
  698. } else {
  699. this._emitChars('</');
  700. this._reconsumeInState(RAWTEXT_STATE);
  701. }
  702. }
  703. // RAWTEXT end tag name state
  704. //------------------------------------------------------------------
  705. [RAWTEXT_END_TAG_NAME_STATE](cp) {
  706. if (isAsciiUpper(cp)) {
  707. this.currentToken.tagName += toAsciiLowerChar(cp);
  708. this.tempBuff.push(cp);
  709. } else if (isAsciiLower(cp)) {
  710. this.currentToken.tagName += toChar(cp);
  711. this.tempBuff.push(cp);
  712. } else {
  713. if (this.lastStartTagName === this.currentToken.tagName) {
  714. if (isWhitespace(cp)) {
  715. this.state = BEFORE_ATTRIBUTE_NAME_STATE;
  716. return;
  717. }
  718. if (cp === $.SOLIDUS) {
  719. this.state = SELF_CLOSING_START_TAG_STATE;
  720. return;
  721. }
  722. if (cp === $.GREATER_THAN_SIGN) {
  723. this._emitCurrentToken();
  724. this.state = DATA_STATE;
  725. return;
  726. }
  727. }
  728. this._emitChars('</');
  729. this._emitSeveralCodePoints(this.tempBuff);
  730. this._reconsumeInState(RAWTEXT_STATE);
  731. }
  732. }
  733. // Script data less-than sign state
  734. //------------------------------------------------------------------
  735. [SCRIPT_DATA_LESS_THAN_SIGN_STATE](cp) {
  736. if (cp === $.SOLIDUS) {
  737. this.tempBuff = [];
  738. this.state = SCRIPT_DATA_END_TAG_OPEN_STATE;
  739. } else if (cp === $.EXCLAMATION_MARK) {
  740. this.state = SCRIPT_DATA_ESCAPE_START_STATE;
  741. this._emitChars('<!');
  742. } else {
  743. this._emitChars('<');
  744. this._reconsumeInState(SCRIPT_DATA_STATE);
  745. }
  746. }
  747. // Script data end tag open state
  748. //------------------------------------------------------------------
  749. [SCRIPT_DATA_END_TAG_OPEN_STATE](cp) {
  750. if (isAsciiLetter(cp)) {
  751. this._createEndTagToken();
  752. this._reconsumeInState(SCRIPT_DATA_END_TAG_NAME_STATE);
  753. } else {
  754. this._emitChars('</');
  755. this._reconsumeInState(SCRIPT_DATA_STATE);
  756. }
  757. }
  758. // Script data end tag name state
  759. //------------------------------------------------------------------
  760. [SCRIPT_DATA_END_TAG_NAME_STATE](cp) {
  761. if (isAsciiUpper(cp)) {
  762. this.currentToken.tagName += toAsciiLowerChar(cp);
  763. this.tempBuff.push(cp);
  764. } else if (isAsciiLower(cp)) {
  765. this.currentToken.tagName += toChar(cp);
  766. this.tempBuff.push(cp);
  767. } else {
  768. if (this.lastStartTagName === this.currentToken.tagName) {
  769. if (isWhitespace(cp)) {
  770. this.state = BEFORE_ATTRIBUTE_NAME_STATE;
  771. return;
  772. } else if (cp === $.SOLIDUS) {
  773. this.state = SELF_CLOSING_START_TAG_STATE;
  774. return;
  775. } else if (cp === $.GREATER_THAN_SIGN) {
  776. this._emitCurrentToken();
  777. this.state = DATA_STATE;
  778. return;
  779. }
  780. }
  781. this._emitChars('</');
  782. this._emitSeveralCodePoints(this.tempBuff);
  783. this._reconsumeInState(SCRIPT_DATA_STATE);
  784. }
  785. }
  786. // Script data escape start state
  787. //------------------------------------------------------------------
  788. [SCRIPT_DATA_ESCAPE_START_STATE](cp) {
  789. if (cp === $.HYPHEN_MINUS) {
  790. this.state = SCRIPT_DATA_ESCAPE_START_DASH_STATE;
  791. this._emitChars('-');
  792. } else {
  793. this._reconsumeInState(SCRIPT_DATA_STATE);
  794. }
  795. }
  796. // Script data escape start dash state
  797. //------------------------------------------------------------------
  798. [SCRIPT_DATA_ESCAPE_START_DASH_STATE](cp) {
  799. if (cp === $.HYPHEN_MINUS) {
  800. this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
  801. this._emitChars('-');
  802. } else {
  803. this._reconsumeInState(SCRIPT_DATA_STATE);
  804. }
  805. }
  806. // Script data escaped state
  807. //------------------------------------------------------------------
  808. [SCRIPT_DATA_ESCAPED_STATE](cp) {
  809. if (cp === $.HYPHEN_MINUS) {
  810. this.state = SCRIPT_DATA_ESCAPED_DASH_STATE;
  811. this._emitChars('-');
  812. } else if (cp === $.LESS_THAN_SIGN) {
  813. this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
  814. } else if (cp === $.NULL) {
  815. this._err(ERR.unexpectedNullCharacter);
  816. this._emitChars(unicode.REPLACEMENT_CHARACTER);
  817. } else if (cp === $.EOF) {
  818. this._err(ERR.eofInScriptHtmlCommentLikeText);
  819. this._emitEOFToken();
  820. } else {
  821. this._emitCodePoint(cp);
  822. }
  823. }
  824. // Script data escaped dash state
  825. //------------------------------------------------------------------
  826. [SCRIPT_DATA_ESCAPED_DASH_STATE](cp) {
  827. if (cp === $.HYPHEN_MINUS) {
  828. this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
  829. this._emitChars('-');
  830. } else if (cp === $.LESS_THAN_SIGN) {
  831. this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
  832. } else if (cp === $.NULL) {
  833. this._err(ERR.unexpectedNullCharacter);
  834. this.state = SCRIPT_DATA_ESCAPED_STATE;
  835. this._emitChars(unicode.REPLACEMENT_CHARACTER);
  836. } else if (cp === $.EOF) {
  837. this._err(ERR.eofInScriptHtmlCommentLikeText);
  838. this._emitEOFToken();
  839. } else {
  840. this.state = SCRIPT_DATA_ESCAPED_STATE;
  841. this._emitCodePoint(cp);
  842. }
  843. }
  844. // Script data escaped dash dash state
  845. //------------------------------------------------------------------
  846. [SCRIPT_DATA_ESCAPED_DASH_DASH_STATE](cp) {
  847. if (cp === $.HYPHEN_MINUS) {
  848. this._emitChars('-');
  849. } else if (cp === $.LESS_THAN_SIGN) {
  850. this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
  851. } else if (cp === $.GREATER_THAN_SIGN) {
  852. this.state = SCRIPT_DATA_STATE;
  853. this._emitChars('>');
  854. } else if (cp === $.NULL) {
  855. this._err(ERR.unexpectedNullCharacter);
  856. this.state = SCRIPT_DATA_ESCAPED_STATE;
  857. this._emitChars(unicode.REPLACEMENT_CHARACTER);
  858. } else if (cp === $.EOF) {
  859. this._err(ERR.eofInScriptHtmlCommentLikeText);
  860. this._emitEOFToken();
  861. } else {
  862. this.state = SCRIPT_DATA_ESCAPED_STATE;
  863. this._emitCodePoint(cp);
  864. }
  865. }
  866. // Script data escaped less-than sign state
  867. //------------------------------------------------------------------
  868. [SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE](cp) {
  869. if (cp === $.SOLIDUS) {
  870. this.tempBuff = [];
  871. this.state = SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE;
  872. } else if (isAsciiLetter(cp)) {
  873. this.tempBuff = [];
  874. this._emitChars('<');
  875. this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE);
  876. } else {
  877. this._emitChars('<');
  878. this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
  879. }
  880. }
  881. // Script data escaped end tag open state
  882. //------------------------------------------------------------------
  883. [SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE](cp) {
  884. if (isAsciiLetter(cp)) {
  885. this._createEndTagToken();
  886. this._reconsumeInState(SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE);
  887. } else {
  888. this._emitChars('</');
  889. this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
  890. }
  891. }
  892. // Script data escaped end tag name state
  893. //------------------------------------------------------------------
  894. [SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE](cp) {
  895. if (isAsciiUpper(cp)) {
  896. this.currentToken.tagName += toAsciiLowerChar(cp);
  897. this.tempBuff.push(cp);
  898. } else if (isAsciiLower(cp)) {
  899. this.currentToken.tagName += toChar(cp);
  900. this.tempBuff.push(cp);
  901. } else {
  902. if (this.lastStartTagName === this.currentToken.tagName) {
  903. if (isWhitespace(cp)) {
  904. this.state = BEFORE_ATTRIBUTE_NAME_STATE;
  905. return;
  906. }
  907. if (cp === $.SOLIDUS) {
  908. this.state = SELF_CLOSING_START_TAG_STATE;
  909. return;
  910. }
  911. if (cp === $.GREATER_THAN_SIGN) {
  912. this._emitCurrentToken();
  913. this.state = DATA_STATE;
  914. return;
  915. }
  916. }
  917. this._emitChars('</');
  918. this._emitSeveralCodePoints(this.tempBuff);
  919. this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
  920. }
  921. }
  922. // Script data double escape start state
  923. //------------------------------------------------------------------
  924. [SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE](cp) {
  925. if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
  926. this.state = this._isTempBufferEqualToScriptString()
  927. ? SCRIPT_DATA_DOUBLE_ESCAPED_STATE
  928. : SCRIPT_DATA_ESCAPED_STATE;
  929. this._emitCodePoint(cp);
  930. } else if (isAsciiUpper(cp)) {
  931. this.tempBuff.push(toAsciiLowerCodePoint(cp));
  932. this._emitCodePoint(cp);
  933. } else if (isAsciiLower(cp)) {
  934. this.tempBuff.push(cp);
  935. this._emitCodePoint(cp);
  936. } else {
  937. this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
  938. }
  939. }
  940. // Script data double escaped state
  941. //------------------------------------------------------------------
  942. [SCRIPT_DATA_DOUBLE_ESCAPED_STATE](cp) {
  943. if (cp === $.HYPHEN_MINUS) {
  944. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE;
  945. this._emitChars('-');
  946. } else if (cp === $.LESS_THAN_SIGN) {
  947. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
  948. this._emitChars('<');
  949. } else if (cp === $.NULL) {
  950. this._err(ERR.unexpectedNullCharacter);
  951. this._emitChars(unicode.REPLACEMENT_CHARACTER);
  952. } else if (cp === $.EOF) {
  953. this._err(ERR.eofInScriptHtmlCommentLikeText);
  954. this._emitEOFToken();
  955. } else {
  956. this._emitCodePoint(cp);
  957. }
  958. }
  959. // Script data double escaped dash state
  960. //------------------------------------------------------------------
  961. [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE](cp) {
  962. if (cp === $.HYPHEN_MINUS) {
  963. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE;
  964. this._emitChars('-');
  965. } else if (cp === $.LESS_THAN_SIGN) {
  966. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
  967. this._emitChars('<');
  968. } else if (cp === $.NULL) {
  969. this._err(ERR.unexpectedNullCharacter);
  970. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
  971. this._emitChars(unicode.REPLACEMENT_CHARACTER);
  972. } else if (cp === $.EOF) {
  973. this._err(ERR.eofInScriptHtmlCommentLikeText);
  974. this._emitEOFToken();
  975. } else {
  976. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
  977. this._emitCodePoint(cp);
  978. }
  979. }
  980. // Script data double escaped dash dash state
  981. //------------------------------------------------------------------
  982. [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE](cp) {
  983. if (cp === $.HYPHEN_MINUS) {
  984. this._emitChars('-');
  985. } else if (cp === $.LESS_THAN_SIGN) {
  986. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
  987. this._emitChars('<');
  988. } else if (cp === $.GREATER_THAN_SIGN) {
  989. this.state = SCRIPT_DATA_STATE;
  990. this._emitChars('>');
  991. } else if (cp === $.NULL) {
  992. this._err(ERR.unexpectedNullCharacter);
  993. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
  994. this._emitChars(unicode.REPLACEMENT_CHARACTER);
  995. } else if (cp === $.EOF) {
  996. this._err(ERR.eofInScriptHtmlCommentLikeText);
  997. this._emitEOFToken();
  998. } else {
  999. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
  1000. this._emitCodePoint(cp);
  1001. }
  1002. }
  1003. // Script data double escaped less-than sign state
  1004. //------------------------------------------------------------------
  1005. [SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE](cp) {
  1006. if (cp === $.SOLIDUS) {
  1007. this.tempBuff = [];
  1008. this.state = SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE;
  1009. this._emitChars('/');
  1010. } else {
  1011. this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
  1012. }
  1013. }
  1014. // Script data double escape end state
  1015. //------------------------------------------------------------------
  1016. [SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE](cp) {
  1017. if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
  1018. this.state = this._isTempBufferEqualToScriptString()
  1019. ? SCRIPT_DATA_ESCAPED_STATE
  1020. : SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
  1021. this._emitCodePoint(cp);
  1022. } else if (isAsciiUpper(cp)) {
  1023. this.tempBuff.push(toAsciiLowerCodePoint(cp));
  1024. this._emitCodePoint(cp);
  1025. } else if (isAsciiLower(cp)) {
  1026. this.tempBuff.push(cp);
  1027. this._emitCodePoint(cp);
  1028. } else {
  1029. this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
  1030. }
  1031. }
  1032. // Before attribute name state
  1033. //------------------------------------------------------------------
  1034. [BEFORE_ATTRIBUTE_NAME_STATE](cp) {
  1035. if (isWhitespace(cp)) {
  1036. return;
  1037. }
  1038. if (cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN || cp === $.EOF) {
  1039. this._reconsumeInState(AFTER_ATTRIBUTE_NAME_STATE);
  1040. } else if (cp === $.EQUALS_SIGN) {
  1041. this._err(ERR.unexpectedEqualsSignBeforeAttributeName);
  1042. this._createAttr('=');
  1043. this.state = ATTRIBUTE_NAME_STATE;
  1044. } else {
  1045. this._createAttr('');
  1046. this._reconsumeInState(ATTRIBUTE_NAME_STATE);
  1047. }
  1048. }
  1049. // Attribute name state
  1050. //------------------------------------------------------------------
  1051. [ATTRIBUTE_NAME_STATE](cp) {
  1052. if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN || cp === $.EOF) {
  1053. this._leaveAttrName(AFTER_ATTRIBUTE_NAME_STATE);
  1054. this._unconsume();
  1055. } else if (cp === $.EQUALS_SIGN) {
  1056. this._leaveAttrName(BEFORE_ATTRIBUTE_VALUE_STATE);
  1057. } else if (isAsciiUpper(cp)) {
  1058. this.currentAttr.name += toAsciiLowerChar(cp);
  1059. } else if (cp === $.QUOTATION_MARK || cp === $.APOSTROPHE || cp === $.LESS_THAN_SIGN) {
  1060. this._err(ERR.unexpectedCharacterInAttributeName);
  1061. this.currentAttr.name += toChar(cp);
  1062. } else if (cp === $.NULL) {
  1063. this._err(ERR.unexpectedNullCharacter);
  1064. this.currentAttr.name += unicode.REPLACEMENT_CHARACTER;
  1065. } else {
  1066. this.currentAttr.name += toChar(cp);
  1067. }
  1068. }
  1069. // After attribute name state
  1070. //------------------------------------------------------------------
  1071. [AFTER_ATTRIBUTE_NAME_STATE](cp) {
  1072. if (isWhitespace(cp)) {
  1073. return;
  1074. }
  1075. if (cp === $.SOLIDUS) {
  1076. this.state = SELF_CLOSING_START_TAG_STATE;
  1077. } else if (cp === $.EQUALS_SIGN) {
  1078. this.state = BEFORE_ATTRIBUTE_VALUE_STATE;
  1079. } else if (cp === $.GREATER_THAN_SIGN) {
  1080. this.state = DATA_STATE;
  1081. this._emitCurrentToken();
  1082. } else if (cp === $.EOF) {
  1083. this._err(ERR.eofInTag);
  1084. this._emitEOFToken();
  1085. } else {
  1086. this._createAttr('');
  1087. this._reconsumeInState(ATTRIBUTE_NAME_STATE);
  1088. }
  1089. }
  1090. // Before attribute value state
  1091. //------------------------------------------------------------------
  1092. [BEFORE_ATTRIBUTE_VALUE_STATE](cp) {
  1093. if (isWhitespace(cp)) {
  1094. return;
  1095. }
  1096. if (cp === $.QUOTATION_MARK) {
  1097. this.state = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
  1098. } else if (cp === $.APOSTROPHE) {
  1099. this.state = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
  1100. } else if (cp === $.GREATER_THAN_SIGN) {
  1101. this._err(ERR.missingAttributeValue);
  1102. this.state = DATA_STATE;
  1103. this._emitCurrentToken();
  1104. } else {
  1105. this._reconsumeInState(ATTRIBUTE_VALUE_UNQUOTED_STATE);
  1106. }
  1107. }
  1108. // Attribute value (double-quoted) state
  1109. //------------------------------------------------------------------
  1110. [ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE](cp) {
  1111. if (cp === $.QUOTATION_MARK) {
  1112. this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
  1113. } else if (cp === $.AMPERSAND) {
  1114. this.returnState = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
  1115. this.state = CHARACTER_REFERENCE_STATE;
  1116. } else if (cp === $.NULL) {
  1117. this._err(ERR.unexpectedNullCharacter);
  1118. this.currentAttr.value += unicode.REPLACEMENT_CHARACTER;
  1119. } else if (cp === $.EOF) {
  1120. this._err(ERR.eofInTag);
  1121. this._emitEOFToken();
  1122. } else {
  1123. this.currentAttr.value += toChar(cp);
  1124. }
  1125. }
  1126. // Attribute value (single-quoted) state
  1127. //------------------------------------------------------------------
  1128. [ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE](cp) {
  1129. if (cp === $.APOSTROPHE) {
  1130. this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
  1131. } else if (cp === $.AMPERSAND) {
  1132. this.returnState = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
  1133. this.state = CHARACTER_REFERENCE_STATE;
  1134. } else if (cp === $.NULL) {
  1135. this._err(ERR.unexpectedNullCharacter);
  1136. this.currentAttr.value += unicode.REPLACEMENT_CHARACTER;
  1137. } else if (cp === $.EOF) {
  1138. this._err(ERR.eofInTag);
  1139. this._emitEOFToken();
  1140. } else {
  1141. this.currentAttr.value += toChar(cp);
  1142. }
  1143. }
  1144. // Attribute value (unquoted) state
  1145. //------------------------------------------------------------------
  1146. [ATTRIBUTE_VALUE_UNQUOTED_STATE](cp) {
  1147. if (isWhitespace(cp)) {
  1148. this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE);
  1149. } else if (cp === $.AMPERSAND) {
  1150. this.returnState = ATTRIBUTE_VALUE_UNQUOTED_STATE;
  1151. this.state = CHARACTER_REFERENCE_STATE;
  1152. } else if (cp === $.GREATER_THAN_SIGN) {
  1153. this._leaveAttrValue(DATA_STATE);
  1154. this._emitCurrentToken();
  1155. } else if (cp === $.NULL) {
  1156. this._err(ERR.unexpectedNullCharacter);
  1157. this.currentAttr.value += unicode.REPLACEMENT_CHARACTER;
  1158. } else if (
  1159. cp === $.QUOTATION_MARK ||
  1160. cp === $.APOSTROPHE ||
  1161. cp === $.LESS_THAN_SIGN ||
  1162. cp === $.EQUALS_SIGN ||
  1163. cp === $.GRAVE_ACCENT
  1164. ) {
  1165. this._err(ERR.unexpectedCharacterInUnquotedAttributeValue);
  1166. this.currentAttr.value += toChar(cp);
  1167. } else if (cp === $.EOF) {
  1168. this._err(ERR.eofInTag);
  1169. this._emitEOFToken();
  1170. } else {
  1171. this.currentAttr.value += toChar(cp);
  1172. }
  1173. }
  1174. // After attribute value (quoted) state
  1175. //------------------------------------------------------------------
  1176. [AFTER_ATTRIBUTE_VALUE_QUOTED_STATE](cp) {
  1177. if (isWhitespace(cp)) {
  1178. this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE);
  1179. } else if (cp === $.SOLIDUS) {
  1180. this._leaveAttrValue(SELF_CLOSING_START_TAG_STATE);
  1181. } else if (cp === $.GREATER_THAN_SIGN) {
  1182. this._leaveAttrValue(DATA_STATE);
  1183. this._emitCurrentToken();
  1184. } else if (cp === $.EOF) {
  1185. this._err(ERR.eofInTag);
  1186. this._emitEOFToken();
  1187. } else {
  1188. this._err(ERR.missingWhitespaceBetweenAttributes);
  1189. this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE);
  1190. }
  1191. }
  1192. // Self-closing start tag state
  1193. //------------------------------------------------------------------
  1194. [SELF_CLOSING_START_TAG_STATE](cp) {
  1195. if (cp === $.GREATER_THAN_SIGN) {
  1196. this.currentToken.selfClosing = true;
  1197. this.state = DATA_STATE;
  1198. this._emitCurrentToken();
  1199. } else if (cp === $.EOF) {
  1200. this._err(ERR.eofInTag);
  1201. this._emitEOFToken();
  1202. } else {
  1203. this._err(ERR.unexpectedSolidusInTag);
  1204. this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE);
  1205. }
  1206. }
  1207. // Bogus comment state
  1208. //------------------------------------------------------------------
  1209. [BOGUS_COMMENT_STATE](cp) {
  1210. if (cp === $.GREATER_THAN_SIGN) {
  1211. this.state = DATA_STATE;
  1212. this._emitCurrentToken();
  1213. } else if (cp === $.EOF) {
  1214. this._emitCurrentToken();
  1215. this._emitEOFToken();
  1216. } else if (cp === $.NULL) {
  1217. this._err(ERR.unexpectedNullCharacter);
  1218. this.currentToken.data += unicode.REPLACEMENT_CHARACTER;
  1219. } else {
  1220. this.currentToken.data += toChar(cp);
  1221. }
  1222. }
  1223. // Markup declaration open state
  1224. //------------------------------------------------------------------
  1225. [MARKUP_DECLARATION_OPEN_STATE](cp) {
  1226. if (this._consumeSequenceIfMatch($$.DASH_DASH_STRING, cp, true)) {
  1227. this._createCommentToken();
  1228. this.state = COMMENT_START_STATE;
  1229. } else if (this._consumeSequenceIfMatch($$.DOCTYPE_STRING, cp, false)) {
  1230. this.state = DOCTYPE_STATE;
  1231. } else if (this._consumeSequenceIfMatch($$.CDATA_START_STRING, cp, true)) {
  1232. if (this.allowCDATA) {
  1233. this.state = CDATA_SECTION_STATE;
  1234. } else {
  1235. this._err(ERR.cdataInHtmlContent);
  1236. this._createCommentToken();
  1237. this.currentToken.data = '[CDATA[';
  1238. this.state = BOGUS_COMMENT_STATE;
  1239. }
  1240. }
  1241. //NOTE: sequence lookup can be abrupted by hibernation. In that case lookup
  1242. //results are no longer valid and we will need to start over.
  1243. else if (!this._ensureHibernation()) {
  1244. this._err(ERR.incorrectlyOpenedComment);
  1245. this._createCommentToken();
  1246. this._reconsumeInState(BOGUS_COMMENT_STATE);
  1247. }
  1248. }
  1249. // Comment start state
  1250. //------------------------------------------------------------------
  1251. [COMMENT_START_STATE](cp) {
  1252. if (cp === $.HYPHEN_MINUS) {
  1253. this.state = COMMENT_START_DASH_STATE;
  1254. } else if (cp === $.GREATER_THAN_SIGN) {
  1255. this._err(ERR.abruptClosingOfEmptyComment);
  1256. this.state = DATA_STATE;
  1257. this._emitCurrentToken();
  1258. } else {
  1259. this._reconsumeInState(COMMENT_STATE);
  1260. }
  1261. }
  1262. // Comment start dash state
  1263. //------------------------------------------------------------------
  1264. [COMMENT_START_DASH_STATE](cp) {
  1265. if (cp === $.HYPHEN_MINUS) {
  1266. this.state = COMMENT_END_STATE;
  1267. } else if (cp === $.GREATER_THAN_SIGN) {
  1268. this._err(ERR.abruptClosingOfEmptyComment);
  1269. this.state = DATA_STATE;
  1270. this._emitCurrentToken();
  1271. } else if (cp === $.EOF) {
  1272. this._err(ERR.eofInComment);
  1273. this._emitCurrentToken();
  1274. this._emitEOFToken();
  1275. } else {
  1276. this.currentToken.data += '-';
  1277. this._reconsumeInState(COMMENT_STATE);
  1278. }
  1279. }
  1280. // Comment state
  1281. //------------------------------------------------------------------
  1282. [COMMENT_STATE](cp) {
  1283. if (cp === $.HYPHEN_MINUS) {
  1284. this.state = COMMENT_END_DASH_STATE;
  1285. } else if (cp === $.LESS_THAN_SIGN) {
  1286. this.currentToken.data += '<';
  1287. this.state = COMMENT_LESS_THAN_SIGN_STATE;
  1288. } else if (cp === $.NULL) {
  1289. this._err(ERR.unexpectedNullCharacter);
  1290. this.currentToken.data += unicode.REPLACEMENT_CHARACTER;
  1291. } else if (cp === $.EOF) {
  1292. this._err(ERR.eofInComment);
  1293. this._emitCurrentToken();
  1294. this._emitEOFToken();
  1295. } else {
  1296. this.currentToken.data += toChar(cp);
  1297. }
  1298. }
  1299. // Comment less-than sign state
  1300. //------------------------------------------------------------------
  1301. [COMMENT_LESS_THAN_SIGN_STATE](cp) {
  1302. if (cp === $.EXCLAMATION_MARK) {
  1303. this.currentToken.data += '!';
  1304. this.state = COMMENT_LESS_THAN_SIGN_BANG_STATE;
  1305. } else if (cp === $.LESS_THAN_SIGN) {
  1306. this.currentToken.data += '!';
  1307. } else {
  1308. this._reconsumeInState(COMMENT_STATE);
  1309. }
  1310. }
  1311. // Comment less-than sign bang state
  1312. //------------------------------------------------------------------
  1313. [COMMENT_LESS_THAN_SIGN_BANG_STATE](cp) {
  1314. if (cp === $.HYPHEN_MINUS) {
  1315. this.state = COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE;
  1316. } else {
  1317. this._reconsumeInState(COMMENT_STATE);
  1318. }
  1319. }
  1320. // Comment less-than sign bang dash state
  1321. //------------------------------------------------------------------
  1322. [COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE](cp) {
  1323. if (cp === $.HYPHEN_MINUS) {
  1324. this.state = COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE;
  1325. } else {
  1326. this._reconsumeInState(COMMENT_END_DASH_STATE);
  1327. }
  1328. }
  1329. // Comment less-than sign bang dash dash state
  1330. //------------------------------------------------------------------
  1331. [COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE](cp) {
  1332. if (cp !== $.GREATER_THAN_SIGN && cp !== $.EOF) {
  1333. this._err(ERR.nestedComment);
  1334. }
  1335. this._reconsumeInState(COMMENT_END_STATE);
  1336. }
  1337. // Comment end dash state
  1338. //------------------------------------------------------------------
  1339. [COMMENT_END_DASH_STATE](cp) {
  1340. if (cp === $.HYPHEN_MINUS) {
  1341. this.state = COMMENT_END_STATE;
  1342. } else if (cp === $.EOF) {
  1343. this._err(ERR.eofInComment);
  1344. this._emitCurrentToken();
  1345. this._emitEOFToken();
  1346. } else {
  1347. this.currentToken.data += '-';
  1348. this._reconsumeInState(COMMENT_STATE);
  1349. }
  1350. }
  1351. // Comment end state
  1352. //------------------------------------------------------------------
  1353. [COMMENT_END_STATE](cp) {
  1354. if (cp === $.GREATER_THAN_SIGN) {
  1355. this.state = DATA_STATE;
  1356. this._emitCurrentToken();
  1357. } else if (cp === $.EXCLAMATION_MARK) {
  1358. this.state = COMMENT_END_BANG_STATE;
  1359. } else if (cp === $.HYPHEN_MINUS) {
  1360. this.currentToken.data += '-';
  1361. } else if (cp === $.EOF) {
  1362. this._err(ERR.eofInComment);
  1363. this._emitCurrentToken();
  1364. this._emitEOFToken();
  1365. } else {
  1366. this.currentToken.data += '--';
  1367. this._reconsumeInState(COMMENT_STATE);
  1368. }
  1369. }
  1370. // Comment end bang state
  1371. //------------------------------------------------------------------
  1372. [COMMENT_END_BANG_STATE](cp) {
  1373. if (cp === $.HYPHEN_MINUS) {
  1374. this.currentToken.data += '--!';
  1375. this.state = COMMENT_END_DASH_STATE;
  1376. } else if (cp === $.GREATER_THAN_SIGN) {
  1377. this._err(ERR.incorrectlyClosedComment);
  1378. this.state = DATA_STATE;
  1379. this._emitCurrentToken();
  1380. } else if (cp === $.EOF) {
  1381. this._err(ERR.eofInComment);
  1382. this._emitCurrentToken();
  1383. this._emitEOFToken();
  1384. } else {
  1385. this.currentToken.data += '--!';
  1386. this._reconsumeInState(COMMENT_STATE);
  1387. }
  1388. }
  1389. // DOCTYPE state
  1390. //------------------------------------------------------------------
  1391. [DOCTYPE_STATE](cp) {
  1392. if (isWhitespace(cp)) {
  1393. this.state = BEFORE_DOCTYPE_NAME_STATE;
  1394. } else if (cp === $.GREATER_THAN_SIGN) {
  1395. this._reconsumeInState(BEFORE_DOCTYPE_NAME_STATE);
  1396. } else if (cp === $.EOF) {
  1397. this._err(ERR.eofInDoctype);
  1398. this._createDoctypeToken(null);
  1399. this.currentToken.forceQuirks = true;
  1400. this._emitCurrentToken();
  1401. this._emitEOFToken();
  1402. } else {
  1403. this._err(ERR.missingWhitespaceBeforeDoctypeName);
  1404. this._reconsumeInState(BEFORE_DOCTYPE_NAME_STATE);
  1405. }
  1406. }
  1407. // Before DOCTYPE name state
  1408. //------------------------------------------------------------------
  1409. [BEFORE_DOCTYPE_NAME_STATE](cp) {
  1410. if (isWhitespace(cp)) {
  1411. return;
  1412. }
  1413. if (isAsciiUpper(cp)) {
  1414. this._createDoctypeToken(toAsciiLowerChar(cp));
  1415. this.state = DOCTYPE_NAME_STATE;
  1416. } else if (cp === $.NULL) {
  1417. this._err(ERR.unexpectedNullCharacter);
  1418. this._createDoctypeToken(unicode.REPLACEMENT_CHARACTER);
  1419. this.state = DOCTYPE_NAME_STATE;
  1420. } else if (cp === $.GREATER_THAN_SIGN) {
  1421. this._err(ERR.missingDoctypeName);
  1422. this._createDoctypeToken(null);
  1423. this.currentToken.forceQuirks = true;
  1424. this._emitCurrentToken();
  1425. this.state = DATA_STATE;
  1426. } else if (cp === $.EOF) {
  1427. this._err(ERR.eofInDoctype);
  1428. this._createDoctypeToken(null);
  1429. this.currentToken.forceQuirks = true;
  1430. this._emitCurrentToken();
  1431. this._emitEOFToken();
  1432. } else {
  1433. this._createDoctypeToken(toChar(cp));
  1434. this.state = DOCTYPE_NAME_STATE;
  1435. }
  1436. }
  1437. // DOCTYPE name state
  1438. //------------------------------------------------------------------
  1439. [DOCTYPE_NAME_STATE](cp) {
  1440. if (isWhitespace(cp)) {
  1441. this.state = AFTER_DOCTYPE_NAME_STATE;
  1442. } else if (cp === $.GREATER_THAN_SIGN) {
  1443. this.state = DATA_STATE;
  1444. this._emitCurrentToken();
  1445. } else if (isAsciiUpper(cp)) {
  1446. this.currentToken.name += toAsciiLowerChar(cp);
  1447. } else if (cp === $.NULL) {
  1448. this._err(ERR.unexpectedNullCharacter);
  1449. this.currentToken.name += unicode.REPLACEMENT_CHARACTER;
  1450. } else if (cp === $.EOF) {
  1451. this._err(ERR.eofInDoctype);
  1452. this.currentToken.forceQuirks = true;
  1453. this._emitCurrentToken();
  1454. this._emitEOFToken();
  1455. } else {
  1456. this.currentToken.name += toChar(cp);
  1457. }
  1458. }
  1459. // After DOCTYPE name state
  1460. //------------------------------------------------------------------
  1461. [AFTER_DOCTYPE_NAME_STATE](cp) {
  1462. if (isWhitespace(cp)) {
  1463. return;
  1464. }
  1465. if (cp === $.GREATER_THAN_SIGN) {
  1466. this.state = DATA_STATE;
  1467. this._emitCurrentToken();
  1468. } else if (cp === $.EOF) {
  1469. this._err(ERR.eofInDoctype);
  1470. this.currentToken.forceQuirks = true;
  1471. this._emitCurrentToken();
  1472. this._emitEOFToken();
  1473. } else if (this._consumeSequenceIfMatch($$.PUBLIC_STRING, cp, false)) {
  1474. this.state = AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE;
  1475. } else if (this._consumeSequenceIfMatch($$.SYSTEM_STRING, cp, false)) {
  1476. this.state = AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE;
  1477. }
  1478. //NOTE: sequence lookup can be abrupted by hibernation. In that case lookup
  1479. //results are no longer valid and we will need to start over.
  1480. else if (!this._ensureHibernation()) {
  1481. this._err(ERR.invalidCharacterSequenceAfterDoctypeName);
  1482. this.currentToken.forceQuirks = true;
  1483. this._reconsumeInState(BOGUS_DOCTYPE_STATE);
  1484. }
  1485. }
  1486. // After DOCTYPE public keyword state
  1487. //------------------------------------------------------------------
  1488. [AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE](cp) {
  1489. if (isWhitespace(cp)) {
  1490. this.state = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
  1491. } else if (cp === $.QUOTATION_MARK) {
  1492. this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword);
  1493. this.currentToken.publicId = '';
  1494. this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
  1495. } else if (cp === $.APOSTROPHE) {
  1496. this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword);
  1497. this.currentToken.publicId = '';
  1498. this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
  1499. } else if (cp === $.GREATER_THAN_SIGN) {
  1500. this._err(ERR.missingDoctypePublicIdentifier);
  1501. this.currentToken.forceQuirks = true;
  1502. this.state = DATA_STATE;
  1503. this._emitCurrentToken();
  1504. } else if (cp === $.EOF) {
  1505. this._err(ERR.eofInDoctype);
  1506. this.currentToken.forceQuirks = true;
  1507. this._emitCurrentToken();
  1508. this._emitEOFToken();
  1509. } else {
  1510. this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier);
  1511. this.currentToken.forceQuirks = true;
  1512. this._reconsumeInState(BOGUS_DOCTYPE_STATE);
  1513. }
  1514. }
  1515. // Before DOCTYPE public identifier state
  1516. //------------------------------------------------------------------
  1517. [BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE](cp) {
  1518. if (isWhitespace(cp)) {
  1519. return;
  1520. }
  1521. if (cp === $.QUOTATION_MARK) {
  1522. this.currentToken.publicId = '';
  1523. this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
  1524. } else if (cp === $.APOSTROPHE) {
  1525. this.currentToken.publicId = '';
  1526. this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
  1527. } else if (cp === $.GREATER_THAN_SIGN) {
  1528. this._err(ERR.missingDoctypePublicIdentifier);
  1529. this.currentToken.forceQuirks = true;
  1530. this.state = DATA_STATE;
  1531. this._emitCurrentToken();
  1532. } else if (cp === $.EOF) {
  1533. this._err(ERR.eofInDoctype);
  1534. this.currentToken.forceQuirks = true;
  1535. this._emitCurrentToken();
  1536. this._emitEOFToken();
  1537. } else {
  1538. this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier);
  1539. this.currentToken.forceQuirks = true;
  1540. this._reconsumeInState(BOGUS_DOCTYPE_STATE);
  1541. }
  1542. }
  1543. // DOCTYPE public identifier (double-quoted) state
  1544. //------------------------------------------------------------------
  1545. [DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE](cp) {
  1546. if (cp === $.QUOTATION_MARK) {
  1547. this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
  1548. } else if (cp === $.NULL) {
  1549. this._err(ERR.unexpectedNullCharacter);
  1550. this.currentToken.publicId += unicode.REPLACEMENT_CHARACTER;
  1551. } else if (cp === $.GREATER_THAN_SIGN) {
  1552. this._err(ERR.abruptDoctypePublicIdentifier);
  1553. this.currentToken.forceQuirks = true;
  1554. this._emitCurrentToken();
  1555. this.state = DATA_STATE;
  1556. } else if (cp === $.EOF) {
  1557. this._err(ERR.eofInDoctype);
  1558. this.currentToken.forceQuirks = true;
  1559. this._emitCurrentToken();
  1560. this._emitEOFToken();
  1561. } else {
  1562. this.currentToken.publicId += toChar(cp);
  1563. }
  1564. }
  1565. // DOCTYPE public identifier (single-quoted) state
  1566. //------------------------------------------------------------------
  1567. [DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE](cp) {
  1568. if (cp === $.APOSTROPHE) {
  1569. this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
  1570. } else if (cp === $.NULL) {
  1571. this._err(ERR.unexpectedNullCharacter);
  1572. this.currentToken.publicId += unicode.REPLACEMENT_CHARACTER;
  1573. } else if (cp === $.GREATER_THAN_SIGN) {
  1574. this._err(ERR.abruptDoctypePublicIdentifier);
  1575. this.currentToken.forceQuirks = true;
  1576. this._emitCurrentToken();
  1577. this.state = DATA_STATE;
  1578. } else if (cp === $.EOF) {
  1579. this._err(ERR.eofInDoctype);
  1580. this.currentToken.forceQuirks = true;
  1581. this._emitCurrentToken();
  1582. this._emitEOFToken();
  1583. } else {
  1584. this.currentToken.publicId += toChar(cp);
  1585. }
  1586. }
  1587. // After DOCTYPE public identifier state
  1588. //------------------------------------------------------------------
  1589. [AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE](cp) {
  1590. if (isWhitespace(cp)) {
  1591. this.state = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
  1592. } else if (cp === $.GREATER_THAN_SIGN) {
  1593. this.state = DATA_STATE;
  1594. this._emitCurrentToken();
  1595. } else if (cp === $.QUOTATION_MARK) {
  1596. this._err(ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
  1597. this.currentToken.systemId = '';
  1598. this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
  1599. } else if (cp === $.APOSTROPHE) {
  1600. this._err(ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
  1601. this.currentToken.systemId = '';
  1602. this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
  1603. } else if (cp === $.EOF) {
  1604. this._err(ERR.eofInDoctype);
  1605. this.currentToken.forceQuirks = true;
  1606. this._emitCurrentToken();
  1607. this._emitEOFToken();
  1608. } else {
  1609. this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
  1610. this.currentToken.forceQuirks = true;
  1611. this._reconsumeInState(BOGUS_DOCTYPE_STATE);
  1612. }
  1613. }
  1614. // Between DOCTYPE public and system identifiers state
  1615. //------------------------------------------------------------------
  1616. [BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE](cp) {
  1617. if (isWhitespace(cp)) {
  1618. return;
  1619. }
  1620. if (cp === $.GREATER_THAN_SIGN) {
  1621. this._emitCurrentToken();
  1622. this.state = DATA_STATE;
  1623. } else if (cp === $.QUOTATION_MARK) {
  1624. this.currentToken.systemId = '';
  1625. this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
  1626. } else if (cp === $.APOSTROPHE) {
  1627. this.currentToken.systemId = '';
  1628. this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
  1629. } else if (cp === $.EOF) {
  1630. this._err(ERR.eofInDoctype);
  1631. this.currentToken.forceQuirks = true;
  1632. this._emitCurrentToken();
  1633. this._emitEOFToken();
  1634. } else {
  1635. this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
  1636. this.currentToken.forceQuirks = true;
  1637. this._reconsumeInState(BOGUS_DOCTYPE_STATE);
  1638. }
  1639. }
  1640. // After DOCTYPE system keyword state
  1641. //------------------------------------------------------------------
  1642. [AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE](cp) {
  1643. if (isWhitespace(cp)) {
  1644. this.state = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
  1645. } else if (cp === $.QUOTATION_MARK) {
  1646. this._err(ERR.missingWhitespaceAfterDoctypeSystemKeyword);
  1647. this.currentToken.systemId = '';
  1648. this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
  1649. } else if (cp === $.APOSTROPHE) {
  1650. this._err(ERR.missingWhitespaceAfterDoctypeSystemKeyword);
  1651. this.currentToken.systemId = '';
  1652. this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
  1653. } else if (cp === $.GREATER_THAN_SIGN) {
  1654. this._err(ERR.missingDoctypeSystemIdentifier);
  1655. this.currentToken.forceQuirks = true;
  1656. this.state = DATA_STATE;
  1657. this._emitCurrentToken();
  1658. } else if (cp === $.EOF) {
  1659. this._err(ERR.eofInDoctype);
  1660. this.currentToken.forceQuirks = true;
  1661. this._emitCurrentToken();
  1662. this._emitEOFToken();
  1663. } else {
  1664. this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
  1665. this.currentToken.forceQuirks = true;
  1666. this._reconsumeInState(BOGUS_DOCTYPE_STATE);
  1667. }
  1668. }
  1669. // Before DOCTYPE system identifier state
  1670. //------------------------------------------------------------------
  1671. [BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE](cp) {
  1672. if (isWhitespace(cp)) {
  1673. return;
  1674. }
  1675. if (cp === $.QUOTATION_MARK) {
  1676. this.currentToken.systemId = '';
  1677. this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
  1678. } else if (cp === $.APOSTROPHE) {
  1679. this.currentToken.systemId = '';
  1680. this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
  1681. } else if (cp === $.GREATER_THAN_SIGN) {
  1682. this._err(ERR.missingDoctypeSystemIdentifier);
  1683. this.currentToken.forceQuirks = true;
  1684. this.state = DATA_STATE;
  1685. this._emitCurrentToken();
  1686. } else if (cp === $.EOF) {
  1687. this._err(ERR.eofInDoctype);
  1688. this.currentToken.forceQuirks = true;
  1689. this._emitCurrentToken();
  1690. this._emitEOFToken();
  1691. } else {
  1692. this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
  1693. this.currentToken.forceQuirks = true;
  1694. this._reconsumeInState(BOGUS_DOCTYPE_STATE);
  1695. }
  1696. }
  1697. // DOCTYPE system identifier (double-quoted) state
  1698. //------------------------------------------------------------------
  1699. [DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE](cp) {
  1700. if (cp === $.QUOTATION_MARK) {
  1701. this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
  1702. } else if (cp === $.NULL) {
  1703. this._err(ERR.unexpectedNullCharacter);
  1704. this.currentToken.systemId += unicode.REPLACEMENT_CHARACTER;
  1705. } else if (cp === $.GREATER_THAN_SIGN) {
  1706. this._err(ERR.abruptDoctypeSystemIdentifier);
  1707. this.currentToken.forceQuirks = true;
  1708. this._emitCurrentToken();
  1709. this.state = DATA_STATE;
  1710. } else if (cp === $.EOF) {
  1711. this._err(ERR.eofInDoctype);
  1712. this.currentToken.forceQuirks = true;
  1713. this._emitCurrentToken();
  1714. this._emitEOFToken();
  1715. } else {
  1716. this.currentToken.systemId += toChar(cp);
  1717. }
  1718. }
  1719. // DOCTYPE system identifier (single-quoted) state
  1720. //------------------------------------------------------------------
  1721. [DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE](cp) {
  1722. if (cp === $.APOSTROPHE) {
  1723. this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
  1724. } else if (cp === $.NULL) {
  1725. this._err(ERR.unexpectedNullCharacter);
  1726. this.currentToken.systemId += unicode.REPLACEMENT_CHARACTER;
  1727. } else if (cp === $.GREATER_THAN_SIGN) {
  1728. this._err(ERR.abruptDoctypeSystemIdentifier);
  1729. this.currentToken.forceQuirks = true;
  1730. this._emitCurrentToken();
  1731. this.state = DATA_STATE;
  1732. } else if (cp === $.EOF) {
  1733. this._err(ERR.eofInDoctype);
  1734. this.currentToken.forceQuirks = true;
  1735. this._emitCurrentToken();
  1736. this._emitEOFToken();
  1737. } else {
  1738. this.currentToken.systemId += toChar(cp);
  1739. }
  1740. }
  1741. // After DOCTYPE system identifier state
  1742. //------------------------------------------------------------------
  1743. [AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE](cp) {
  1744. if (isWhitespace(cp)) {
  1745. return;
  1746. }
  1747. if (cp === $.GREATER_THAN_SIGN) {
  1748. this._emitCurrentToken();
  1749. this.state = DATA_STATE;
  1750. } else if (cp === $.EOF) {
  1751. this._err(ERR.eofInDoctype);
  1752. this.currentToken.forceQuirks = true;
  1753. this._emitCurrentToken();
  1754. this._emitEOFToken();
  1755. } else {
  1756. this._err(ERR.unexpectedCharacterAfterDoctypeSystemIdentifier);
  1757. this._reconsumeInState(BOGUS_DOCTYPE_STATE);
  1758. }
  1759. }
  1760. // Bogus DOCTYPE state
  1761. //------------------------------------------------------------------
  1762. [BOGUS_DOCTYPE_STATE](cp) {
  1763. if (cp === $.GREATER_THAN_SIGN) {
  1764. this._emitCurrentToken();
  1765. this.state = DATA_STATE;
  1766. } else if (cp === $.NULL) {
  1767. this._err(ERR.unexpectedNullCharacter);
  1768. } else if (cp === $.EOF) {
  1769. this._emitCurrentToken();
  1770. this._emitEOFToken();
  1771. }
  1772. }
  1773. // CDATA section state
  1774. //------------------------------------------------------------------
  1775. [CDATA_SECTION_STATE](cp) {
  1776. if (cp === $.RIGHT_SQUARE_BRACKET) {
  1777. this.state = CDATA_SECTION_BRACKET_STATE;
  1778. } else if (cp === $.EOF) {
  1779. this._err(ERR.eofInCdata);
  1780. this._emitEOFToken();
  1781. } else {
  1782. this._emitCodePoint(cp);
  1783. }
  1784. }
  1785. // CDATA section bracket state
  1786. //------------------------------------------------------------------
  1787. [CDATA_SECTION_BRACKET_STATE](cp) {
  1788. if (cp === $.RIGHT_SQUARE_BRACKET) {
  1789. this.state = CDATA_SECTION_END_STATE;
  1790. } else {
  1791. this._emitChars(']');
  1792. this._reconsumeInState(CDATA_SECTION_STATE);
  1793. }
  1794. }
  1795. // CDATA section end state
  1796. //------------------------------------------------------------------
  1797. [CDATA_SECTION_END_STATE](cp) {
  1798. if (cp === $.GREATER_THAN_SIGN) {
  1799. this.state = DATA_STATE;
  1800. } else if (cp === $.RIGHT_SQUARE_BRACKET) {
  1801. this._emitChars(']');
  1802. } else {
  1803. this._emitChars(']]');
  1804. this._reconsumeInState(CDATA_SECTION_STATE);
  1805. }
  1806. }
  1807. // Character reference state
  1808. //------------------------------------------------------------------
  1809. [CHARACTER_REFERENCE_STATE](cp) {
  1810. this.tempBuff = [$.AMPERSAND];
  1811. if (cp === $.NUMBER_SIGN) {
  1812. this.tempBuff.push(cp);
  1813. this.state = NUMERIC_CHARACTER_REFERENCE_STATE;
  1814. } else if (isAsciiAlphaNumeric(cp)) {
  1815. this._reconsumeInState(NAMED_CHARACTER_REFERENCE_STATE);
  1816. } else {
  1817. this._flushCodePointsConsumedAsCharacterReference();
  1818. this._reconsumeInState(this.returnState);
  1819. }
  1820. }
  1821. // Named character reference state
  1822. //------------------------------------------------------------------
  1823. [NAMED_CHARACTER_REFERENCE_STATE](cp) {
  1824. const matchResult = this._matchNamedCharacterReference(cp);
  1825. //NOTE: matching can be abrupted by hibernation. In that case match
  1826. //results are no longer valid and we will need to start over.
  1827. if (this._ensureHibernation()) {
  1828. this.tempBuff = [$.AMPERSAND];
  1829. } else if (matchResult) {
  1830. const withSemicolon = this.tempBuff[this.tempBuff.length - 1] === $.SEMICOLON;
  1831. if (!this._isCharacterReferenceAttributeQuirk(withSemicolon)) {
  1832. if (!withSemicolon) {
  1833. this._errOnNextCodePoint(ERR.missingSemicolonAfterCharacterReference);
  1834. }
  1835. this.tempBuff = matchResult;
  1836. }
  1837. this._flushCodePointsConsumedAsCharacterReference();
  1838. this.state = this.returnState;
  1839. } else {
  1840. this._flushCodePointsConsumedAsCharacterReference();
  1841. this.state = AMBIGUOUS_AMPERSAND_STATE;
  1842. }
  1843. }
  1844. // Ambiguous ampersand state
  1845. //------------------------------------------------------------------
  1846. [AMBIGUOUS_AMPERSAND_STATE](cp) {
  1847. if (isAsciiAlphaNumeric(cp)) {
  1848. if (this._isCharacterReferenceInAttribute()) {
  1849. this.currentAttr.value += toChar(cp);
  1850. } else {
  1851. this._emitCodePoint(cp);
  1852. }
  1853. } else {
  1854. if (cp === $.SEMICOLON) {
  1855. this._err(ERR.unknownNamedCharacterReference);
  1856. }
  1857. this._reconsumeInState(this.returnState);
  1858. }
  1859. }
  1860. // Numeric character reference state
  1861. //------------------------------------------------------------------
  1862. [NUMERIC_CHARACTER_REFERENCE_STATE](cp) {
  1863. this.charRefCode = 0;
  1864. if (cp === $.LATIN_SMALL_X || cp === $.LATIN_CAPITAL_X) {
  1865. this.tempBuff.push(cp);
  1866. this.state = HEXADEMICAL_CHARACTER_REFERENCE_START_STATE;
  1867. } else {
  1868. this._reconsumeInState(DECIMAL_CHARACTER_REFERENCE_START_STATE);
  1869. }
  1870. }
  1871. // Hexadecimal character reference start state
  1872. //------------------------------------------------------------------
  1873. [HEXADEMICAL_CHARACTER_REFERENCE_START_STATE](cp) {
  1874. if (isAsciiHexDigit(cp)) {
  1875. this._reconsumeInState(HEXADEMICAL_CHARACTER_REFERENCE_STATE);
  1876. } else {
  1877. this._err(ERR.absenceOfDigitsInNumericCharacterReference);
  1878. this._flushCodePointsConsumedAsCharacterReference();
  1879. this._reconsumeInState(this.returnState);
  1880. }
  1881. }
  1882. // Decimal character reference start state
  1883. //------------------------------------------------------------------
  1884. [DECIMAL_CHARACTER_REFERENCE_START_STATE](cp) {
  1885. if (isAsciiDigit(cp)) {
  1886. this._reconsumeInState(DECIMAL_CHARACTER_REFERENCE_STATE);
  1887. } else {
  1888. this._err(ERR.absenceOfDigitsInNumericCharacterReference);
  1889. this._flushCodePointsConsumedAsCharacterReference();
  1890. this._reconsumeInState(this.returnState);
  1891. }
  1892. }
  1893. // Hexadecimal character reference state
  1894. //------------------------------------------------------------------
  1895. [HEXADEMICAL_CHARACTER_REFERENCE_STATE](cp) {
  1896. if (isAsciiUpperHexDigit(cp)) {
  1897. this.charRefCode = this.charRefCode * 16 + cp - 0x37;
  1898. } else if (isAsciiLowerHexDigit(cp)) {
  1899. this.charRefCode = this.charRefCode * 16 + cp - 0x57;
  1900. } else if (isAsciiDigit(cp)) {
  1901. this.charRefCode = this.charRefCode * 16 + cp - 0x30;
  1902. } else if (cp === $.SEMICOLON) {
  1903. this.state = NUMERIC_CHARACTER_REFERENCE_END_STATE;
  1904. } else {
  1905. this._err(ERR.missingSemicolonAfterCharacterReference);
  1906. this._reconsumeInState(NUMERIC_CHARACTER_REFERENCE_END_STATE);
  1907. }
  1908. }
  1909. // Decimal character reference state
  1910. //------------------------------------------------------------------
  1911. [DECIMAL_CHARACTER_REFERENCE_STATE](cp) {
  1912. if (isAsciiDigit(cp)) {
  1913. this.charRefCode = this.charRefCode * 10 + cp - 0x30;
  1914. } else if (cp === $.SEMICOLON) {
  1915. this.state = NUMERIC_CHARACTER_REFERENCE_END_STATE;
  1916. } else {
  1917. this._err(ERR.missingSemicolonAfterCharacterReference);
  1918. this._reconsumeInState(NUMERIC_CHARACTER_REFERENCE_END_STATE);
  1919. }
  1920. }
  1921. // Numeric character reference end state
  1922. //------------------------------------------------------------------
  1923. [NUMERIC_CHARACTER_REFERENCE_END_STATE]() {
  1924. if (this.charRefCode === $.NULL) {
  1925. this._err(ERR.nullCharacterReference);
  1926. this.charRefCode = $.REPLACEMENT_CHARACTER;
  1927. } else if (this.charRefCode > 0x10ffff) {
  1928. this._err(ERR.characterReferenceOutsideUnicodeRange);
  1929. this.charRefCode = $.REPLACEMENT_CHARACTER;
  1930. } else if (unicode.isSurrogate(this.charRefCode)) {
  1931. this._err(ERR.surrogateCharacterReference);
  1932. this.charRefCode = $.REPLACEMENT_CHARACTER;
  1933. } else if (unicode.isUndefinedCodePoint(this.charRefCode)) {
  1934. this._err(ERR.noncharacterCharacterReference);
  1935. } else if (unicode.isControlCodePoint(this.charRefCode) || this.charRefCode === $.CARRIAGE_RETURN) {
  1936. this._err(ERR.controlCharacterReference);
  1937. const replacement = C1_CONTROLS_REFERENCE_REPLACEMENTS[this.charRefCode];
  1938. if (replacement) {
  1939. this.charRefCode = replacement;
  1940. }
  1941. }
  1942. this.tempBuff = [this.charRefCode];
  1943. this._flushCodePointsConsumedAsCharacterReference();
  1944. this._reconsumeInState(this.returnState);
  1945. }
  1946. }
  1947. //Token types
  // Static token type tags, each equal to its own name, followed by MODE.
  Object.assign(Tokenizer, {
    CHARACTER_TOKEN: 'CHARACTER_TOKEN',
    NULL_CHARACTER_TOKEN: 'NULL_CHARACTER_TOKEN',
    WHITESPACE_CHARACTER_TOKEN: 'WHITESPACE_CHARACTER_TOKEN',
    START_TAG_TOKEN: 'START_TAG_TOKEN',
    END_TAG_TOKEN: 'END_TAG_TOKEN',
    COMMENT_TOKEN: 'COMMENT_TOKEN',
    DOCTYPE_TOKEN: 'DOCTYPE_TOKEN',
    EOF_TOKEN: 'EOF_TOKEN',
    HIBERNATION_TOKEN: 'HIBERNATION_TOKEN',

    //Tokenizer initial states for different modes
    MODE: {
      DATA: DATA_STATE,
      RCDATA: RCDATA_STATE,
      RAWTEXT: RAWTEXT_STATE,
      SCRIPT_DATA: SCRIPT_DATA_STATE,
      PLAINTEXT: PLAINTEXT_STATE
    }
  });
  1965. //Static
  // Returns the value of the attribute named `attrName` on `token`, or null
  // when there is none. The search runs from the end of `token.attrs`, so
  // when a name occurs more than once the last occurrence wins.
  Tokenizer.getTokenAttr = function(token, attrName) {
    const { attrs } = token;
    let idx = attrs.length;

    while (idx-- > 0) {
      const attr = attrs[idx];

      if (attr.name === attrName) {
        return attr.value;
      }
    }

    return null;
  };

  module.exports = Tokenizer;