preprocessor.js 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. 'use strict';
  2. const unicode = require('../common/unicode');
  3. const ERR = require('../common/error-codes');
  4. //Aliases
  5. const $ = unicode.CODE_POINTS;
  6. //Const
  7. const DEFAULT_BUFFER_WATERLINE = 1 << 16;
  8. //Preprocessor
  9. //NOTE: HTML input preprocessing
  10. //(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream)
  /**
   * Input-stream preprocessor for HTML text that arrives in chunks.
   *
   * Every `advance()` call moves the cursor forward and yields one code point:
   *  - a CR, or a CR immediately followed by LF, is reported as a single LF;
   *  - a surrogate followed by a unit accepted by `unicode.isSurrogatePair` is
   *    reported as one combined code point;
   *  - `$.EOF` is returned once the buffered text is exhausted.
   *
   * UTF-16 units that were swallowed (the LF of a CR LF, the second half of a
   * surrogate pair) are remembered as "gaps" so that `retreat()` can step back
   * over them. Problematic input is reported through `_err()`.
   */
  class Preprocessor {
      constructor() {
          //NOTE: buffered input; stays null until the first write()
          this.html = null;

          //NOTE: index of the most recently consumed UTF-16 unit (-1: nothing consumed)
          this.pos = -1;

          //NOTE: index of the most recent gap (-1: none); older gaps live in gapStack
          this.lastGapPos = -1;

          //NOTE: index of the last buffered UTF-16 unit
          this.lastCharPos = -1;
          this.gapStack = [];

          //NOTE: set after a CR so that an immediately following LF gets dropped
          this.skipNextNewLine = false;
          this.lastChunkWritten = false;
          this.endOfChunkHit = false;
          this.bufferWaterline = DEFAULT_BUFFER_WATERLINE;
      }

      /**
       * Error reporting hook.
       * NOTE: no-op by default. Enabled by mixin.
       */
      _err() {}

      /**
       * Marks the current position as a gap, keeping the previously recorded gap
       * position on the stack so it can be restored by `retreat()`.
       */
      _addGap() {
          const previousGapPos = this.lastGapPos;

          this.lastGapPos = this.pos;
          this.gapStack.push(previousGapPos);
      }

      /**
       * Resolves a surrogate found at the current position.
       *
       * @param {number} cp - UTF-16 unit at `this.pos` that `unicode.isSurrogate` accepted.
       * @returns {number} The combined code point when the following unit completes
       *     a pair; `$.EOF` when the surrogate is the last buffered unit and more
       *     chunks may still arrive; otherwise `cp` itself, after reporting
       *     `ERR.surrogateInInputStream`.
       */
      _processSurrogate(cp) {
          const isLastBufferedUnit = this.pos === this.lastCharPos;

          if (isLastBufferedUnit) {
              //NOTE: the other half may arrive with the next chunk, so we can't decide yet.
              if (!this.lastChunkWritten) {
                  this.endOfChunkHit = true;
                  return $.EOF;
              }
          } else {
              //NOTE: peek at the following unit to see whether it completes a pair
              const followingUnit = this.html.charCodeAt(this.pos + 1);

              if (unicode.isSurrogatePair(followingUnit)) {
                  //NOTE: consume the second half and record it as a gap to be
                  //skipped during retreat.
                  this.pos += 1;
                  this._addGap();

                  return unicode.getSurrogatePairCodePoint(cp, followingUnit);
              }
          }

          //NOTE: isolated surrogate
          this._err(ERR.surrogateInInputStream);

          return cp;
      }

      /**
       * Discards the text that precedes the current position once the cursor has
       * moved past `bufferWaterline`. Positions are rebased so that the current
       * unit becomes index 0, and all recorded gaps are forgotten.
       */
      dropParsedChunk() {
          const consumed = this.pos;

          if (!(consumed > this.bufferWaterline)) {
              return;
          }

          this.lastCharPos -= consumed;
          this.html = this.html.substring(consumed);
          this.pos = 0;
          this.gapStack = [];
          this.lastGapPos = -1;
      }

      /**
       * Appends a chunk to the buffered input.
       *
       * @param {string} chunk - Text to append.
       * @param {boolean} isLastChunk - Whether no further chunks will follow.
       */
      write(chunk, isLastChunk) {
          this.html = this.html ? this.html + chunk : chunk;

          this.lastChunkWritten = isLastChunk;
          this.endOfChunkHit = false;
          this.lastCharPos = this.html.length - 1;
      }

      /**
       * Splices a chunk into the buffer right after the current position, so it
       * is the next text returned by `advance()`.
       *
       * @param {string} chunk - Text to insert.
       */
      insertHtmlAtCurrentPos(chunk) {
          const splitAt = this.pos + 1;
          const consumedPart = this.html.substring(0, splitAt);
          const pendingPart = this.html.substring(splitAt);

          this.html = consumedPart + chunk + pendingPart;
          this.endOfChunkHit = false;
          this.lastCharPos = this.html.length - 1;
      }

      /**
       * Moves to the next position and returns the code point found there.
       *
       * @returns {number} The next code point, or `$.EOF` when the buffer is
       *     exhausted; in that case `endOfChunkHit` is set when more chunks may
       *     still be written.
       */
      advance() {
          this.pos += 1;

          if (this.pos > this.lastCharPos) {
              this.endOfChunkHit = !this.lastChunkWritten;
              return $.EOF;
          }

          const unit = this.html.charCodeAt(this.pos);

          //NOTE: a U+000A LINE FEED (LF) that immediately follows a U+000D CARRIAGE RETURN (CR)
          //must be ignored: the CR has already been reported as LF.
          if (unit === $.LINE_FEED && this.skipNextNewLine) {
              this.skipNextNewLine = false;
              this._addGap();

              return this.advance();
          }

          //NOTE: every U+000D CARRIAGE RETURN (CR) is converted to U+000A LINE FEED (LF)
          if (unit === $.CARRIAGE_RETURN) {
              this.skipNextNewLine = true;
              return $.LINE_FEED;
          }

          this.skipNextNewLine = false;

          const cp = unicode.isSurrogate(unit) ? this._processSurrogate(unit) : unit;

          //OPTIMIZATION: the common ranges (printable ASCII, line breaks, a big chunk
          //of the BMP) are accepted right away; only the remaining code points go
          //through the more expensive detailed validation.
          const isPrintableAscii = cp > 0x1f && cp < 0x7f;
          const isLineBreak = cp === $.LINE_FEED || cp === $.CARRIAGE_RETURN;
          const isOrdinaryBmp = cp > 0x9f && cp < 0xfdd0;

          if (!(isPrintableAscii || isLineBreak || isOrdinaryBmp)) {
              this._checkForProblematicCharacters(cp);
          }

          return cp;
      }

      /**
       * Reports `cp` through `_err()` when it is a control code point or, failing
       * that, an undefined code point (as classified by the `unicode` helpers).
       *
       * @param {number} cp - Code point to validate.
       */
      _checkForProblematicCharacters(cp) {
          if (unicode.isControlCodePoint(cp)) {
              this._err(ERR.controlCharacterInInputStream);
              return;
          }

          if (unicode.isUndefinedCodePoint(cp)) {
              this._err(ERR.noncharacterInInputStream);
          }
      }

      /**
       * Steps one position back. When the current position is a recorded gap, the
       * gap is popped and the cursor steps back one extra position to skip it.
       */
      retreat() {
          let distance = 1;

          if (this.pos === this.lastGapPos) {
              this.lastGapPos = this.gapStack.pop();
              distance = 2;
          }

          this.pos -= distance;
      }
  }
  123. module.exports = Preprocessor;