encode.js 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. "use strict";
  // Interop helper: modules flagged with `__esModule` are returned untouched,
  // anything else is wrapped so it can be read through a `default` property.
  // An already-installed `this.__importDefault` takes precedence when present.
  var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
      return mod;
    }
    return { "default": mod };
  };
  5. Object.defineProperty(exports, "__esModule", { value: true });
  6. exports.escapeUTF8 = exports.escape = exports.encodeNonAsciiHTML = exports.encodeHTML = exports.encodeXML = void 0;
  7. var xml_json_1 = __importDefault(require("./maps/xml.json"));
  8. var encode_trie_1 = require("./encode-trie");
  9. var entities_json_1 = __importDefault(require("./maps/entities.json"));
  // Matchers built once at load time. They carry the global flag, so their
  // `lastIndex` is shared state between calls.
  // - htmlReplacer: ASCII start characters of the HTML entity table plus \x80-\uFFFF.
  // - xmlReplacer: the XML entity characters plus \x80-\uFFFF.
  // - xmlInvalidChars: only the XML entity characters.
  var htmlReplacer = getCharRegExp(entities_json_1.default, true);
  var xmlReplacer = getCharRegExp(xml_json_1.default, true);
  var xmlInvalidChars = getCharRegExp(xml_json_1.default, false);
  // Lookup from the UTF-16 code unit of an XML entity's character to its
  // textual reference, i.e. "&" + entity name + ";".
  var xmlCodeMap = new Map();
  Object.keys(xml_json_1.default).forEach(function (name) {
    xmlCodeMap.set(xml_json_1.default[name].charCodeAt(0), "&" + name + ";");
  });
  /**
   * Encodes all non-ASCII characters, as well as characters not valid in XML
   * documents using XML entities.
   *
   * If a character has no equivalent entity, a
   * numeric hexadecimal reference (eg. `&#xfc;`) will be used.
   *
   * Characters outside the BMP (stored as a surrogate pair) are emitted as a
   * single reference. An unpaired surrogate is emitted on its own and the
   * character that follows it is processed normally.
   *
   * @param str String to encode.
   * @returns The encoded string.
   */
  function encodeXML(str) {
    var ret = "";
    var lastIdx = 0;
    var match;
    // `xmlReplacer` is a shared global regex; running `exec` until it
    // returns null leaves its `lastIndex` back at 0 for the next call.
    while ((match = xmlReplacer.exec(str)) !== null) {
      var i = match.index;
      var code = str.charCodeAt(i);
      var named = xmlCodeMap.get(code);
      if (named) {
        ret += str.substring(lastIdx, i) + named;
        lastIdx = i + 1;
      }
      else {
        // NOTE(review): assumes `getCodePoint` returns the full code point
        // for a surrogate pair starting at `i` (like `codePointAt`) — confirm.
        ret += str.substring(lastIdx, i) + "&#x" + encode_trie_1.getCodePoint(str, i).toString(16) + ";";
        // Skip the low surrogate only when a real pair was consumed: a high
        // surrogate (0xD800-0xDBFF) directly followed by a low surrogate
        // (0xDC00-0xDFFF). At the end of the string `charCodeAt` yields NaN,
        // which masks to 0 and therefore never counts as a low surrogate.
        var isPair = (code & 0xfc00) === 0xd800 &&
          (str.charCodeAt(i + 1) & 0xfc00) === 0xdc00;
        lastIdx = xmlReplacer.lastIndex += Number(isPair);
      }
    }
    return ret + str.substring(lastIdx);
  }
  44. exports.encodeXML = encodeXML;
  /**
   * Encodes all entities and non-ASCII characters in the input.
   *
   * ASCII characters that have a named entity are encoded too, even though
   * they are valid in HTML documents. For example `#` will be encoded as
   * `&num;`. Use `encodeNonAsciiHTML` for a more compact output.
   *
   * If a character has no equivalent entity, a
   * numeric hexadecimal reference (eg. `&#xfc;`) will be used.
   *
   * @param data String to encode.
   * @returns The result of running the trie encoder with `htmlReplacer`.
   */
  function encodeHTML(data) {
    var encoded = encode_trie_1.encodeHTMLTrieRe(htmlReplacer, data);
    return encoded;
  }
  58. exports.encodeHTML = encodeHTML;
  /**
   * Encodes all non-ASCII characters, as well as characters not valid in HTML
   * documents using HTML entities.
   *
   * Only the characters matched by `xmlReplacer` are handed to the encoder,
   * so other ASCII characters are left as they are.
   *
   * If a character has no equivalent entity, a
   * numeric hexadecimal reference (eg. `&#xfc;`) will be used.
   *
   * @param data String to encode.
   * @returns The result of running the trie encoder with `xmlReplacer`.
   */
  function encodeNonAsciiHTML(data) {
    var encoded = encode_trie_1.encodeHTMLTrieRe(xmlReplacer, data);
    return encoded;
  }
  69. exports.encodeNonAsciiHTML = encodeNonAsciiHTML;
  /**
   * Builds a global character-class regex from the first character of every
   * value in `map`.
   *
   * Each character is written backslash-escaped. Runs of three or more
   * consecutive code units are collapsed into a `start-end` range.
   *
   * @param map Object whose values are strings; only their first character is used.
   * @param nonAscii When truthy, only start characters below 128 are listed
   *   individually and the range `\x80-\uFFFF` is appended to the class.
   * @returns A new `RegExp` with the `g` flag.
   */
  function getCharRegExp(map, nonAscii) {
    // Every entry has the form "\" + char, so index 1 holds the character itself.
    var codeOf = function (escaped) { return escaped.charCodeAt(1); };
    var names = Object.keys(map);
    var escapedStarts = [];
    for (var n = 0; n < names.length; n++) {
      var candidate = "\\" + map[names[n]].charAt(0);
      if (!nonAscii || codeOf(candidate) < 128) {
        escapedStarts.push(candidate);
      }
    }
    escapedStarts.sort(function (left, right) { return codeOf(left) - codeOf(right); });
    // After sorting, duplicates are adjacent; keep the last one of each group.
    var chars = escapedStarts.filter(function (item, idx, all) { return item !== all[idx + 1]; });
    // Collapse runs of consecutive characters into ranges.
    var start = 0;
    while (start < chars.length - 1) {
      var end = start;
      while (end + 1 < chars.length &&
        codeOf(chars[end]) + 1 === codeOf(chars[end + 1])) {
        end++;
      }
      var runLength = end - start + 1;
      // A range is only worth it for three or more characters.
      if (runLength >= 3) {
        chars.splice(start, runLength, chars[start] + "-" + chars[end]);
      }
      start++;
    }
    var highRange = nonAscii ? "\\x80-\\uFFFF" : "";
    return new RegExp("[" + chars.join("") + highRange + "]", "g");
  }
  94. /**
  95. * Encodes all non-ASCII characters, as well as characters not valid in XML
  96. * documents using numeric hexadecimal reference (eg. `&#xfc;`).
  97. *
  98. * Have a look at `escapeUTF8` if you want a more concise output at the expense
  99. * of reduced transportability.
  100. *
  101. * @param data String to escape.
  102. */
  103. exports.escape = encodeXML;
  /**
   * Encodes all characters not valid in XML documents using XML entities.
   *
   * Non-ASCII characters are left untouched, so the output is
   * character-set dependent.
   *
   * @param data String to escape.
   * @returns The escaped string.
   */
  function escapeUTF8(data) {
    var escaped = "";
    var cursor = 0;
    // `xmlInvalidChars` is a shared global regex; looping until `exec`
    // returns null resets its `lastIndex` for the next call.
    for (var hit = xmlInvalidChars.exec(data); hit; hit = xmlInvalidChars.exec(data)) {
      var at = hit.index;
      // Copy the untouched text before the match (empty when cursor === at),
      // then the entity. The regex is built from the same table as
      // `xmlCodeMap`, so the lookup is expected to succeed.
      escaped += data.substring(cursor, at) + xmlCodeMap.get(hit[0].charCodeAt(0));
      // Every match is exactly one character long.
      cursor = at + 1;
    }
    return escaped + data.substring(cursor);
  }
  126. exports.escapeUTF8 = escapeUTF8;