xml_parser.js 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512
  /* Copyright 2018 Mozilla Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  // The code for XMLParserBase was copied from
  // https://github.com/mozilla/shumway/blob/16451d8836fa85f4b16eeda8b4bda2fa9e2b22b0/src/avm2/natives/xml.ts
  17. import { encodeToXmlString } from "./core_utils.js";
  /**
   * Status codes used by the XML parser when reporting a failure.
   *
   * `NoError` is zero; every failure code is a distinct negative integer.
   * Member names are part of the module's public interface and are kept
   * verbatim (including the historical spelling `UnterminatedCdat`).
   */
  const XMLParserErrorCode = {
    NoError: 0,
    EndOfDocument: -1,
    UnterminatedCdat: -2,
    UnterminatedXmlDeclaration: -3,
    UnterminatedDoctypeDeclaration: -4,
    UnterminatedComment: -5,
    MalformedElement: -6,
    OutOfMemory: -7,
    UnterminatedAttributeValue: -8,
    UnterminatedElement: -9,
    ElementNeverBegun: -10,
  };
  /**
   * Tells whether the character at `index` in `s` is one of the four
   * whitespace characters recognised by the parser: space, line feed,
   * carriage return or tab.
   *
   * @param {string} s - The string to inspect.
   * @param {number} index - Position of the character to test.
   * @returns {boolean} `true` for whitespace; `false` otherwise, including
   *   when `index` is outside the string (`s[index]` is then `undefined`).
   */
  function isWhitespace(s, index) {
    const ch = s[index];
    return ch === " " || ch === "\n" || ch === "\r" || ch === "\t";
  }
  /**
   * Tells whether `s` consists solely of whitespace characters, as defined
   * by `isWhitespace`.
   *
   * @param {string} s - The string to inspect.
   * @returns {boolean} `true` when every character is whitespace; the empty
   *   string therefore yields `true`.
   */
  function isWhitespaceString(s) {
    for (let i = 0, ii = s.length; i < ii; i++) {
      if (!isWhitespace(s, i)) {
        return false;
      }
    }
    return true;
  }
  /**
   * Minimal event-driven XML tokenizer.
   *
   * `parseXml` scans a string once and reports what it finds through the
   * overridable `on*` hooks (text, elements, comments, CDATA, doctype,
   * processing instructions, errors). The base class hooks do nothing, so
   * subclasses override only the events they care about.
   */
  class XMLParserBase {
    /**
     * Replaces `&...;` references in `s`.
     *
     * Numeric references (`&#NN;`, `&#xHH;`) become the matching character,
     * the five predefined XML entities become their character, and anything
     * else is delegated to `onResolveEntity`.
     *
     * @param {string} s - Raw text or attribute value.
     * @returns {string} The text with references resolved.
     */
    _resolveEntities(s) {
      return s.replace(/&([^;]+);/g, (all, entity) => {
        if (entity.startsWith("#")) {
          const codePoint =
            entity[1] === "x"
              ? parseInt(entity.substring(2), 16)
              : parseInt(entity.substring(1), 10);
          // String.fromCodePoint throws a RangeError for NaN, negative or
          // too-large values; a malformed reference must not abort parsing,
          // so it is left in the output verbatim instead.
          if (
            Number.isInteger(codePoint) &&
            codePoint >= 0 &&
            codePoint <= 0x10ffff
          ) {
            return String.fromCodePoint(codePoint);
          }
          return all;
        }
        switch (entity) {
          case "lt":
            return "<";
          case "gt":
            return ">";
          case "amp":
            return "&";
          case "quot":
            return '"';
          case "apos":
            return "'";
        }
        return this.onResolveEntity(entity);
      });
    }

    /**
     * Parses the inside of a start tag: the element name followed by zero
     * or more `name="value"` / `name='value'` attributes.
     *
     * @param {string} s - The whole document.
     * @param {number} start - Index of the first character of the name.
     * @returns {{name: string, attributes: Array<{name: string,
     *   value: string}>, parsed: number} | null} The tag description, where
     *   `parsed` is the number of characters consumed from `start` (parsing
     *   stops at `>`, `/`, `?` or the end of `s`); `null` when an attribute
     *   lacks `=`, an opening quote or a closing quote.
     */
    _parseContent(s, start) {
      const attributes = [];
      let pos = start;

      function skipWs() {
        while (pos < s.length && isWhitespace(s, pos)) {
          ++pos;
        }
      }

      while (
        pos < s.length &&
        !isWhitespace(s, pos) &&
        s[pos] !== ">" &&
        s[pos] !== "/"
      ) {
        ++pos;
      }
      const name = s.substring(start, pos);
      skipWs();
      while (
        pos < s.length &&
        s[pos] !== ">" &&
        s[pos] !== "/" &&
        s[pos] !== "?"
      ) {
        skipWs();
        let attrName = "";
        while (pos < s.length && !isWhitespace(s, pos) && s[pos] !== "=") {
          attrName += s[pos];
          ++pos;
        }
        skipWs();
        if (s[pos] !== "=") {
          return null;
        }
        ++pos;
        skipWs();
        // The value must be closed by the same quote character that opens it.
        const attrEndChar = s[pos];
        if (attrEndChar !== '"' && attrEndChar !== "'") {
          return null;
        }
        const attrEndIndex = s.indexOf(attrEndChar, ++pos);
        if (attrEndIndex < 0) {
          return null;
        }
        const attrValue = s.substring(pos, attrEndIndex);
        attributes.push({
          name: attrName,
          value: this._resolveEntities(attrValue),
        });
        pos = attrEndIndex + 1;
        skipWs();
      }
      return {
        name,
        attributes,
        parsed: pos - start,
      };
    }

    /**
     * Parses the inside of a processing instruction (`<?name value?>`).
     *
     * @param {string} s - The whole document.
     * @param {number} start - Index of the first character after `<?`.
     * @returns {{name: string, value: string, parsed: number}} The target
     *   name, the raw text up to (not including) `?>` or the end of `s`, and
     *   the number of characters consumed from `start`.
     */
    _parseProcessingInstruction(s, start) {
      let pos = start;

      function skipWs() {
        while (pos < s.length && isWhitespace(s, pos)) {
          ++pos;
        }
      }

      while (
        pos < s.length &&
        !isWhitespace(s, pos) &&
        s[pos] !== ">" &&
        s[pos] !== "?" &&
        s[pos] !== "/"
      ) {
        ++pos;
      }
      const name = s.substring(start, pos);
      skipWs();
      const attrStart = pos;
      while (pos < s.length && (s[pos] !== "?" || s[pos + 1] !== ">")) {
        ++pos;
      }
      const value = s.substring(attrStart, pos);
      return {
        name,
        value,
        parsed: pos - start,
      };
    }

    /**
     * Scans `s` from start to end and fires the `on*` hooks in document
     * order. On the first syntax problem `onError` is called with an
     * `XMLParserErrorCode` value and scanning stops.
     *
     * @param {string} s - The XML document.
     */
    parseXml(s) {
      let i = 0;
      while (i < s.length) {
        const ch = s[i];
        let j = i;
        if (ch === "<") {
          ++j;
          const ch2 = s[j];
          let q;
          switch (ch2) {
            case "/": {
              // End tag: </name>
              ++j;
              q = s.indexOf(">", j);
              if (q < 0) {
                this.onError(XMLParserErrorCode.UnterminatedElement);
                return;
              }
              this.onEndElement(s.substring(j, q));
              j = q + 1;
              break;
            }
            case "?": {
              // Processing instruction: <?name value?>
              ++j;
              const pi = this._parseProcessingInstruction(s, j);
              if (s.substring(j + pi.parsed, j + pi.parsed + 2) !== "?>") {
                this.onError(XMLParserErrorCode.UnterminatedXmlDeclaration);
                return;
              }
              this.onPi(pi.name, pi.value);
              j += pi.parsed + 2;
              break;
            }
            case "!": {
              if (s.substring(j + 1, j + 3) === "--") {
                // Comment: <!-- ... -->
                q = s.indexOf("-->", j + 3);
                if (q < 0) {
                  this.onError(XMLParserErrorCode.UnterminatedComment);
                  return;
                }
                this.onComment(s.substring(j + 3, q));
                j = q + 3;
              } else if (s.substring(j + 1, j + 8) === "[CDATA[") {
                // CDATA section: <![CDATA[ ... ]]>
                q = s.indexOf("]]>", j + 8);
                if (q < 0) {
                  this.onError(XMLParserErrorCode.UnterminatedCdat);
                  return;
                }
                this.onCdata(s.substring(j + 8, q));
                j = q + 3;
              } else if (s.substring(j + 1, j + 8) === "DOCTYPE") {
                const q2 = s.indexOf("[", j + 8);
                let complexDoctype = false;
                q = s.indexOf(">", j + 8);
                if (q < 0) {
                  this.onError(
                    XMLParserErrorCode.UnterminatedDoctypeDeclaration
                  );
                  return;
                }
                // A "[" before the first ">" means an internal subset; the
                // declaration then ends at "]>" rather than at the first ">".
                if (q2 > 0 && q > q2) {
                  q = s.indexOf("]>", j + 8);
                  if (q < 0) {
                    this.onError(
                      XMLParserErrorCode.UnterminatedDoctypeDeclaration
                    );
                    return;
                  }
                  complexDoctype = true;
                }
                const doctypeContent = s.substring(
                  j + 8,
                  q + (complexDoctype ? 1 : 0)
                );
                this.onDoctype(doctypeContent);
                j = q + (complexDoctype ? 2 : 1);
              } else {
                this.onError(XMLParserErrorCode.MalformedElement);
                return;
              }
              break;
            }
            default: {
              // Start tag, possibly self-closing: <name attr="v"> or <name/>
              const content = this._parseContent(s, j);
              if (content === null) {
                this.onError(XMLParserErrorCode.MalformedElement);
                return;
              }
              let isClosed = false;
              if (
                s.substring(j + content.parsed, j + content.parsed + 2) === "/>"
              ) {
                isClosed = true;
              } else if (
                s.substring(j + content.parsed, j + content.parsed + 1) !== ">"
              ) {
                this.onError(XMLParserErrorCode.UnterminatedElement);
                return;
              }
              this.onBeginElement(content.name, content.attributes, isClosed);
              j += content.parsed + (isClosed ? 2 : 1);
              break;
            }
          }
        } else {
          // Character data: everything up to the next "<" or end of input.
          while (j < s.length && s[j] !== "<") {
            j++;
          }
          const text = s.substring(i, j);
          this.onText(this._resolveEntities(text));
        }
        i = j;
      }
    }

    /**
     * Hook for entities that are neither numeric nor predefined.
     * The default keeps the reference unchanged.
     *
     * @param {string} name - Entity name without `&` and `;`.
     * @returns {string} Replacement text.
     */
    onResolveEntity(name) {
      return `&${name};`;
    }

    /** Hook: processing instruction with its target `name` and raw `value`. */
    onPi(name, value) {}

    /** Hook: comment body, without the `<!--` / `-->` delimiters. */
    onComment(text) {}

    /** Hook: CDATA section body, without the delimiters. */
    onCdata(text) {}

    /** Hook: text following `<!DOCTYPE` up to the closing delimiter. */
    onDoctype(doctypeContent) {}

    /** Hook: character data with entity references already resolved. */
    onText(text) {}

    /**
     * Hook: start tag. `isEmpty` is `true` for a self-closing tag (`<a/>`),
     * in which case no matching `onEndElement` call follows.
     */
    onBeginElement(name, attributes, isEmpty) {}

    /** Hook: end tag; `name` is the raw text between `</` and `>`. */
    onEndElement(name) {}

    /** Hook: parsing stopped; `code` is an `XMLParserErrorCode` value. */
    onError(code) {}
  }
  /**
   * Lightweight DOM-like node.
   *
   * A node has a `nodeName`, an optional `nodeValue` and a non-enumerable,
   * writable `parentNode` (initially `null`). `childNodes` and `attributes`
   * are not created here; whoever builds the tree assigns them.
   */
  class SimpleDOMNode {
    /**
     * @param {string} nodeName - Element name, or "#text" for text nodes.
     * @param {string} [nodeValue] - Text carried by the node, if any.
     */
    constructor(nodeName, nodeValue) {
      this.nodeName = nodeName;
      this.nodeValue = nodeValue;
      Object.defineProperty(this, "parentNode", { value: null, writable: true });
    }

    /** First child, or `undefined` when there are no children. */
    get firstChild() {
      return this.childNodes?.[0];
    }

    /**
     * The node following this one in its parent's `childNodes`, or
     * `undefined` when there is none. A node without a parent (e.g. the
     * root) has no sibling, so `undefined` is returned instead of throwing.
     */
    get nextSibling() {
      const childNodes = this.parentNode?.childNodes;
      if (!childNodes) {
        return undefined;
      }
      const index = childNodes.indexOf(this);
      if (index === -1) {
        return undefined;
      }
      return childNodes[index + 1];
    }

    /**
     * Concatenated text of the subtree. A node without `childNodes` yields
     * its own `nodeValue` (or the empty string).
     */
    get textContent() {
      if (!this.childNodes) {
        return this.nodeValue || "";
      }
      return this.childNodes.map(child => child.textContent).join("");
    }

    /** The `childNodes` array, or a fresh empty array when there is none. */
    get children() {
      return this.childNodes || [];
    }

    /**
     * @returns {boolean} `true` when the node has at least one child.
     */
    hasChildNodes() {
      // `undefined > 0` is false, so a missing `childNodes` yields `false`.
      return this.childNodes?.length > 0;
    }

    /**
     * Search a node in the tree with the given path
     * foo.bar[nnn], i.e. find the nnn-th node named
     * bar under a node named foo.
     *
     * @param {Array} paths - an array of objects as
     *   returned by {parseXFAPath}.
     * @param {number} pos - the current position in
     *   the paths array.
     * @returns {SimpleDOMNode} The node corresponding
     *   to the path or null if not found.
     */
    searchNode(paths, pos) {
      if (pos >= paths.length) {
        return this;
      }

      const component = paths[pos];
      // Explicit depth-first traversal; each entry is [parent, childIndex].
      const stack = [];
      let node = this;

      while (true) {
        if (component.name === node.nodeName) {
          if (component.pos === 0) {
            const res = node.searchNode(paths, pos + 1);
            if (res !== null) {
              return res;
            }
          } else if (stack.length === 0) {
            return null;
          } else {
            const [parent] = stack.pop();
            let siblingPos = 0;
            for (const child of parent.childNodes) {
              if (component.name === child.nodeName) {
                if (siblingPos === component.pos) {
                  return child.searchNode(paths, pos + 1);
                }
                siblingPos++;
              }
            }
            // We didn't find the correct sibling
            // so just return the first found node
            return node.searchNode(paths, pos + 1);
          }
        }

        if (node.childNodes && node.childNodes.length !== 0) {
          // Descend into the first child.
          stack.push([node, 0]);
          node = node.childNodes[0];
        } else if (stack.length === 0) {
          return null;
        } else {
          // Leaf reached: climb until an ancestor still has an unvisited child.
          while (stack.length !== 0) {
            const [parent, currentPos] = stack.pop();
            const newPos = currentPos + 1;
            if (newPos < parent.childNodes.length) {
              stack.push([parent, newPos]);
              node = parent.childNodes[newPos];
              break;
            }
          }
          if (stack.length === 0) {
            return null;
          }
        }
      }
    }

    /**
     * Serializes the subtree as XML by appending string fragments to
     * `buffer`; text and attribute values go through `encodeToXmlString`.
     *
     * @param {Array<string>} buffer - Receives the output fragments.
     */
    dump(buffer) {
      if (this.nodeName === "#text") {
        buffer.push(encodeToXmlString(this.nodeValue));
        return;
      }

      buffer.push(`<${this.nodeName}`);
      if (this.attributes) {
        for (const attribute of this.attributes) {
          buffer.push(
            ` ${attribute.name}="${encodeToXmlString(attribute.value)}"`
          );
        }
      }
      if (this.hasChildNodes()) {
        buffer.push(">");
        for (const child of this.childNodes) {
          child.dump(buffer);
        }
        buffer.push(`</${this.nodeName}>`);
      } else if (this.nodeValue) {
        buffer.push(`>${encodeToXmlString(this.nodeValue)}</${this.nodeName}>`);
      } else {
        buffer.push("/>");
      }
    }
  }
  /**
   * XML parser that builds a tree of `SimpleDOMNode` objects from the
   * events emitted by `XMLParserBase`.
   */
  class SimpleXMLParser extends XMLParserBase {
    /**
     * @param {Object} [options]
     * @param {boolean} [options.hasAttributes=false] - Keep the parsed
     *   attributes on each element node.
     * @param {boolean} [options.lowerCaseName=false] - Lower-case element
     *   names before creating nodes.
     */
    constructor({ hasAttributes = false, lowerCaseName = false } = {}) {
      super();
      this._currentFragment = null;
      this._stack = null;
      this._errorCode = XMLParserErrorCode.NoError;
      this._hasAttributes = hasAttributes;
      this._lowerCaseName = lowerCaseName;
    }

    /**
     * Parses `data` and returns the resulting document.
     *
     * @param {string} data - The XML source.
     * @returns {{documentElement: SimpleDOMNode} | undefined} The document,
     *   or `undefined` when a parse error was reported or no root node was
     *   found.
     */
    parseFromString(data) {
      this._currentFragment = [];
      this._stack = [];
      this._errorCode = XMLParserErrorCode.NoError;

      this.parseXml(data);

      if (this._errorCode !== XMLParserErrorCode.NoError) {
        return undefined; // return undefined on error
      }

      // We should only have one root.
      const [documentElement] = this._currentFragment;
      if (!documentElement) {
        return undefined; // Return undefined if no root was found.
      }
      return { documentElement };
    }

    /** Adds a "#text" node for `text`, skipping whitespace-only runs. */
    onText(text) {
      if (isWhitespaceString(text)) {
        return;
      }
      const node = new SimpleDOMNode("#text", text);
      this._currentFragment.push(node);
    }

    /** Adds a "#text" node for a CDATA section (whitespace is kept). */
    onCdata(text) {
      const node = new SimpleDOMNode("#text", text);
      this._currentFragment.push(node);
    }

    /**
     * Creates an element node, appends it to the current fragment and,
     * unless the tag is self-closing, makes its `childNodes` the current
     * fragment so that following events land inside it.
     */
    onBeginElement(name, attributes, isEmpty) {
      if (this._lowerCaseName) {
        name = name.toLowerCase();
      }
      const node = new SimpleDOMNode(name);
      node.childNodes = [];
      if (this._hasAttributes) {
        node.attributes = attributes;
      }
      this._currentFragment.push(node);
      if (isEmpty) {
        return;
      }
      this._stack.push(this._currentFragment);
      this._currentFragment = node.childNodes;
    }

    /**
     * Closes the innermost open element: restores the enclosing fragment
     * and sets `parentNode` on each of the element's children. The tag
     * `name` is not compared with the element being closed.
     *
     * @returns {SimpleDOMNode | null} The closed element, or `null` when the
     *   restored fragment is empty (e.g. an end tag with no open element).
     */
    onEndElement(name) {
      this._currentFragment = this._stack.pop() || [];
      const lastElement = this._currentFragment.at(-1);
      if (!lastElement) {
        return null;
      }
      for (const childNode of lastElement.childNodes) {
        childNode.parentNode = lastElement;
      }
      return lastElement;
    }

    /** Records the error code so that `parseFromString` can report failure. */
    onError(code) {
      this._errorCode = code;
    }
  }
  466. export { SimpleDOMNode, SimpleXMLParser, XMLParserBase, XMLParserErrorCode };