metadata_parser.js 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. /* Copyright 2012 Mozilla Foundation
  2. *
  3. * Licensed under the Apache License, Version 2.0 (the "License");
  4. * you may not use this file except in compliance with the License.
  5. * You may obtain a copy of the License at
  6. *
  7. * http://www.apache.org/licenses/LICENSE-2.0
  8. *
  9. * Unless required by applicable law or agreed to in writing, software
  10. * distributed under the License is distributed on an "AS IS" BASIS,
  11. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. * See the License for the specific language governing permissions and
  13. * limitations under the License.
  14. */
  15. import { SimpleXMLParser } from "./xml_parser.js";
  /**
   * Parses an XML/RDF metadata string into a flat `Map` of lower-cased node
   * names to their (trimmed) text content. The `dc:creator` and `dc:subject`
   * entries are stored as arrays of strings, everything else as a string.
   */
  class MetadataParser {
    /**
     * @param {string} data - The metadata string to parse.
     */
    constructor(data) {
      // Ghostscript may produce invalid metadata, so try to repair that first.
      const repaired = this._repair(data);

      // Convert the string to an XML document.
      const parser = new SimpleXMLParser({ lowerCaseName: true });
      const xmlDocument = parser.parseFromString(repaired);

      this._metadataMap = new Map();
      // Note that it is the *repaired* string which is kept and exposed.
      this._data = repaired;

      if (xmlDocument) {
        this._parse(xmlDocument);
      }
    }

    /**
     * Strips leading junk and rewrites text content that was written as an
     * octal-escaped, big-endian 16-bit string prefixed with `\376\377`
     * (i.e. the bytes 0xFE 0xFF) into plain characters and XML numeric
     * character references.
     *
     * @param {string} data - The metadata string.
     * @returns {string} The repaired metadata string.
     */
    _repair(data) {
      const entities = { amp: "&", apos: "'", gt: ">", lt: "<", quot: '"' };

      // Start by removing any "junk" before the first tag (see issue 10395).
      return data
        .replace(/^[^<]+/, "")
        .replace(/>\\376\\377([^<]+)/g, function (all, codes) {
          const bytes = codes
            // Turn every `\ooo` octal escape into the byte that it denotes.
            .replace(/\\([0-3])([0-7])([0-7])/g, function (code, d1, d2, d3) {
              return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1);
            })
            // The regular expression only matches names found in `entities`.
            .replace(/&(amp|apos|gt|lt|quot);/g, function (str, name) {
              return entities[name];
            });

          const charBuf = [">"];
          // Only complete byte pairs are decoded. A dangling trailing byte is
          // dropped, since `charCodeAt` past the end of the string is `NaN`
          // which would otherwise produce the malformed reference "&#xaN;".
          for (let i = 0, ii = bytes.length - 1; i < ii; i += 2) {
            const code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1);
            const isPlainAscii =
              code >= /* Space = */ 32 &&
              code < /* Delete = */ 127 &&
              code !== /* '<' = */ 60 &&
              code !== /* '>' = */ 62 &&
              code !== /* '&' = */ 38;

            if (isPlainAscii) {
              charBuf.push(String.fromCharCode(code));
            } else {
              // Adding 0x10000 and dropping the leading "1" zero-pads the
              // hexadecimal value to four digits.
              const hex = (0x10000 + code).toString(16).substring(1);
              charBuf.push(`&#x${hex};`);
            }
          }
          return charBuf.join("");
        });
    }

    /**
     * @param {Object} entry - A node with `nodeName` and `childNodes`.
     * @returns {Array<Object> | null} The `rdf:li` children of `entry`, or
     *   `null` when `entry` is not an `rdf:bag`, `rdf:seq` or `rdf:alt` node.
     */
    _getSequence(entry) {
      const name = entry.nodeName;
      if (name !== "rdf:bag" && name !== "rdf:seq" && name !== "rdf:alt") {
        return null;
      }
      return entry.childNodes.filter(node => node.nodeName === "rdf:li");
    }

    /**
     * Stores the trimmed text of the list items found in the *first* child of
     * `entry` under `entry.nodeName`. An entry without children is skipped;
     * an entry whose first child is not a sequence is stored as an empty
     * array.
     *
     * @param {Object} entry - The node holding the array-valued property.
     */
    _parseArray(entry) {
      if (!entry.hasChildNodes()) {
        return;
      }
      // Child must be a Bag (unordered array) or a Seq.
      const [seqNode] = entry.childNodes;
      const sequence = this._getSequence(seqNode) || [];

      this._metadataMap.set(
        entry.nodeName,
        sequence.map(node => node.textContent.trim())
      );
    }

    /**
     * Walks every `rdf:description` child of the `rdf:rdf` node and records
     * its entries in the metadata map.
     *
     * @param {Object} xmlDocument - The parsed document; only its
     *   `documentElement` is read.
     */
    _parse(xmlDocument) {
      let rdf = xmlDocument.documentElement;

      if (rdf.nodeName !== "rdf:rdf") {
        // Wrapped in <xmpmeta>
        rdf = rdf.firstChild;
        while (rdf && rdf.nodeName !== "rdf:rdf") {
          rdf = rdf.nextSibling;
        }
      }

      if (!rdf || rdf.nodeName !== "rdf:rdf" || !rdf.hasChildNodes()) {
        return;
      }

      for (const desc of rdf.childNodes) {
        if (desc.nodeName !== "rdf:description") {
          continue;
        }

        for (const entry of desc.childNodes) {
          const name = entry.nodeName;
          if (name === "#text") {
            continue;
          }
          if (name === "dc:creator" || name === "dc:subject") {
            this._parseArray(entry);
            continue;
          }
          this._metadataMap.set(name, entry.textContent.trim());
        }
      }
    }

    /**
     * @returns {{parsedData: Map<string, string | Array<string>>,
     *   rawData: string}} The parsed metadata map together with the
     *   (repaired) metadata string.
     */
    get serializable() {
      return {
        parsedData: this._metadataMap,
        rawData: this._data,
      };
    }
  }
  129. export { MetadataParser };