struct_tree.js 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. /* Copyright 2021 Mozilla Foundation
  2. *
  3. * Licensed under the Apache License, Version 2.0 (the "License");
  4. * you may not use this file except in compliance with the License.
  5. * You may obtain a copy of the License at
  6. *
  7. * http://www.apache.org/licenses/LICENSE-2.0
  8. *
  9. * Unless required by applicable law or agreed to in writing, software
  10. * distributed under the License is distributed on an "AS IS" BASIS,
  11. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. * See the License for the specific language governing permissions and
  13. * limitations under the License.
  14. */
  15. import { Dict, isName, Name, Ref } from "./primitives.js";
  16. import { stringToPDFString, warn } from "../shared/util.js";
  17. import { NumberTree } from "./name_number_tree.js";
  /**
   * Upper bound on the recursion level used when walking up through parent
   * structure elements and when serializing the tree; deeper trees are cut off
   * with a warning.
   */
  const MAX_DEPTH = 40;

  /**
   * Tags describing what a kid of a structure element refers to:
   * - PAGE_CONTENT: the kid is an integer marked-content id (mcid).
   * - STREAM_CONTENT: the kid is a dictionary whose /Type is "MCR".
   * - OBJECT: the kid is a dictionary whose /Type is "OBJR".
   * - ELEMENT: the kid is any other dictionary (a nested structure element).
   *
   * Frozen so that the tags, which are compared by value throughout this file,
   * cannot be modified by accident.
   */
  const StructElementType = Object.freeze({
    PAGE_CONTENT: "PAGE_CONTENT",
    STREAM_CONTENT: "STREAM_CONTENT",
    OBJECT: "OBJECT",
    ELEMENT: "ELEMENT",
  });
  /**
   * Wrapper around the structure tree root dictionary that exposes its
   * /RoleMap entry as a plain `Map` of role name -> mapped role name.
   */
  class StructTreeRoot {
    /**
     * @param {Dict} rootDict - The structure tree root dictionary.
     */
    constructor(rootDict) {
      this.dict = rootDict;
      this.roleMap = new Map();
    }

    /**
     * Populate the derived state (currently only the role map).
     */
    init() {
      this.readRoleMap();
    }

    /**
     * Copy every /RoleMap entry whose value is a Name into `this.roleMap`.
     * A missing or non-dictionary /RoleMap leaves the map untouched, and
     * entries whose value is not a Name are skipped.
     */
    readRoleMap() {
      const rawRoleMap = this.dict.get("RoleMap");
      if (rawRoleMap instanceof Dict) {
        rawRoleMap.forEach((customRole, target) => {
          if (target instanceof Name) {
            this.roleMap.set(customRole, target.name);
          }
        });
      }
    }
  }
  /**
   * Instead of loading the whole tree we load just the page's relevant structure
   * elements, which means we need a wrapper structure to represent the tree.
   *
   * Each node wraps one structure element dictionary and eagerly parses its
   * /K entry into a list of `StructElement` kids.
   */
  class StructElementNode {
    /**
     * @param {Object} tree - The owning tree; its `root.roleMap` and
     *   `pageDict.objId` are read by this class.
     * @param {Dict} dict - The structure element dictionary to wrap.
     */
    constructor(tree, dict) {
      this.tree = tree;
      this.dict = dict;
      this.kids = [];
      this.parseKids();
    }

    /**
     * The element's /S name, translated through the root's role map when an
     * entry exists for it. An absent or non-Name /S yields the empty string
     * (or whatever the role map holds for the empty string).
     * @returns {string}
     */
    get role() {
      const nameObj = this.dict.get("S");
      const name = nameObj instanceof Name ? nameObj.name : "";
      const { root } = this.tree;
      if (root.roleMap.has(name)) {
        return root.roleMap.get(name);
      }
      return name;
    }

    /**
     * Parse the /K entry, which is either a single kid or an array of kids,
     * and append every kid that `parseKid` accepts to `this.kids`.
     */
    parseKids() {
      // The element's own /Pg reference is the default page for its kids.
      const ownPageRef = this.dict.getRaw("Pg");
      const pageObjId = ownPageRef instanceof Ref ? ownPageRef.toString() : null;

      const kids = this.dict.get("K");
      // Normalize the single-kid form so both forms share one code path.
      const kidList = Array.isArray(kids) ? kids : [kids];
      for (const kid of kidList) {
        const element = this.parseKid(pageObjId, kid);
        if (element) {
          this.kids.push(element);
        }
      }
    }

    /**
     * Convert one raw /K entry into a `StructElement`.
     *
     * @param {string|null} pageObjId - Page id inherited from this element's
     *   /Pg entry; a /Pg reference on the kid dictionary overrides it.
     * @param {*} kid - An integer mcid, a Ref, or a Dict; anything else is
     *   ignored.
     * @returns {StructElement|null} `null` when the kid is unusable or when it
     *   is content/object data that belongs to a page other than
     *   `this.tree.pageDict`.
     */
    parseKid(pageObjId, kid) {
      const currentPageId = this.tree.pageDict.objId;

      // A direct link to content, the integer is an mcid.
      if (Number.isInteger(kid)) {
        if (currentPageId !== pageObjId) {
          return null;
        }
        return new StructElement({
          type: StructElementType.PAGE_CONTENT,
          mcid: kid,
          pageObjId,
        });
      }

      // Find the dictionary for the kid.
      let kidDict = null;
      if (kid instanceof Ref) {
        kidDict = this.dict.xref.fetch(kid);
      } else if (kid instanceof Dict) {
        kidDict = kid;
      }
      // A reference may resolve to something that is not a dictionary; such a
      // kid is skipped, since calling dictionary methods on it would throw.
      if (!(kidDict instanceof Dict)) {
        return null;
      }

      const kidPageRef = kidDict.getRaw("Pg");
      const kidPageObjId =
        kidPageRef instanceof Ref ? kidPageRef.toString() : pageObjId;

      const typeObj = kidDict.get("Type");
      const type = typeObj instanceof Name ? typeObj.name : null;

      if (type === "MCR") {
        if (currentPageId !== kidPageObjId) {
          return null;
        }
        const streamRef = kidDict.getRaw("Stm");
        return new StructElement({
          type: StructElementType.STREAM_CONTENT,
          refObjId: streamRef instanceof Ref ? streamRef.toString() : null,
          pageObjId: kidPageObjId,
          mcid: kidDict.get("MCID"),
        });
      }

      if (type === "OBJR") {
        if (currentPageId !== kidPageObjId) {
          return null;
        }
        const objectRef = kidDict.getRaw("Obj");
        return new StructElement({
          type: StructElementType.OBJECT,
          refObjId: objectRef instanceof Ref ? objectRef.toString() : null,
          pageObjId: kidPageObjId,
        });
      }

      // Any other dictionary is treated as a nested structure element.
      return new StructElement({
        type: StructElementType.ELEMENT,
        dict: kidDict,
      });
    }
  }
  /**
   * Plain record describing one kid of a structure element.
   *
   * `type` is required; `dict`, `mcid`, `pageObjId` and `refObjId` default to
   * `null` when omitted. `parentNode` always starts out as `null` and is
   * filled in later by code outside this class.
   */
  class StructElement {
    constructor({
      type,
      dict = null,
      mcid = null,
      pageObjId = null,
      refObjId = null,
    }) {
      // Assigned in one step; the key order matches the field order above.
      Object.assign(this, {
        type,
        dict,
        mcid,
        pageObjId,
        refObjId,
        parentNode: null,
      });
    }
  }
  /**
   * Builds the part of the structure tree that is relevant to a single page
   * and exposes it in a serializable form.
   */
  class StructTreePage {
    /**
     * @param {StructTreeRoot|null} structTreeRoot - Root wrapper; may be
     *   missing, in which case `parse` does nothing.
     * @param {Dict} pageDict - The page dictionary.
     */
    constructor(structTreeRoot, pageDict) {
      this.root = structTreeRoot;
      this.rootDict = structTreeRoot ? structTreeRoot.dict : null;
      this.pageDict = pageDict;
      this.nodes = [];
    }

    /**
     * Look up the page's /StructParents entry in the root's /ParentTree
     * number tree and add a node for every reference found there. Returns
     * silently when any of the required pieces is missing or malformed.
     */
    parse() {
      if (!this.root || !this.rootDict) {
        return;
      }

      const parentTree = this.rootDict.get("ParentTree");
      if (!parentTree) {
        return;
      }
      const id = this.pageDict.get("StructParents");
      if (!Number.isInteger(id)) {
        return;
      }
      const numberTree = new NumberTree(parentTree, this.rootDict.xref);
      const parentArray = numberTree.get(id);
      if (!Array.isArray(parentArray)) {
        return;
      }

      const map = new Map();
      for (const ref of parentArray) {
        if (ref instanceof Ref) {
          this.addNode(this.rootDict.xref.fetch(ref), map);
        }
      }
    }

    /**
     * Create (or reuse) the node for `dict` and link it into the tree by
     * walking up its /P chain until a top-level element is reached.
     *
     * @param {Dict} dict - Structure element dictionary.
     * @param {Map} map - Cache of already created nodes, keyed by dictionary.
     * @param {number} [level] - Current recursion level.
     * @returns {StructElementNode|null} `null` when `dict` is not a dictionary
     *   or the depth limit was exceeded.
     */
    addNode(dict, map, level = 0) {
      if (level > MAX_DEPTH) {
        warn("StructTree MAX_DEPTH reached.");
        return null;
      }
      // A reference may resolve to something that is not a dictionary; there
      // is nothing to wrap in that case.
      if (!(dict instanceof Dict)) {
        return null;
      }

      if (map.has(dict)) {
        return map.get(dict);
      }

      const element = new StructElementNode(this, dict);
      map.set(dict, element);

      const parent = dict.get("P");

      // A missing or non-dictionary parent is handled like a parent that is
      // the tree root: the element can only be attached as a top-level node.
      if (
        !(parent instanceof Dict) ||
        isName(parent.get("Type"), "StructTreeRoot")
      ) {
        if (!this.addTopLevelNode(dict, element)) {
          map.delete(dict);
        }
        return element;
      }

      const parentNode = this.addNode(parent, map, level + 1);
      if (!parentNode) {
        return element;
      }

      // Attach this node to the matching kid placeholder(s) of the parent.
      let save = false;
      for (const kid of parentNode.kids) {
        if (kid.type === StructElementType.ELEMENT && kid.dict === dict) {
          kid.parentNode = element;
          save = true;
        }
      }
      if (!save) {
        map.delete(dict);
      }
      return element;
    }

    /**
     * Store `element` in `this.nodes` at the position(s) where `dict` appears
     * in the root's /K entry.
     *
     * @param {Dict} dict - Structure element dictionary.
     * @param {StructElementNode} element - Node created for `dict`.
     * @returns {boolean} `false` when /K is missing or does not list `dict`;
     *   `true` when the node was stored, and also when /K is neither a
     *   dictionary nor an array (nothing is stored in that case).
     */
    addTopLevelNode(dict, element) {
      const obj = this.rootDict.get("K");
      if (!obj) {
        return false;
      }

      if (obj instanceof Dict) {
        if (obj.objId !== dict.objId) {
          return false;
        }
        this.nodes[0] = element;
        return true;
      }

      if (!Array.isArray(obj)) {
        return true;
      }

      let save = false;
      for (let i = 0; i < obj.length; i++) {
        const kidRef = obj[i];
        if (kidRef && kidRef.toString() === dict.objId) {
          // Keep the node at the same index so top-level order is preserved.
          this.nodes[i] = element;
          save = true;
        }
      }
      return save;
    }

    /**
     * Convert the tree structure into a simplified object literal that can
     * be sent to the main thread.
     * @returns {Object}
     */
    get serializable() {
      function nodeToSerializable(node, parent, level = 0) {
        if (level > MAX_DEPTH) {
          warn("StructTree too deep to be fully serialized.");
          return;
        }
        const obj = Object.create(null);
        obj.role = node.role;
        obj.children = [];
        parent.children.push(obj);

        const alt = node.dict.get("Alt");
        if (typeof alt === "string") {
          obj.alt = stringToPDFString(alt);
        }
        const lang = node.dict.get("Lang");
        if (typeof lang === "string") {
          obj.lang = stringToPDFString(lang);
        }

        for (const kid of node.kids) {
          // Element kids are only serialized once a node has been attached.
          const kidElement =
            kid.type === StructElementType.ELEMENT ? kid.parentNode : null;
          if (kidElement) {
            nodeToSerializable(kidElement, obj, level + 1);
            continue;
          }
          if (
            kid.type === StructElementType.PAGE_CONTENT ||
            kid.type === StructElementType.STREAM_CONTENT
          ) {
            obj.children.push({
              type: "content",
              id: `page${kid.pageObjId}_mcid${kid.mcid}`,
            });
          } else if (kid.type === StructElementType.OBJECT) {
            obj.children.push({
              type: "object",
              id: kid.refObjId,
            });
          }
        }
      }

      const root = Object.create(null);
      root.children = [];
      root.role = "Root";
      for (const child of this.nodes) {
        // `nodes` may have holes, see `addTopLevelNode`.
        if (!child) {
          continue;
        }
        nodeToSerializable(child, root);
      }
      return root;
    }
  }
  310. export { StructTreePage, StructTreeRoot };