pdf_find_controller.js 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047
  1. /* Copyright 2012 Mozilla Foundation
  2. *
  3. * Licensed under the Apache License, Version 2.0 (the "License");
  4. * you may not use this file except in compliance with the License.
  5. * You may obtain a copy of the License at
  6. *
  7. * http://www.apache.org/licenses/LICENSE-2.0
  8. *
  9. * Unless required by applicable law or agreed to in writing, software
  10. * distributed under the License is distributed on an "AS IS" BASIS,
  11. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. * See the License for the specific language governing permissions and
  13. * limitations under the License.
  14. */
  15. /** @typedef {import("../src/display/api").PDFDocumentProxy} PDFDocumentProxy */
  16. /** @typedef {import("./event_utils").EventBus} EventBus */
  17. /** @typedef {import("./interfaces").IPDFLinkService} IPDFLinkService */
  18. import { binarySearchFirstItem, scrollIntoView } from "./ui_utils.js";
  19. import { createPromiseCapability } from "pdfjs-lib";
  20. import { getCharacterType } from "./pdf_find_utils.js";
// Status codes for a find operation. In this file `PENDING` is reported as
// soon as a (non-"highlightallchange") find request arrives, and `FOUND` is
// reported when the query is empty.
const FindState = {
  FOUND: 0,
  NOT_FOUND: 1,
  WRAPPED: 2,
  PENDING: 3,
};
// Delay before a plain "find" request (one without a `type`) actually starts
// searching, so that typing doesn't trigger a search per keystroke.
const FIND_TIMEOUT = 250; // ms
// Offsets used for the `spot` passed to `scrollIntoView` when the selected
// match is scrolled into view.
const MATCH_SCROLL_OFFSET_TOP = -50; // px
const MATCH_SCROLL_OFFSET_LEFT = -400; // px
// Single characters that `normalize` replaces with an ASCII equivalent.
// The keys are joined into a regexp character class and the values are used
// as the replacement text; a value may be longer than one character (e.g. the
// fractions), which is why `normalize` has to track position shifts.
const CHARACTERS_TO_NORMALIZE = {
  "\u2010": "-", // Hyphen
  "\u2018": "'", // Left single quotation mark
  "\u2019": "'", // Right single quotation mark
  "\u201A": "'", // Single low-9 quotation mark
  "\u201B": "'", // Single high-reversed-9 quotation mark
  "\u201C": '"', // Left double quotation mark
  "\u201D": '"', // Right double quotation mark
  "\u201E": '"', // Double low-9 quotation mark
  "\u201F": '"', // Double high-reversed-9 quotation mark
  "\u00BC": "1/4", // Vulgar fraction one quarter
  "\u00BD": "1/2", // Vulgar fraction one half
  "\u00BE": "3/4", // Vulgar fraction three quarters
};
// These diacritics aren't considered as combining diacritics
// when searching in a document:
//   https://searchfox.org/mozilla-central/source/intl/unicharutil/util/is_combining_diacritic.py.
// The combining class definitions can be found:
//   https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
// Category 0 corresponds to [^\p{Mn}].
//
// The set holds UTF-16 code units: it is queried with `charCodeAt(0)` and
// expanded with `String.fromCharCode` when building the search regexp.
const DIACRITICS_EXCEPTION = new Set([
  // UNICODE_COMBINING_CLASS_KANA_VOICING
  // https://www.compart.com/fr/unicode/combining/8
  0x3099, 0x309a,
  // UNICODE_COMBINING_CLASS_VIRAMA (under 0xFFFF)
  // https://www.compart.com/fr/unicode/combining/9
  0x094d, 0x09cd, 0x0a4d, 0x0acd, 0x0b4d, 0x0bcd, 0x0c4d, 0x0ccd, 0x0d3b,
  0x0d3c, 0x0d4d, 0x0dca, 0x0e3a, 0x0eba, 0x0f84, 0x1039, 0x103a, 0x1714,
  0x1734, 0x17d2, 0x1a60, 0x1b44, 0x1baa, 0x1bab, 0x1bf2, 0x1bf3, 0x2d7f,
  0xa806, 0xa82c, 0xa8c4, 0xa953, 0xa9c0, 0xaaf6, 0xabed,
  // 91
  // https://www.compart.com/fr/unicode/combining/91
  0x0c56,
  // 129
  // https://www.compart.com/fr/unicode/combining/129
  0x0f71,
  // 130
  // https://www.compart.com/fr/unicode/combining/130
  0x0f72, 0x0f7a, 0x0f7b, 0x0f7c, 0x0f7d, 0x0f80,
  // 132
  // https://www.compart.com/fr/unicode/combining/132
  0x0f74,
]);
// String form of `DIACRITICS_EXCEPTION`, built on first use when a
// diacritics-sensitive search regexp is generated.
let DIACRITICS_EXCEPTION_STR; // Lazily initialized, see below.
// Runs of combining marks; used by `normalize` to locate the marks that are
// already present in the raw (not yet NFD-decomposed) text.
const DIACRITICS_REG_EXP = /\p{M}+/gu;
// Tokenizes a query into: (1) regexp meta-characters that need escaping,
// (2) punctuation, (3) whitespace runs, (4) a combining mark, (5) a letter.
const SPECIAL_CHARS_REG_EXP =
  /([.*+?^${}()|[\]\\])|(\p{P})|(\s+)|(\p{M})|(\p{L})/gu;
// Capture the last/first character that isn't a combining mark, skipping any
// marks between it and the end/start of the string.
const NOT_DIACRITIC_FROM_END_REG_EXP = /([^\p{M}])\p{M}*$/u;
const NOT_DIACRITIC_FROM_START_REG_EXP = /^\p{M}*([^\p{M}])/u;
// The range [AC00-D7AF] corresponds to the Hangul syllables.
// The few other chars are some CJK Compatibility Ideographs.
const SYLLABLES_REG_EXP = /[\uAC00-\uD7AF\uFA6C\uFACF-\uFAD1\uFAD5-\uFAD7]+/g;
// Cache: syllable character -> length of its NFD decomposition.
const SYLLABLES_LENGTHS = new Map();
// When decomposed (in using NFD) the above syllables will start
// with one of the chars in this regexp.
// NOTE: this is a regexp *source* fragment (a string), not a RegExp object.
const FIRST_CHAR_SYLLABLES_REG_EXP =
  "[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]";
// Cache: character -> its NFKC normalization.
const NFKC_CHARS_TO_NORMALIZE = new Map();
// Compiled normalization regexps, created on demand by `normalize`; one
// variant for text without syllables and one for text with syllables.
let noSyllablesRegExp = null;
let withSyllablesRegExp = null;
/**
 * Normalize `text` for searching and record how indices in the normalized
 * string map back to indices in `text`.
 *
 * The text is NFD-decomposed, the characters listed in
 * `CHARACTERS_TO_NORMALIZE` and a few NFKC ranges are replaced, "X-\n" line
 * breaks are joined, and remaining "\n" become a space (or are dropped after
 * a CJK character).
 *
 * @param {string} text - The raw text.
 * @returns {Array} A `[normalized, positions, hasDiacritics]` triple where
 *   `positions` is a sorted list of `[indexInNormalized, shift]` pairs (see
 *   the long comment below) and `hasDiacritics` tells whether at least one
 *   run of combining marks was seen in the decomposed text.
 */
function normalize(text) {
  // The diacritics in the text or in the query can be composed or not.
  // So we use a decomposed text using NFD (and the same for the query)
  // in order to be sure that diacritics are in the same order.

  // Collect syllables length and positions.
  const syllablePositions = [];
  let m;
  while ((m = SYLLABLES_REG_EXP.exec(text)) !== null) {
    let { index } = m;
    for (const char of m[0]) {
      // The decomposed length of each syllable is cached across calls.
      let len = SYLLABLES_LENGTHS.get(char);
      if (!len) {
        len = char.normalize("NFD").length;
        SYLLABLES_LENGTHS.set(char, len);
      }
      syllablePositions.push([len, index++]);
    }
  }

  // Pick the cached regexp matching this text (with/without syllables), or
  // build and cache it on first use.
  let normalizationRegex;
  if (syllablePositions.length === 0 && noSyllablesRegExp) {
    normalizationRegex = noSyllablesRegExp;
  } else if (syllablePositions.length > 0 && withSyllablesRegExp) {
    normalizationRegex = withSyllablesRegExp;
  } else {
    // Compile the regular expression for text normalization once.
    const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
    const toNormalizeWithNFKC =
      "\u2460-\u2473" + // Circled numbers.
      "\u24b6-\u24ff" + // Circled letters/numbers.
      "\u3244-\u32bf" + // Circled ideograms/numbers.
      "\u32d0-\u32fe" + // Circled ideograms.
      "\uff00-\uffef"; // Halfwidth, fullwidth forms.

    // 3040-309F: Hiragana
    // 30A0-30FF: Katakana
    const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])";

    // Groups, in order: p1 simple replacements, p2 NFKC replacements,
    // p3 diacritics (optionally followed by "-\n"), p4 "X-\n", p5 CJK + "\n",
    // p6 "\n"; a seventh group is appended below.
    const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`;

    if (syllablePositions.length === 0) {
      // Most of the syllables belong to Hangul so there are no need
      // to search for them in a non-Hangul document.
      // We use the \0 in order to have the same number of groups.
      normalizationRegex = noSyllablesRegExp = new RegExp(
        regexp + "|(\\u0000)",
        "gum"
      );
    } else {
      normalizationRegex = withSyllablesRegExp = new RegExp(
        regexp + `|(${FIRST_CHAR_SYLLABLES_REG_EXP})`,
        "gum"
      );
    }
  }

  // The goal of this function is to normalize the string and
  // be able to get from an index in the new string the
  // corresponding index in the old string.
  // For example if we have: abCd12ef456gh where C is replaced by ccc
  // and numbers replaced by nothing (it's the case for diacritics), then
  // we'll obtain the normalized string: abcccdefgh.
  // So here the reverse map is: [0,1,2,2,2,3,6,7,11,12].

  // The goal is to obtain the array: [[0, 0], [3, -1], [4, -2],
  // [6, 0], [8, 3]].
  // which can be used like this:
  //  - let say that i is the index in new string and j the index
  //    the old string.
  //  - if i is in [0; 3[ then j = i + 0
  //  - if i is in [3; 4[ then j = i - 1
  //  - if i is in [4; 6[ then j = i - 2
  //  ...
  // Thanks to a binary search it's easy to know where is i and what's the
  // shift.
  // Let say that the last entry in the array is [x, s] and we have a
  // substitution at index y (old string) which will replace o chars by n chars.
  // Firstly, if o === n, then no need to add a new entry: the shift is
  // the same.
  // Secondly, if o < n, then we push the n - o elements:
  // [y - (s - 1), s - 1], [y - (s - 2), s - 2], ...
  // Thirdly, if o > n, then we push the element: [y - (s - n), o + s - n]

  // Collect diacritics length and positions.
  const rawDiacriticsPositions = [];
  while ((m = DIACRITICS_REG_EXP.exec(text)) !== null) {
    rawDiacriticsPositions.push([m[0].length, m.index]);
  }

  let normalized = text.normalize("NFD");
  const positions = [[0, 0]];
  let rawDiacriticsIndex = 0;
  let syllableIndex = 0;
  // Current difference between an index in the raw text and the matching
  // index in the normalized text.
  let shift = 0;
  // Number of chars by which the match offsets reported by `replace` (which
  // refer to the NFD string) are ahead of the bookkeeping index `i`.
  let shiftOrigin = 0;
  // Number of end-of-line chars handled so far.
  let eol = 0;
  let hasDiacritics = false;

  normalized = normalized.replace(
    normalizationRegex,
    (match, p1, p2, p3, p4, p5, p6, p7, i) => {
      i -= shiftOrigin;
      if (p1) {
        // Maybe fractions or quotations mark...
        const replacement = CHARACTERS_TO_NORMALIZE[p1];
        const jj = replacement.length;
        // One char becomes `jj` chars: one entry per extra char.
        for (let j = 1; j < jj; j++) {
          positions.push([i - shift + j, shift - j]);
        }
        shift -= jj - 1;
        return replacement;
      }

      if (p2) {
        // Use the NFKC representation to normalize the char.
        let replacement = NFKC_CHARS_TO_NORMALIZE.get(p2);
        if (!replacement) {
          replacement = p2.normalize("NFKC");
          NFKC_CHARS_TO_NORMALIZE.set(p2, replacement);
        }
        const jj = replacement.length;
        for (let j = 1; j < jj; j++) {
          positions.push([i - shift + j, shift - j]);
        }
        shift -= jj - 1;
        return replacement;
      }

      if (p3) {
        const hasTrailingDashEOL = p3.endsWith("\n");
        // Number of diacritics, without the optional trailing "-\n".
        const len = hasTrailingDashEOL ? p3.length - 2 : p3.length;

        // Diacritics.
        hasDiacritics = true;
        let jj = len;
        // Diacritics that already existed in the raw text at this position
        // aren't "new" chars, so they don't contribute to the shift.
        if (i + eol === rawDiacriticsPositions[rawDiacriticsIndex]?.[1]) {
          jj -= rawDiacriticsPositions[rawDiacriticsIndex][0];
          ++rawDiacriticsIndex;
        }

        for (let j = 1; j <= jj; j++) {
          // i is the position of the first diacritic
          // so (i - 1) is the position for the letter before.
          positions.push([i - 1 - shift + j, shift - j]);
        }
        shift -= jj;
        shiftOrigin += jj;

        if (hasTrailingDashEOL) {
          // Diacritics are followed by a -\n.
          // See comments in `if (p4)` block.
          i += len - 1;
          positions.push([i - shift + 1, 1 + shift]);
          shift += 1;
          shiftOrigin += 1;
          eol += 1;
          return p3.slice(0, len);
        }

        return p3;
      }

      if (p4) {
        // "X-\n" is removed because an hyphen at the end of a line
        // with not a space before is likely here to mark a break
        // in a word.
        // The \n isn't in the original text so here y = i, n = 1 and o = 2.
        positions.push([i - shift + 1, 1 + shift]);
        shift += 1;
        shiftOrigin += 1;
        eol += 1;
        return p4.charAt(0);
      }

      if (p5) {
        // An ideographic at the end of a line doesn't imply adding an extra
        // white space.
        positions.push([i - shift + 1, shift]);
        shiftOrigin += 1;
        eol += 1;
        return p5.charAt(0);
      }

      if (p6) {
        // eol is replaced by space: "foo\nbar" is likely equivalent to
        // "foo bar".
        positions.push([i - shift + 1, shift - 1]);
        shift -= 1;
        shiftOrigin += 1;
        eol += 1;
        return " ";
      }

      // p7
      if (i + eol === syllablePositions[syllableIndex]?.[1]) {
        // A syllable (1 char) is replaced with several chars (n) so
        // newCharsLen = n - 1.
        const newCharLen = syllablePositions[syllableIndex][0] - 1;
        ++syllableIndex;
        for (let j = 1; j <= newCharLen; j++) {
          positions.push([i - (shift - j), shift - j]);
        }
        shift -= newCharLen;
        shiftOrigin += newCharLen;
      }
      return p7;
    }
  );

  // Sentinel entry covering the end of the normalized string.
  positions.push([normalized.length, shift]);

  return [normalized, positions, hasDiacritics];
}
  282. // Determine the original, non-normalized, match index such that highlighting of
  283. // search results is correct in the `textLayer` for strings containing e.g. "½"
  284. // characters; essentially "inverting" the result of the `normalize` function.
  285. function getOriginalIndex(diffs, pos, len) {
  286. if (!diffs) {
  287. return [pos, len];
  288. }
  289. const start = pos;
  290. const end = pos + len;
  291. let i = binarySearchFirstItem(diffs, x => x[0] >= start);
  292. if (diffs[i][0] > start) {
  293. --i;
  294. }
  295. let j = binarySearchFirstItem(diffs, x => x[0] >= end, i);
  296. if (diffs[j][0] > end) {
  297. --j;
  298. }
  299. return [start + diffs[i][1], len + diffs[j][1] - diffs[i][1]];
  300. }
  301. /**
  302. * @typedef {Object} PDFFindControllerOptions
  303. * @property {IPDFLinkService} linkService - The navigation/linking service.
  304. * @property {EventBus} eventBus - The application event bus.
  305. */
  306. /**
  307. * Provides search functionality to find a given string in a PDF document.
  308. */
  309. class PDFFindController {
  310. /**
  311. * @param {PDFFindControllerOptions} options
  312. */
  313. constructor({ linkService, eventBus }) {
  314. this._linkService = linkService;
  315. this._eventBus = eventBus;
  316. this.#reset();
  317. eventBus._on("find", this.#onFind.bind(this));
  318. eventBus._on("findbarclose", this.#onFindBarClose.bind(this));
  319. }
  /**
   * Flag that is cleared by `#reset` and set once a search has been
   * triggered (or highlighting toggled on).
   */
  get highlightMatches() {
    return this._highlightMatches;
  }

  /**
   * Per-page arrays with the start position of each match, indexed by page
   * index.
   */
  get pageMatches() {
    return this._pageMatches;
  }

  /**
   * Per-page arrays with the length of each match; parallel to
   * `pageMatches`.
   */
  get pageMatchesLength() {
    return this._pageMatchesLength;
  }

  /**
   * The currently selected match as `{ pageIdx, matchIdx }`; both are -1
   * when nothing is selected.
   */
  get selected() {
    return this._selected;
  }

  /**
   * The state object of the most recent "find" event, or `null` when there
   * hasn't been one since the last reset.
   */
  get state() {
    return this._state;
  }
  335. /**
  336. * Set a reference to the PDF document in order to search it.
  337. * Note that searching is not possible if this method is not called.
  338. *
  339. * @param {PDFDocumentProxy} pdfDocument - The PDF document to search.
  340. */
  341. setDocument(pdfDocument) {
  342. if (this._pdfDocument) {
  343. this.#reset();
  344. }
  345. if (!pdfDocument) {
  346. return;
  347. }
  348. this._pdfDocument = pdfDocument;
  349. this._firstPageCapability.resolve();
  350. }
  351. #onFind(state) {
  352. if (!state) {
  353. return;
  354. }
  355. const pdfDocument = this._pdfDocument;
  356. const { type } = state;
  357. if (this._state === null || this.#shouldDirtyMatch(state)) {
  358. this._dirtyMatch = true;
  359. }
  360. this._state = state;
  361. if (type !== "highlightallchange") {
  362. this.#updateUIState(FindState.PENDING);
  363. }
  364. this._firstPageCapability.promise.then(() => {
  365. // If the document was closed before searching began, or if the search
  366. // operation was relevant for a previously opened document, do nothing.
  367. if (
  368. !this._pdfDocument ||
  369. (pdfDocument && this._pdfDocument !== pdfDocument)
  370. ) {
  371. return;
  372. }
  373. this.#extractText();
  374. const findbarClosed = !this._highlightMatches;
  375. const pendingTimeout = !!this._findTimeout;
  376. if (this._findTimeout) {
  377. clearTimeout(this._findTimeout);
  378. this._findTimeout = null;
  379. }
  380. if (!type) {
  381. // Trigger the find action with a small delay to avoid starting the
  382. // search when the user is still typing (saving resources).
  383. this._findTimeout = setTimeout(() => {
  384. this.#nextMatch();
  385. this._findTimeout = null;
  386. }, FIND_TIMEOUT);
  387. } else if (this._dirtyMatch) {
  388. // Immediately trigger searching for non-'find' operations, when the
  389. // current state needs to be reset and matches re-calculated.
  390. this.#nextMatch();
  391. } else if (type === "again") {
  392. this.#nextMatch();
  393. // When the findbar was previously closed, and `highlightAll` is set,
  394. // ensure that the matches on all active pages are highlighted again.
  395. if (findbarClosed && this._state.highlightAll) {
  396. this.#updateAllPages();
  397. }
  398. } else if (type === "highlightallchange") {
  399. // If there was a pending search operation, synchronously trigger a new
  400. // search *first* to ensure that the correct matches are highlighted.
  401. if (pendingTimeout) {
  402. this.#nextMatch();
  403. } else {
  404. this._highlightMatches = true;
  405. }
  406. this.#updateAllPages(); // Update the highlighting on all active pages.
  407. } else {
  408. this.#nextMatch();
  409. }
  410. });
  411. }
  412. scrollMatchIntoView({
  413. element = null,
  414. selectedLeft = 0,
  415. pageIndex = -1,
  416. matchIndex = -1,
  417. }) {
  418. if (!this._scrollMatches || !element) {
  419. return;
  420. } else if (matchIndex === -1 || matchIndex !== this._selected.matchIdx) {
  421. return;
  422. } else if (pageIndex === -1 || pageIndex !== this._selected.pageIdx) {
  423. return;
  424. }
  425. this._scrollMatches = false; // Ensure that scrolling only happens once.
  426. const spot = {
  427. top: MATCH_SCROLL_OFFSET_TOP,
  428. left: selectedLeft + MATCH_SCROLL_OFFSET_LEFT,
  429. };
  430. scrollIntoView(element, spot, /* scrollMatches = */ true);
  431. }
  432. #reset() {
  433. this._highlightMatches = false;
  434. this._scrollMatches = false;
  435. this._pdfDocument = null;
  436. this._pageMatches = [];
  437. this._pageMatchesLength = [];
  438. this._state = null;
  439. // Currently selected match.
  440. this._selected = {
  441. pageIdx: -1,
  442. matchIdx: -1,
  443. };
  444. // Where the find algorithm currently is in the document.
  445. this._offset = {
  446. pageIdx: null,
  447. matchIdx: null,
  448. wrapped: false,
  449. };
  450. this._extractTextPromises = [];
  451. this._pageContents = []; // Stores the normalized text for each page.
  452. this._pageDiffs = [];
  453. this._hasDiacritics = [];
  454. this._matchesCountTotal = 0;
  455. this._pagesToSearch = null;
  456. this._pendingFindMatches = new Set();
  457. this._resumePageIdx = null;
  458. this._dirtyMatch = false;
  459. clearTimeout(this._findTimeout);
  460. this._findTimeout = null;
  461. this._firstPageCapability = createPromiseCapability();
  462. }
  463. /**
  464. * @type {string} The (current) normalized search query.
  465. */
  466. get #query() {
  467. if (this._state.query !== this._rawQuery) {
  468. this._rawQuery = this._state.query;
  469. [this._normalizedQuery] = normalize(this._state.query);
  470. }
  471. return this._normalizedQuery;
  472. }
  473. #shouldDirtyMatch(state) {
  474. // When the search query changes, regardless of the actual search command
  475. // used, always re-calculate matches to avoid errors (fixes bug 1030622).
  476. if (state.query !== this._state.query) {
  477. return true;
  478. }
  479. switch (state.type) {
  480. case "again":
  481. const pageNumber = this._selected.pageIdx + 1;
  482. const linkService = this._linkService;
  483. // Only treat a 'findagain' event as a new search operation when it's
  484. // *absolutely* certain that the currently selected match is no longer
  485. // visible, e.g. as a result of the user scrolling in the document.
  486. //
  487. // NOTE: If only a simple `this._linkService.page` check was used here,
  488. // there's a risk that consecutive 'findagain' operations could "skip"
  489. // over matches at the top/bottom of pages thus making them completely
  490. // inaccessible when there's multiple pages visible in the viewer.
  491. if (
  492. pageNumber >= 1 &&
  493. pageNumber <= linkService.pagesCount &&
  494. pageNumber !== linkService.page &&
  495. !linkService.isPageVisible(pageNumber)
  496. ) {
  497. return true;
  498. }
  499. return false;
  500. case "highlightallchange":
  501. return false;
  502. }
  503. return true;
  504. }
  505. /**
  506. * Determine if the search query constitutes a "whole word", by comparing the
  507. * first/last character type with the preceding/following character type.
  508. */
  509. #isEntireWord(content, startIdx, length) {
  510. let match = content
  511. .slice(0, startIdx)
  512. .match(NOT_DIACRITIC_FROM_END_REG_EXP);
  513. if (match) {
  514. const first = content.charCodeAt(startIdx);
  515. const limit = match[1].charCodeAt(0);
  516. if (getCharacterType(first) === getCharacterType(limit)) {
  517. return false;
  518. }
  519. }
  520. match = content
  521. .slice(startIdx + length)
  522. .match(NOT_DIACRITIC_FROM_START_REG_EXP);
  523. if (match) {
  524. const last = content.charCodeAt(startIdx + length - 1);
  525. const limit = match[1].charCodeAt(0);
  526. if (getCharacterType(last) === getCharacterType(limit)) {
  527. return false;
  528. }
  529. }
  530. return true;
  531. }
  532. #calculateRegExpMatch(query, entireWord, pageIndex, pageContent) {
  533. const matches = [],
  534. matchesLength = [];
  535. const diffs = this._pageDiffs[pageIndex];
  536. let match;
  537. while ((match = query.exec(pageContent)) !== null) {
  538. if (
  539. entireWord &&
  540. !this.#isEntireWord(pageContent, match.index, match[0].length)
  541. ) {
  542. continue;
  543. }
  544. const [matchPos, matchLen] = getOriginalIndex(
  545. diffs,
  546. match.index,
  547. match[0].length
  548. );
  549. if (matchLen) {
  550. matches.push(matchPos);
  551. matchesLength.push(matchLen);
  552. }
  553. }
  554. this._pageMatches[pageIndex] = matches;
  555. this._pageMatchesLength[pageIndex] = matchesLength;
  556. }
  557. #convertToRegExpString(query, hasDiacritics) {
  558. const { matchDiacritics } = this._state;
  559. let isUnicode = false;
  560. query = query.replace(
  561. SPECIAL_CHARS_REG_EXP,
  562. (
  563. match,
  564. p1 /* to escape */,
  565. p2 /* punctuation */,
  566. p3 /* whitespaces */,
  567. p4 /* diacritics */,
  568. p5 /* letters */
  569. ) => {
  570. // We don't need to use a \s for whitespaces since all the different
  571. // kind of whitespaces are replaced by a single " ".
  572. if (p1) {
  573. // Escape characters like *+?... to not interfer with regexp syntax.
  574. return `[ ]*\\${p1}[ ]*`;
  575. }
  576. if (p2) {
  577. // Allow whitespaces around punctuation signs.
  578. return `[ ]*${p2}[ ]*`;
  579. }
  580. if (p3) {
  581. // Replace spaces by \s+ to be sure to match any spaces.
  582. return "[ ]+";
  583. }
  584. if (matchDiacritics) {
  585. return p4 || p5;
  586. }
  587. if (p4) {
  588. // Diacritics are removed with few exceptions.
  589. return DIACRITICS_EXCEPTION.has(p4.charCodeAt(0)) ? p4 : "";
  590. }
  591. // A letter has been matched and it can be followed by any diacritics
  592. // in normalized text.
  593. if (hasDiacritics) {
  594. isUnicode = true;
  595. return `${p5}\\p{M}*`;
  596. }
  597. return p5;
  598. }
  599. );
  600. const trailingSpaces = "[ ]*";
  601. if (query.endsWith(trailingSpaces)) {
  602. // The [ ]* has been added in order to help to match "foo . bar" but
  603. // it doesn't make sense to match some whitespaces after the dot
  604. // when it's the last character.
  605. query = query.slice(0, query.length - trailingSpaces.length);
  606. }
  607. if (matchDiacritics) {
  608. // aX must not match aXY.
  609. if (hasDiacritics) {
  610. DIACRITICS_EXCEPTION_STR ||= String.fromCharCode(
  611. ...DIACRITICS_EXCEPTION
  612. );
  613. isUnicode = true;
  614. query = `${query}(?=[${DIACRITICS_EXCEPTION_STR}]|[^\\p{M}]|$)`;
  615. }
  616. }
  617. return [isUnicode, query];
  618. }
  619. #calculateMatch(pageIndex) {
  620. let query = this.#query;
  621. if (query.length === 0) {
  622. // Do nothing: the matches should be wiped out already.
  623. return;
  624. }
  625. const { caseSensitive, entireWord, phraseSearch } = this._state;
  626. const pageContent = this._pageContents[pageIndex];
  627. const hasDiacritics = this._hasDiacritics[pageIndex];
  628. let isUnicode = false;
  629. if (phraseSearch) {
  630. [isUnicode, query] = this.#convertToRegExpString(query, hasDiacritics);
  631. } else {
  632. // Words are sorted in reverse order to be sure that "foobar" is matched
  633. // before "foo" in case the query is "foobar foo".
  634. const match = query.match(/\S+/g);
  635. if (match) {
  636. query = match
  637. .sort()
  638. .reverse()
  639. .map(q => {
  640. const [isUnicodePart, queryPart] = this.#convertToRegExpString(
  641. q,
  642. hasDiacritics
  643. );
  644. isUnicode ||= isUnicodePart;
  645. return `(${queryPart})`;
  646. })
  647. .join("|");
  648. }
  649. }
  650. const flags = `g${isUnicode ? "u" : ""}${caseSensitive ? "" : "i"}`;
  651. query = new RegExp(query, flags);
  652. this.#calculateRegExpMatch(query, entireWord, pageIndex, pageContent);
  653. // When `highlightAll` is set, ensure that the matches on previously
  654. // rendered (and still active) pages are correctly highlighted.
  655. if (this._state.highlightAll) {
  656. this.#updatePage(pageIndex);
  657. }
  658. if (this._resumePageIdx === pageIndex) {
  659. this._resumePageIdx = null;
  660. this.#nextPageMatch();
  661. }
  662. // Update the match count.
  663. const pageMatchesCount = this._pageMatches[pageIndex].length;
  664. if (pageMatchesCount > 0) {
  665. this._matchesCountTotal += pageMatchesCount;
  666. this.#updateUIResultsCount();
  667. }
  668. }
  669. #extractText() {
  670. // Perform text extraction once if this method is called multiple times.
  671. if (this._extractTextPromises.length > 0) {
  672. return;
  673. }
  674. let promise = Promise.resolve();
  675. for (let i = 0, ii = this._linkService.pagesCount; i < ii; i++) {
  676. const extractTextCapability = createPromiseCapability();
  677. this._extractTextPromises[i] = extractTextCapability.promise;
  678. promise = promise.then(() => {
  679. return this._pdfDocument
  680. .getPage(i + 1)
  681. .then(pdfPage => {
  682. return pdfPage.getTextContent();
  683. })
  684. .then(
  685. textContent => {
  686. const strBuf = [];
  687. for (const textItem of textContent.items) {
  688. strBuf.push(textItem.str);
  689. if (textItem.hasEOL) {
  690. strBuf.push("\n");
  691. }
  692. }
  693. // Store the normalized page content (text items) as one string.
  694. [
  695. this._pageContents[i],
  696. this._pageDiffs[i],
  697. this._hasDiacritics[i],
  698. ] = normalize(strBuf.join(""));
  699. extractTextCapability.resolve();
  700. },
  701. reason => {
  702. console.error(
  703. `Unable to get text content for page ${i + 1}`,
  704. reason
  705. );
  706. // Page error -- assuming no text content.
  707. this._pageContents[i] = "";
  708. this._pageDiffs[i] = null;
  709. this._hasDiacritics[i] = false;
  710. extractTextCapability.resolve();
  711. }
  712. );
  713. });
  714. }
  715. }
  716. #updatePage(index) {
  717. if (this._scrollMatches && this._selected.pageIdx === index) {
  718. // If the page is selected, scroll the page into view, which triggers
  719. // rendering the page, which adds the text layer. Once the text layer
  720. // is built, it will attempt to scroll the selected match into view.
  721. this._linkService.page = index + 1;
  722. }
  723. this._eventBus.dispatch("updatetextlayermatches", {
  724. source: this,
  725. pageIndex: index,
  726. });
  727. }
  728. #updateAllPages() {
  729. this._eventBus.dispatch("updatetextlayermatches", {
  730. source: this,
  731. pageIndex: -1,
  732. });
  733. }
  734. #nextMatch() {
  735. const previous = this._state.findPrevious;
  736. const currentPageIndex = this._linkService.page - 1;
  737. const numPages = this._linkService.pagesCount;
  738. this._highlightMatches = true;
  739. if (this._dirtyMatch) {
  740. // Need to recalculate the matches, reset everything.
  741. this._dirtyMatch = false;
  742. this._selected.pageIdx = this._selected.matchIdx = -1;
  743. this._offset.pageIdx = currentPageIndex;
  744. this._offset.matchIdx = null;
  745. this._offset.wrapped = false;
  746. this._resumePageIdx = null;
  747. this._pageMatches.length = 0;
  748. this._pageMatchesLength.length = 0;
  749. this._matchesCountTotal = 0;
  750. this.#updateAllPages(); // Wipe out any previously highlighted matches.
  751. for (let i = 0; i < numPages; i++) {
  752. // Start finding the matches as soon as the text is extracted.
  753. if (this._pendingFindMatches.has(i)) {
  754. continue;
  755. }
  756. this._pendingFindMatches.add(i);
  757. this._extractTextPromises[i].then(() => {
  758. this._pendingFindMatches.delete(i);
  759. this.#calculateMatch(i);
  760. });
  761. }
  762. }
  763. // If there's no query there's no point in searching.
  764. if (this.#query === "") {
  765. this.#updateUIState(FindState.FOUND);
  766. return;
  767. }
  768. // If we're waiting on a page, we return since we can't do anything else.
  769. if (this._resumePageIdx) {
  770. return;
  771. }
  772. const offset = this._offset;
  773. // Keep track of how many pages we should maximally iterate through.
  774. this._pagesToSearch = numPages;
  775. // If there's already a `matchIdx` that means we are iterating through a
  776. // page's matches.
  777. if (offset.matchIdx !== null) {
  778. const numPageMatches = this._pageMatches[offset.pageIdx].length;
  779. if (
  780. (!previous && offset.matchIdx + 1 < numPageMatches) ||
  781. (previous && offset.matchIdx > 0)
  782. ) {
  783. // The simple case; we just have advance the matchIdx to select
  784. // the next match on the page.
  785. offset.matchIdx = previous ? offset.matchIdx - 1 : offset.matchIdx + 1;
  786. this.#updateMatch(/* found = */ true);
  787. return;
  788. }
  789. // We went beyond the current page's matches, so we advance to
  790. // the next page.
  791. this.#advanceOffsetPage(previous);
  792. }
  793. // Start searching through the page.
  794. this.#nextPageMatch();
  795. }
  796. #matchesReady(matches) {
  797. const offset = this._offset;
  798. const numMatches = matches.length;
  799. const previous = this._state.findPrevious;
  800. if (numMatches) {
  801. // There were matches for the page, so initialize `matchIdx`.
  802. offset.matchIdx = previous ? numMatches - 1 : 0;
  803. this.#updateMatch(/* found = */ true);
  804. return true;
  805. }
  806. // No matches, so attempt to search the next page.
  807. this.#advanceOffsetPage(previous);
  808. if (offset.wrapped) {
  809. offset.matchIdx = null;
  810. if (this._pagesToSearch < 0) {
  811. // No point in wrapping again, there were no matches.
  812. this.#updateMatch(/* found = */ false);
  813. // While matches were not found, searching for a page
  814. // with matches should nevertheless halt.
  815. return true;
  816. }
  817. }
  818. // Matches were not found (and searching is not done).
  819. return false;
  820. }
  821. #nextPageMatch() {
  822. if (this._resumePageIdx !== null) {
  823. console.error("There can only be one pending page.");
  824. }
  825. let matches = null;
  826. do {
  827. const pageIdx = this._offset.pageIdx;
  828. matches = this._pageMatches[pageIdx];
  829. if (!matches) {
  830. // The matches don't exist yet for processing by `_matchesReady`,
  831. // so set a resume point for when they do exist.
  832. this._resumePageIdx = pageIdx;
  833. break;
  834. }
  835. } while (!this.#matchesReady(matches));
  836. }
  837. #advanceOffsetPage(previous) {
  838. const offset = this._offset;
  839. const numPages = this._linkService.pagesCount;
  840. offset.pageIdx = previous ? offset.pageIdx - 1 : offset.pageIdx + 1;
  841. offset.matchIdx = null;
  842. this._pagesToSearch--;
  843. if (offset.pageIdx >= numPages || offset.pageIdx < 0) {
  844. offset.pageIdx = previous ? numPages - 1 : 0;
  845. offset.wrapped = true;
  846. }
  847. }
  848. #updateMatch(found = false) {
  849. let state = FindState.NOT_FOUND;
  850. const wrapped = this._offset.wrapped;
  851. this._offset.wrapped = false;
  852. if (found) {
  853. const previousPage = this._selected.pageIdx;
  854. this._selected.pageIdx = this._offset.pageIdx;
  855. this._selected.matchIdx = this._offset.matchIdx;
  856. state = wrapped ? FindState.WRAPPED : FindState.FOUND;
  857. // Update the currently selected page to wipe out any selected matches.
  858. if (previousPage !== -1 && previousPage !== this._selected.pageIdx) {
  859. this.#updatePage(previousPage);
  860. }
  861. }
  862. this.#updateUIState(state, this._state.findPrevious);
  863. if (this._selected.pageIdx !== -1) {
  864. // Ensure that the match will be scrolled into view.
  865. this._scrollMatches = true;
  866. this.#updatePage(this._selected.pageIdx);
  867. }
  868. }
  869. #onFindBarClose(evt) {
  870. const pdfDocument = this._pdfDocument;
  871. // Since searching is asynchronous, ensure that the removal of highlighted
  872. // matches (from the UI) is async too such that the 'updatetextlayermatches'
  873. // events will always be dispatched in the expected order.
  874. this._firstPageCapability.promise.then(() => {
  875. // Only update the UI if the document is open, and is the current one.
  876. if (
  877. !this._pdfDocument ||
  878. (pdfDocument && this._pdfDocument !== pdfDocument)
  879. ) {
  880. return;
  881. }
  882. // Ensure that a pending, not yet started, search operation is aborted.
  883. if (this._findTimeout) {
  884. clearTimeout(this._findTimeout);
  885. this._findTimeout = null;
  886. }
  887. // Abort any long running searches, to avoid a match being scrolled into
  888. // view *after* the findbar has been closed. In this case `this._offset`
  889. // will most likely differ from `this._selected`, hence we also ensure
  890. // that any new search operation will always start with a clean slate.
  891. if (this._resumePageIdx) {
  892. this._resumePageIdx = null;
  893. this._dirtyMatch = true;
  894. }
  895. // Avoid the UI being in a pending state when the findbar is re-opened.
  896. this.#updateUIState(FindState.FOUND);
  897. this._highlightMatches = false;
  898. this.#updateAllPages(); // Wipe out any previously highlighted matches.
  899. });
  900. }
  901. #requestMatchesCount() {
  902. const { pageIdx, matchIdx } = this._selected;
  903. let current = 0,
  904. total = this._matchesCountTotal;
  905. if (matchIdx !== -1) {
  906. for (let i = 0; i < pageIdx; i++) {
  907. current += this._pageMatches[i]?.length || 0;
  908. }
  909. current += matchIdx + 1;
  910. }
  911. // When searching starts, this method may be called before the `pageMatches`
  912. // have been counted (in `_calculateMatch`). Ensure that the UI won't show
  913. // temporarily broken state when the active find result doesn't make sense.
  914. if (current < 1 || current > total) {
  915. current = total = 0;
  916. }
  917. return { current, total };
  918. }
  919. #updateUIResultsCount() {
  920. this._eventBus.dispatch("updatefindmatchescount", {
  921. source: this,
  922. matchesCount: this.#requestMatchesCount(),
  923. });
  924. }
  925. #updateUIState(state, previous = false) {
  926. this._eventBus.dispatch("updatefindcontrolstate", {
  927. source: this,
  928. state,
  929. previous,
  930. matchesCount: this.#requestMatchesCount(),
  931. rawQuery: this._state?.query ?? null,
  932. });
  933. }
  934. }
  935. export { FindState, PDFFindController };