/* Copyright 2012 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** @typedef {import("../src/display/api").PDFDocumentProxy} PDFDocumentProxy */
/** @typedef {import("./event_utils").EventBus} EventBus */
/** @typedef {import("./interfaces").IPDFLinkService} IPDFLinkService */
import { binarySearchFirstItem, scrollIntoView } from "./ui_utils.js";
import { createPromiseCapability } from "pdfjs-lib";
import { getCharacterType } from "./pdf_find_utils.js";
/**
 * Search states sent to the UI in "updatefindcontrolstate" events.
 */
const FindState = {
  FOUND: 0,
  NOT_FOUND: 1,
  WRAPPED: 2,
  PENDING: 3,
};

// Delay before a plain "find" (typing) event actually starts searching.
const FIND_TIMEOUT = 250; // ms
// Offsets applied when the selected match is scrolled into view.
const MATCH_SCROLL_OFFSET_TOP = -50; // px
const MATCH_SCROLL_OFFSET_LEFT = -400; // px

// Single characters that are replaced by an ASCII equivalent (possibly
// several characters long) before searching.
const CHARACTERS_TO_NORMALIZE = {
  "\u2010": "-", // Hyphen
  "\u2018": "'", // Left single quotation mark
  "\u2019": "'", // Right single quotation mark
  "\u201A": "'", // Single low-9 quotation mark
  "\u201B": "'", // Single high-reversed-9 quotation mark
  "\u201C": '"', // Left double quotation mark
  "\u201D": '"', // Right double quotation mark
  "\u201E": '"', // Double low-9 quotation mark
  "\u201F": '"', // Double high-reversed-9 quotation mark
  "\u00BC": "1/4", // Vulgar fraction one quarter
  "\u00BD": "1/2", // Vulgar fraction one half
  "\u00BE": "3/4", // Vulgar fraction three quarters
};

// These diacritics aren't considered as combining diacritics
// when searching in a document:
//   https://searchfox.org/mozilla-central/source/intl/unicharutil/util/is_combining_diacritic.py.
// The combining class definitions can be found:
//   https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
// Category 0 corresponds to [^\p{Mn}].
const DIACRITICS_EXCEPTION = new Set([
  // UNICODE_COMBINING_CLASS_KANA_VOICING
  // https://www.compart.com/fr/unicode/combining/8
  0x3099, 0x309a,
  // UNICODE_COMBINING_CLASS_VIRAMA (under 0xFFFF)
  // https://www.compart.com/fr/unicode/combining/9
  0x094d, 0x09cd, 0x0a4d, 0x0acd, 0x0b4d, 0x0bcd, 0x0c4d, 0x0ccd, 0x0d3b,
  0x0d3c, 0x0d4d, 0x0dca, 0x0e3a, 0x0eba, 0x0f84, 0x1039, 0x103a, 0x1714,
  0x1734, 0x17d2, 0x1a60, 0x1b44, 0x1baa, 0x1bab, 0x1bf2, 0x1bf3, 0x2d7f,
  0xa806, 0xa82c, 0xa8c4, 0xa953, 0xa9c0, 0xaaf6, 0xabed,
  // 91
  // https://www.compart.com/fr/unicode/combining/91
  0x0c56,
  // 129
  // https://www.compart.com/fr/unicode/combining/129
  0x0f71,
  // 130
  // https://www.compart.com/fr/unicode/combining/130
  0x0f72, 0x0f7a, 0x0f7b, 0x0f7c, 0x0f7d, 0x0f80,
  // 132
  // https://www.compart.com/fr/unicode/combining/132
  0x0f74,
]);
// String form of `DIACRITICS_EXCEPTION`; lazily initialized on first use.
let DIACRITICS_EXCEPTION_STR;

// One or more consecutive combining marks.
const DIACRITICS_REG_EXP = /\p{M}+/gu;
// Splits a query into: regexp metacharacters, punctuation, whitespace runs,
// combining marks and letters (one capturing group each, in that order).
const SPECIAL_CHARS_REG_EXP =
  /([.*+?^${}()|[\]\\])|(\p{P})|(\s+)|(\p{M})|(\p{L})/gu;
// Last (resp. first) character of a string that isn't a combining mark.
const NOT_DIACRITIC_FROM_END_REG_EXP = /([^\p{M}])\p{M}*$/u;
const NOT_DIACRITIC_FROM_START_REG_EXP = /^\p{M}*([^\p{M}])/u;

// The range [AC00-D7AF] corresponds to the Hangul syllables.
// The few other chars are some CJK Compatibility Ideographs.
const SYLLABLES_REG_EXP = /[\uAC00-\uD7AF\uFA6C\uFACF-\uFAD1\uFAD5-\uFAD7]+/g;
// Cache: syllable character -> length of its NFD decomposition.
const SYLLABLES_LENGTHS = new Map();
// When decomposed (in using NFD) the above syllables will start
// with one of the chars in this regexp.
// NOTE: this is a character-class *source string*, not a RegExp object.
const FIRST_CHAR_SYLLABLES_REG_EXP =
  "[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]";

// Cache: character -> its NFKC normalization.
const NFKC_CHARS_TO_NORMALIZE = new Map();

// Normalization regexps, compiled on first use (see `normalize`).
let noSyllablesRegExp = null;
let withSyllablesRegExp = null;
/**
 * Normalize a string for searching and record how indices in the normalized
 * string map back to indices in the original string.
 *
 * The text is decomposed with NFD; characters listed in
 * `CHARACTERS_TO_NORMALIZE` are replaced, some circled/width forms are
 * replaced by their NFKC form, a trailing "-\n" after a non-space character
 * is dropped, a "\n" after a CJK character is dropped, and any other "\n"
 * becomes a space.
 *
 * @param {string} text - The raw text (page content or search query).
 * @returns {Array} A triple `[normalized, positions, hasDiacritics]` where
 *   `positions` is a sorted array of `[indexInNormalized, shift]` pairs
 *   (consumed by `getOriginalIndex`) and `hasDiacritics` tells whether at
 *   least one combining mark was seen.
 */
function normalize(text) {
  // The diacritics in the text or in the query can be composed or not.
  // So we use a decomposed text using NFD (and the same for the query)
  // in order to be sure that diacritics are in the same order.

  // Collect syllables length and positions.
  const syllablePositions = [];
  let m;
  while ((m = SYLLABLES_REG_EXP.exec(text)) !== null) {
    let { index } = m;
    for (const char of m[0]) {
      let len = SYLLABLES_LENGTHS.get(char);
      if (!len) {
        len = char.normalize("NFD").length;
        SYLLABLES_LENGTHS.set(char, len);
      }
      syllablePositions.push([len, index++]);
    }
  }

  let normalizationRegex;
  if (syllablePositions.length === 0 && noSyllablesRegExp) {
    normalizationRegex = noSyllablesRegExp;
  } else if (syllablePositions.length > 0 && withSyllablesRegExp) {
    normalizationRegex = withSyllablesRegExp;
  } else {
    // Compile the regular expression for text normalization once.
    const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
    const toNormalizeWithNFKC =
      "\u2460-\u2473" + // Circled numbers.
      "\u24b6-\u24ff" + // Circled letters/numbers.
      "\u3244-\u32bf" + // Circled ideograms/numbers.
      "\u32d0-\u32fe" + // Circled ideograms.
      "\uff00-\uffef"; // Halfwidth, fullwidth forms.

    // 3040-309F: Hiragana
    // 30A0-30FF: Katakana
    const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])";

    // Capturing groups, in order: p1 replaceable chars, p2 NFKC chars,
    // p3 diacritics (optionally followed by "-\n"), p4 "X-\n", p5 CJK + "\n",
    // p6 "\n"; a 7th group is appended below.
    const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`;

    if (syllablePositions.length === 0) {
      // Most of the syllables belong to Hangul so there are no need
      // to search for them in a non-Hangul document.
      // We use the \0 in order to have the same number of groups.
      normalizationRegex = noSyllablesRegExp = new RegExp(
        regexp + "|(\\u0000)",
        "gum"
      );
    } else {
      normalizationRegex = withSyllablesRegExp = new RegExp(
        regexp + `|(${FIRST_CHAR_SYLLABLES_REG_EXP})`,
        "gum"
      );
    }
  }

  // The goal of this function is to normalize the string and
  // be able to get from an index in the new string the
  // corresponding index in the old string.
  // For example if we have: abCd12ef456gh where C is replaced by ccc
  // and numbers replaced by nothing (it's the case for diacritics), then
  // we'll obtain the normalized string: abcccdefgh.
  // So here the reverse map is: [0,1,2,2,2,3,6,7,11,12].

  // The goal is to obtain the array: [[0, 0], [3, -1], [4, -2],
  // [6, 0], [8, 3]].
  // which can be used like this:
  //  - let say that i is the index in new string and j the index
  //    the old string.
  //  - if i is in [0; 3[ then j = i + 0
  //  - if i is in [3; 4[ then j = i - 1
  //  - if i is in [4; 6[ then j = i - 2
  //  ...
  // Thanks to a binary search it's easy to know where is i and what's the
  // shift.
  // Let say that the last entry in the array is [x, s] and we have a
  // substitution at index y (old string) which will replace o chars by n chars.
  // Firstly, if o === n, then no need to add a new entry: the shift is
  // the same.
  // Secondly, if o < n, then we push the n - o elements:
  // [y - (s - 1), s - 1], [y - (s - 2), s - 2], ...
  // Thirdly, if o > n, then we push the element: [y - (s - n), o + s - n]

  // Collect diacritics length and positions.
  const rawDiacriticsPositions = [];
  while ((m = DIACRITICS_REG_EXP.exec(text)) !== null) {
    rawDiacriticsPositions.push([m[0].length, m.index]);
  }

  let normalized = text.normalize("NFD");
  const positions = [[0, 0]];
  let rawDiacriticsIndex = 0;
  let syllableIndex = 0;
  let shift = 0;
  let shiftOrigin = 0;
  let eol = 0;
  let hasDiacritics = false;

  normalized = normalized.replace(
    normalizationRegex,
    (match, p1, p2, p3, p4, p5, p6, p7, i) => {
      i -= shiftOrigin;
      if (p1) {
        // Maybe fractions or quotations mark...
        const replacement = CHARACTERS_TO_NORMALIZE[p1];
        const jj = replacement.length;
        for (let j = 1; j < jj; j++) {
          positions.push([i - shift + j, shift - j]);
        }
        shift -= jj - 1;
        return replacement;
      }

      if (p2) {
        // Use the NFKC representation to normalize the char.
        let replacement = NFKC_CHARS_TO_NORMALIZE.get(p2);
        if (!replacement) {
          replacement = p2.normalize("NFKC");
          NFKC_CHARS_TO_NORMALIZE.set(p2, replacement);
        }
        const jj = replacement.length;
        for (let j = 1; j < jj; j++) {
          positions.push([i - shift + j, shift - j]);
        }
        shift -= jj - 1;
        return replacement;
      }

      if (p3) {
        const hasTrailingDashEOL = p3.endsWith("\n");
        const len = hasTrailingDashEOL ? p3.length - 2 : p3.length;

        // Diacritics.
        hasDiacritics = true;
        let jj = len;
        if (i + eol === rawDiacriticsPositions[rawDiacriticsIndex]?.[1]) {
          // These marks were already separate chars in the original text.
          jj -= rawDiacriticsPositions[rawDiacriticsIndex][0];
          ++rawDiacriticsIndex;
        }

        for (let j = 1; j <= jj; j++) {
          // i is the position of the first diacritic
          // so (i - 1) is the position for the letter before.
          positions.push([i - 1 - shift + j, shift - j]);
        }
        shift -= jj;
        shiftOrigin += jj;

        if (hasTrailingDashEOL) {
          // Diacritics are followed by a -\n.
          // See comments in `if (p4)` block.
          i += len - 1;
          positions.push([i - shift + 1, 1 + shift]);
          shift += 1;
          shiftOrigin += 1;
          eol += 1;
          return p3.slice(0, len);
        }

        return p3;
      }

      if (p4) {
        // "X-\n" is removed because a hyphen at the end of a line
        // with not a space before is likely here to mark a break
        // in a word.
        // The \n isn't in the original text so here y = i, n = 1 and o = 2.
        positions.push([i - shift + 1, 1 + shift]);
        shift += 1;
        shiftOrigin += 1;
        eol += 1;
        return p4.charAt(0);
      }

      if (p5) {
        // An ideographic at the end of a line doesn't imply adding an extra
        // white space.
        positions.push([i - shift + 1, shift]);
        shiftOrigin += 1;
        eol += 1;
        return p5.charAt(0);
      }

      if (p6) {
        // eol is replaced by space: "foo\nbar" is likely equivalent to
        // "foo bar".
        positions.push([i - shift + 1, shift - 1]);
        shift -= 1;
        shiftOrigin += 1;
        eol += 1;
        return " ";
      }

      // p7
      if (i + eol === syllablePositions[syllableIndex]?.[1]) {
        // A syllable (1 char) is replaced with several chars (n) so
        // newCharsLen = n - 1.
        const newCharLen = syllablePositions[syllableIndex][0] - 1;
        ++syllableIndex;
        for (let j = 1; j <= newCharLen; j++) {
          positions.push([i - (shift - j), shift - j]);
        }
        shift -= newCharLen;
        shiftOrigin += newCharLen;
      }
      return p7;
    }
  );

  positions.push([normalized.length, shift]);

  return [normalized, positions, hasDiacritics];
}
// Determine the original, non-normalized, match index such that highlighting of
// search results is correct in the `textLayer` for strings containing e.g. "½"
// characters; essentially "inverting" the result of the `normalize` function.
/**
 * @param {Array<Array<number>>|null} diffs - The `[index, shift]` pairs
 *   produced by `normalize`, sorted by index; when falsy, `pos` and `len`
 *   are returned unchanged.
 * @param {number} pos - Start of the match in the normalized string.
 * @param {number} len - Length of the match in the normalized string.
 * @returns {Array<number>} `[originalPos, originalLen]`.
 */
function getOriginalIndex(diffs, pos, len) {
  if (!diffs) {
    return [pos, len];
  }

  const start = pos;
  const end = pos + len;

  // Find the last entry whose index is <= start...
  let i = binarySearchFirstItem(diffs, x => x[0] >= start);
  if (diffs[i][0] > start) {
    --i;
  }

  // ...and the last entry whose index is <= end (searching from `i` onwards).
  let j = binarySearchFirstItem(diffs, x => x[0] >= end, i);
  if (diffs[j][0] > end) {
    --j;
  }

  return [start + diffs[i][1], len + diffs[j][1] - diffs[i][1]];
}
/**
 * @typedef {Object} PDFFindControllerOptions
 * @property {IPDFLinkService} linkService - The navigation/linking service.
 * @property {EventBus} eventBus - The application event bus.
 */

/**
 * Provides search functionality to find a given string in a PDF document.
 *
 * The controller listens for "find" and "findbarclose" events on the event
 * bus, and reports back through "updatefindcontrolstate",
 * "updatefindmatchescount" and "updatetextlayermatches" events.
 */
class PDFFindController {
  /**
   * @param {PDFFindControllerOptions} options
   */
  constructor({ linkService, eventBus }) {
    this._linkService = linkService;
    this._eventBus = eventBus;

    this.#reset();
    eventBus._on("find", this.#onFind.bind(this));
    eventBus._on("findbarclose", this.#onFindBarClose.bind(this));
  }

  /** Whether matches should currently be highlighted. */
  get highlightMatches() {
    return this._highlightMatches;
  }

  /** Per page: start offsets of the matches, in the original page text. */
  get pageMatches() {
    return this._pageMatches;
  }

  /** Per page: lengths of the matches, parallel to `pageMatches`. */
  get pageMatchesLength() {
    return this._pageMatchesLength;
  }

  /** The currently selected match, as `{ pageIdx, matchIdx }`. */
  get selected() {
    return this._selected;
  }

  /** The state object of the most recent "find" event (or `null`). */
  get state() {
    return this._state;
  }

  /**
   * Set a reference to the PDF document in order to search it.
   * Note that searching is not possible if this method is not called.
   *
   * @param {PDFDocumentProxy} pdfDocument - The PDF document to search.
   */
  setDocument(pdfDocument) {
    if (this._pdfDocument) {
      this.#reset();
    }
    if (!pdfDocument) {
      return;
    }
    this._pdfDocument = pdfDocument;
    this._firstPageCapability.resolve();
  }

  /**
   * Handler for "find" events.
   * @param {Object} state - The find state (query, type, options).
   */
  #onFind(state) {
    if (!state) {
      return;
    }
    const pdfDocument = this._pdfDocument;
    const { type } = state;

    if (this._state === null || this.#shouldDirtyMatch(state)) {
      this._dirtyMatch = true;
    }
    this._state = state;
    if (type !== "highlightallchange") {
      this.#updateUIState(FindState.PENDING);
    }

    this._firstPageCapability.promise.then(() => {
      // If the document was closed before searching began, or if the search
      // operation was relevant for a previously opened document, do nothing.
      if (
        !this._pdfDocument ||
        (pdfDocument && this._pdfDocument !== pdfDocument)
      ) {
        return;
      }
      this.#extractText();

      const findbarClosed = !this._highlightMatches;
      const pendingTimeout = !!this._findTimeout;

      if (this._findTimeout) {
        clearTimeout(this._findTimeout);
        this._findTimeout = null;
      }
      if (!type) {
        // Trigger the find action with a small delay to avoid starting the
        // search when the user is still typing (saving resources).
        this._findTimeout = setTimeout(() => {
          this.#nextMatch();
          this._findTimeout = null;
        }, FIND_TIMEOUT);
      } else if (this._dirtyMatch) {
        // Immediately trigger searching for non-'find' operations, when the
        // current state needs to be reset and matches re-calculated.
        this.#nextMatch();
      } else if (type === "again") {
        this.#nextMatch();

        // When the findbar was previously closed, and `highlightAll` is set,
        // ensure that the matches on all active pages are highlighted again.
        if (findbarClosed && this._state.highlightAll) {
          this.#updateAllPages();
        }
      } else if (type === "highlightallchange") {
        // If there was a pending search operation, synchronously trigger a new
        // search *first* to ensure that the correct matches are highlighted.
        if (pendingTimeout) {
          this.#nextMatch();
        } else {
          this._highlightMatches = true;
        }
        this.#updateAllPages(); // Update the highlighting on all active pages.
      } else {
        this.#nextMatch();
      }
    });
  }

  /**
   * Scroll the given element into view, but only if it belongs to the
   * currently selected match and a scroll is pending.
   */
  scrollMatchIntoView({
    element = null,
    selectedLeft = 0,
    pageIndex = -1,
    matchIndex = -1,
  }) {
    if (!this._scrollMatches || !element) {
      return;
    } else if (matchIndex === -1 || matchIndex !== this._selected.matchIdx) {
      return;
    } else if (pageIndex === -1 || pageIndex !== this._selected.pageIdx) {
      return;
    }
    this._scrollMatches = false; // Ensure that scrolling only happens once.

    const spot = {
      top: MATCH_SCROLL_OFFSET_TOP,
      left: selectedLeft + MATCH_SCROLL_OFFSET_LEFT,
    };
    scrollIntoView(element, spot, /* scrollMatches = */ true);
  }

  /** (Re)initialize all the search state; also cancels a pending timeout. */
  #reset() {
    this._highlightMatches = false;
    this._scrollMatches = false;
    this._pdfDocument = null;
    this._pageMatches = [];
    this._pageMatchesLength = [];
    this._state = null;
    // Currently selected match.
    this._selected = {
      pageIdx: -1,
      matchIdx: -1,
    };
    // Where the find algorithm currently is in the document.
    this._offset = {
      pageIdx: null,
      matchIdx: null,
      wrapped: false,
    };
    this._extractTextPromises = [];
    this._pageContents = []; // Stores the normalized text for each page.
    this._pageDiffs = [];
    this._hasDiacritics = [];
    this._matchesCountTotal = 0;
    this._pagesToSearch = null;
    this._pendingFindMatches = new Set();
    this._resumePageIdx = null;
    this._dirtyMatch = false;
    clearTimeout(this._findTimeout);
    this._findTimeout = null;

    this._firstPageCapability = createPromiseCapability();
  }

  /**
   * @type {string} The (current) normalized search query.
   */
  get #query() {
    // Only re-normalize when the raw query actually changed.
    if (this._state.query !== this._rawQuery) {
      this._rawQuery = this._state.query;
      [this._normalizedQuery] = normalize(this._state.query);
    }
    return this._normalizedQuery;
  }

  /**
   * Decide whether the incoming find state requires the matches to be
   * re-calculated from scratch.
   * @param {Object} state - The incoming find state.
   * @returns {boolean}
   */
  #shouldDirtyMatch(state) {
    // When the search query changes, regardless of the actual search command
    // used, always re-calculate matches to avoid errors (fixes bug 1030622).
    if (state.query !== this._state.query) {
      return true;
    }
    switch (state.type) {
      case "again": {
        const pageNumber = this._selected.pageIdx + 1;
        const linkService = this._linkService;
        // Only treat a 'findagain' event as a new search operation when it's
        // *absolutely* certain that the currently selected match is no longer
        // visible, e.g. as a result of the user scrolling in the document.
        //
        // NOTE: If only a simple `this._linkService.page` check was used here,
        // there's a risk that consecutive 'findagain' operations could "skip"
        // over matches at the top/bottom of pages thus making them completely
        // inaccessible when there's multiple pages visible in the viewer.
        if (
          pageNumber >= 1 &&
          pageNumber <= linkService.pagesCount &&
          pageNumber !== linkService.page &&
          !linkService.isPageVisible(pageNumber)
        ) {
          return true;
        }
        return false;
      }
      case "highlightallchange":
        return false;
    }
    return true;
  }

  /**
   * Determine if the search query constitutes a "whole word", by comparing the
   * first/last character type with the preceding/following character type.
   */
  #isEntireWord(content, startIdx, length) {
    let match = content
      .slice(0, startIdx)
      .match(NOT_DIACRITIC_FROM_END_REG_EXP);
    if (match) {
      const first = content.charCodeAt(startIdx);
      const limit = match[1].charCodeAt(0);
      if (getCharacterType(first) === getCharacterType(limit)) {
        return false;
      }
    }

    match = content
      .slice(startIdx + length)
      .match(NOT_DIACRITIC_FROM_START_REG_EXP);
    if (match) {
      const last = content.charCodeAt(startIdx + length - 1);
      const limit = match[1].charCodeAt(0);
      if (getCharacterType(last) === getCharacterType(limit)) {
        return false;
      }
    }

    return true;
  }

  /**
   * Run the (global) regexp over the normalized page content and store the
   * match positions/lengths, mapped back to the original text, for the page.
   */
  #calculateRegExpMatch(query, entireWord, pageIndex, pageContent) {
    const matches = [],
      matchesLength = [];
    const diffs = this._pageDiffs[pageIndex];
    let match;
    while ((match = query.exec(pageContent)) !== null) {
      if (match[0].length === 0) {
        // `exec` doesn't advance `lastIndex` after a zero-length match, which
        // would make this loop spin forever; skip ahead manually.
        query.lastIndex++;
        continue;
      }
      if (
        entireWord &&
        !this.#isEntireWord(pageContent, match.index, match[0].length)
      ) {
        continue;
      }

      const [matchPos, matchLen] = getOriginalIndex(
        diffs,
        match.index,
        match[0].length
      );

      if (matchLen) {
        matches.push(matchPos);
        matchesLength.push(matchLen);
      }
    }
    this._pageMatches[pageIndex] = matches;
    this._pageMatchesLength[pageIndex] = matchesLength;
  }

  /**
   * Convert a normalized query into regexp source.
   * @param {string} query - The normalized query (or one word of it).
   * @param {boolean} hasDiacritics - Whether the page text has diacritics.
   * @returns {Array} `[isUnicode, source]`; `source` can be the empty string
   *   when every character of the query was stripped (e.g. only diacritics).
   */
  #convertToRegExpString(query, hasDiacritics) {
    const { matchDiacritics } = this._state;
    let isUnicode = false;
    query = query.replace(
      SPECIAL_CHARS_REG_EXP,
      (
        match,
        p1 /* to escape */,
        p2 /* punctuation */,
        p3 /* whitespaces */,
        p4 /* diacritics */,
        p5 /* letters */
      ) => {
        // We don't need to use a \s for whitespaces since all the different
        // kind of whitespaces are replaced by a single " ".

        if (p1) {
          // Escape characters like *+?... to not interfere with regexp syntax.
          return `[ ]*\\${p1}[ ]*`;
        }
        if (p2) {
          // Allow whitespaces around punctuation signs.
          return `[ ]*${p2}[ ]*`;
        }
        if (p3) {
          // Replace spaces by [ ]+ to be sure to match any number of spaces.
          return "[ ]+";
        }

        if (matchDiacritics) {
          return p4 || p5;
        }

        if (p4) {
          // Diacritics are removed with few exceptions.
          return DIACRITICS_EXCEPTION.has(p4.charCodeAt(0)) ? p4 : "";
        }

        // A letter has been matched and it can be followed by any diacritics
        // in normalized text.
        if (hasDiacritics) {
          isUnicode = true;
          return `${p5}\\p{M}*`;
        }
        return p5;
      }
    );

    const trailingSpaces = "[ ]*";
    if (query.endsWith(trailingSpaces)) {
      // The [ ]* has been added in order to help to match "foo . bar" but
      // it doesn't make sense to match some whitespaces after the dot
      // when it's the last character.
      query = query.slice(0, query.length - trailingSpaces.length);
    }

    if (matchDiacritics) {
      // aX must not match aXY.
      if (hasDiacritics) {
        DIACRITICS_EXCEPTION_STR ||= String.fromCharCode(
          ...DIACRITICS_EXCEPTION
        );

        isUnicode = true;
        query = `${query}(?=[${DIACRITICS_EXCEPTION_STR}]|[^\\p{M}]|$)`;
      }
    }

    return [isUnicode, query];
  }

  /**
   * Compute the matches for one page, then update highlighting, resume a
   * search that was waiting on this page, and update the match count.
   * @param {number} pageIndex
   */
  #calculateMatch(pageIndex) {
    let query = this.#query;
    if (query.length === 0) {
      // Do nothing: the matches should be wiped out already.
      return;
    }
    const { caseSensitive, entireWord, phraseSearch } = this._state;
    const pageContent = this._pageContents[pageIndex];
    const hasDiacritics = this._hasDiacritics[pageIndex];

    let isUnicode = false;
    if (phraseSearch) {
      [isUnicode, query] = this.#convertToRegExpString(query, hasDiacritics);
    } else {
      // Words are sorted in reverse order to be sure that "foobar" is matched
      // before "foo" in case the query is "foobar foo".
      const words = query.match(/\S+/g);
      if (words) {
        const alternatives = [];
        for (const word of words.sort().reverse()) {
          const [isUnicodePart, queryPart] = this.#convertToRegExpString(
            word,
            hasDiacritics
          );
          if (queryPart.length === 0) {
            // The word was entirely stripped (e.g. only diacritics); an empty
            // alternative would match the empty string everywhere.
            continue;
          }
          isUnicode ||= isUnicodePart;
          alternatives.push(`(${queryPart})`);
        }
        query = alternatives.join("|");
      }
    }

    if (query.length > 0) {
      const flags = `g${isUnicode ? "u" : ""}${caseSensitive ? "" : "i"}`;
      this.#calculateRegExpMatch(
        new RegExp(query, flags),
        entireWord,
        pageIndex,
        pageContent
      );
    } else {
      // Nothing left to search for once the query was converted: record "no
      // matches" rather than running an empty regexp (which never advances),
      // so that a search waiting on this page can still complete.
      this._pageMatches[pageIndex] = [];
      this._pageMatchesLength[pageIndex] = [];
    }

    // When `highlightAll` is set, ensure that the matches on previously
    // rendered (and still active) pages are correctly highlighted.
    if (this._state.highlightAll) {
      this.#updatePage(pageIndex);
    }
    if (this._resumePageIdx === pageIndex) {
      this._resumePageIdx = null;
      this.#nextPageMatch();
    }

    // Update the match count.
    const pageMatchesCount = this._pageMatches[pageIndex].length;
    if (pageMatchesCount > 0) {
      this._matchesCountTotal += pageMatchesCount;
      this.#updateUIResultsCount();
    }
  }

  /**
   * Start extracting (and normalizing) the text of every page, one page after
   * the other; `_extractTextPromises[i]` settles once page `i` is available.
   */
  #extractText() {
    // Perform text extraction once if this method is called multiple times.
    if (this._extractTextPromises.length > 0) {
      return;
    }

    let promise = Promise.resolve();
    for (let i = 0, ii = this._linkService.pagesCount; i < ii; i++) {
      const extractTextCapability = createPromiseCapability();
      this._extractTextPromises[i] = extractTextCapability.promise;

      promise = promise.then(() => {
        return this._pdfDocument
          .getPage(i + 1)
          .then(pdfPage => {
            return pdfPage.getTextContent();
          })
          .then(
            textContent => {
              const strBuf = [];

              for (const textItem of textContent.items) {
                strBuf.push(textItem.str);
                if (textItem.hasEOL) {
                  strBuf.push("\n");
                }
              }

              // Store the normalized page content (text items) as one string.
              [
                this._pageContents[i],
                this._pageDiffs[i],
                this._hasDiacritics[i],
              ] = normalize(strBuf.join(""));
              extractTextCapability.resolve();
            },
            reason => {
              console.error(
                `Unable to get text content for page ${i + 1}`,
                reason
              );
              // Page error -- assuming no text content.
              this._pageContents[i] = "";
              this._pageDiffs[i] = null;
              this._hasDiacritics[i] = false;
              extractTextCapability.resolve();
            }
          );
      });
    }
  }

  /** Ask the text layer of one page to refresh its match highlighting. */
  #updatePage(index) {
    if (this._scrollMatches && this._selected.pageIdx === index) {
      // If the page is selected, scroll the page into view, which triggers
      // rendering the page, which adds the text layer. Once the text layer
      // is built, it will attempt to scroll the selected match into view.
      this._linkService.page = index + 1;
    }

    this._eventBus.dispatch("updatetextlayermatches", {
      source: this,
      pageIndex: index,
    });
  }

  /** Ask the text layers of all pages to refresh (`pageIndex: -1`). */
  #updateAllPages() {
    this._eventBus.dispatch("updatetextlayermatches", {
      source: this,
      pageIndex: -1,
    });
  }

  /** Advance to the next (or previous) match, re-calculating if dirty. */
  #nextMatch() {
    const previous = this._state.findPrevious;
    const currentPageIndex = this._linkService.page - 1;
    const numPages = this._linkService.pagesCount;

    this._highlightMatches = true;

    if (this._dirtyMatch) {
      // Need to recalculate the matches, reset everything.
      this._dirtyMatch = false;
      this._selected.pageIdx = this._selected.matchIdx = -1;
      this._offset.pageIdx = currentPageIndex;
      this._offset.matchIdx = null;
      this._offset.wrapped = false;
      this._resumePageIdx = null;
      this._pageMatches.length = 0;
      this._pageMatchesLength.length = 0;
      this._matchesCountTotal = 0;

      this.#updateAllPages(); // Wipe out any previously highlighted matches.

      for (let i = 0; i < numPages; i++) {
        // Start finding the matches as soon as the text is extracted.
        if (this._pendingFindMatches.has(i)) {
          continue;
        }
        this._pendingFindMatches.add(i);
        this._extractTextPromises[i].then(() => {
          this._pendingFindMatches.delete(i);
          this.#calculateMatch(i);
        });
      }
    }

    // If there's no query there's no point in searching.
    if (this.#query === "") {
      this.#updateUIState(FindState.FOUND);
      return;
    }
    // If we're waiting on a page, we return since we can't do anything else.
    // NOTE: compare against `null` explicitly, since page index 0 is a valid
    // (but falsy) resume point.
    if (this._resumePageIdx !== null) {
      return;
    }

    const offset = this._offset;
    // Keep track of how many pages we should maximally iterate through.
    this._pagesToSearch = numPages;
    // If there's already a `matchIdx` that means we are iterating through a
    // page's matches.
    if (offset.matchIdx !== null) {
      const numPageMatches = this._pageMatches[offset.pageIdx].length;
      if (
        (!previous && offset.matchIdx + 1 < numPageMatches) ||
        (previous && offset.matchIdx > 0)
      ) {
        // The simple case; we just have advance the matchIdx to select
        // the next match on the page.
        offset.matchIdx = previous ? offset.matchIdx - 1 : offset.matchIdx + 1;
        this.#updateMatch(/* found = */ true);
        return;
      }
      // We went beyond the current page's matches, so we advance to
      // the next page.
      this.#advanceOffsetPage(previous);
    }
    // Start searching through the page.
    this.#nextPageMatch();
  }

  /**
   * Process the matches of the page at the current offset.
   * @param {Array<number>} matches
   * @returns {boolean} `true` when searching should stop (a match was
   *   selected, or every page was visited without finding one).
   */
  #matchesReady(matches) {
    const offset = this._offset;
    const numMatches = matches.length;
    const previous = this._state.findPrevious;

    if (numMatches) {
      // There were matches for the page, so initialize `matchIdx`.
      offset.matchIdx = previous ? numMatches - 1 : 0;
      this.#updateMatch(/* found = */ true);
      return true;
    }
    // No matches, so attempt to search the next page.
    this.#advanceOffsetPage(previous);
    if (offset.wrapped) {
      offset.matchIdx = null;
      if (this._pagesToSearch < 0) {
        // No point in wrapping again, there were no matches.
        this.#updateMatch(/* found = */ false);
        // While matches were not found, searching for a page
        // with matches should nevertheless halt.
        return true;
      }
    }
    // Matches were not found (and searching is not done).
    return false;
  }

  /** Walk the pages from the current offset until `#matchesReady` stops. */
  #nextPageMatch() {
    if (this._resumePageIdx !== null) {
      console.error("There can only be one pending page.");
    }

    let matches = null;
    do {
      const pageIdx = this._offset.pageIdx;
      matches = this._pageMatches[pageIdx];
      if (!matches) {
        // The matches don't exist yet for processing by `_matchesReady`,
        // so set a resume point for when they do exist.
        this._resumePageIdx = pageIdx;
        break;
      }
    } while (!this.#matchesReady(matches));
  }

  /** Move the offset one page forward/backward, wrapping at the ends. */
  #advanceOffsetPage(previous) {
    const offset = this._offset;
    const numPages = this._linkService.pagesCount;
    offset.pageIdx = previous ? offset.pageIdx - 1 : offset.pageIdx + 1;
    offset.matchIdx = null;

    this._pagesToSearch--;

    if (offset.pageIdx >= numPages || offset.pageIdx < 0) {
      offset.pageIdx = previous ? numPages - 1 : 0;
      offset.wrapped = true;
    }
  }

  /** Commit the current offset as the selected match and notify the UI. */
  #updateMatch(found = false) {
    let state = FindState.NOT_FOUND;
    const wrapped = this._offset.wrapped;
    this._offset.wrapped = false;

    if (found) {
      const previousPage = this._selected.pageIdx;
      this._selected.pageIdx = this._offset.pageIdx;
      this._selected.matchIdx = this._offset.matchIdx;
      state = wrapped ? FindState.WRAPPED : FindState.FOUND;

      // Update the currently selected page to wipe out any selected matches.
      if (previousPage !== -1 && previousPage !== this._selected.pageIdx) {
        this.#updatePage(previousPage);
      }
    }

    this.#updateUIState(state, this._state.findPrevious);
    if (this._selected.pageIdx !== -1) {
      // Ensure that the match will be scrolled into view.
      this._scrollMatches = true;

      this.#updatePage(this._selected.pageIdx);
    }
  }

  /** Handler for "findbarclose" events. */
  #onFindBarClose(evt) {
    const pdfDocument = this._pdfDocument;
    // Since searching is asynchronous, ensure that the removal of highlighted
    // matches (from the UI) is async too such that the 'updatetextlayermatches'
    // events will always be dispatched in the expected order.
    this._firstPageCapability.promise.then(() => {
      // Only update the UI if the document is open, and is the current one.
      if (
        !this._pdfDocument ||
        (pdfDocument && this._pdfDocument !== pdfDocument)
      ) {
        return;
      }
      // Ensure that a pending, not yet started, search operation is aborted.
      if (this._findTimeout) {
        clearTimeout(this._findTimeout);
        this._findTimeout = null;
      }
      // Abort any long running searches, to avoid a match being scrolled into
      // view *after* the findbar has been closed. In this case `this._offset`
      // will most likely differ from `this._selected`, hence we also ensure
      // that any new search operation will always start with a clean slate.
      // NOTE: page index 0 is a valid resume point, hence the `null` check.
      if (this._resumePageIdx !== null) {
        this._resumePageIdx = null;
        this._dirtyMatch = true;
      }
      // Avoid the UI being in a pending state when the findbar is re-opened.
      this.#updateUIState(FindState.FOUND);

      this._highlightMatches = false;
      this.#updateAllPages(); // Wipe out any previously highlighted matches.
    });
  }

  /**
   * @returns {Object} `{ current, total }`: 1-based rank of the selected match
   *   and total number of matches; both 0 when the pair is inconsistent.
   */
  #requestMatchesCount() {
    const { pageIdx, matchIdx } = this._selected;
    let current = 0,
      total = this._matchesCountTotal;
    if (matchIdx !== -1) {
      for (let i = 0; i < pageIdx; i++) {
        current += this._pageMatches[i]?.length || 0;
      }
      current += matchIdx + 1;
    }
    // When searching starts, this method may be called before the `pageMatches`
    // have been counted (in `_calculateMatch`). Ensure that the UI won't show
    // temporarily broken state when the active find result doesn't make sense.
    if (current < 1 || current > total) {
      current = total = 0;
    }
    return { current, total };
  }

  /** Dispatch the current match count to the UI. */
  #updateUIResultsCount() {
    this._eventBus.dispatch("updatefindmatchescount", {
      source: this,
      matchesCount: this.#requestMatchesCount(),
    });
  }

  /** Dispatch the given `FindState` (and match count, raw query) to the UI. */
  #updateUIState(state, previous = false) {
    this._eventBus.dispatch("updatefindcontrolstate", {
      source: this,
      state,
      previous,
      matchesCount: this.#requestMatchesCount(),
      rawQuery: this._state?.query ?? null,
    });
  }
}
- export { FindState, PDFFindController };
|