From 8358ab63b3a331cd90e06c35f26e8907cf3f7b81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Ribaudo?= Date: Tue, 14 Jan 2025 17:54:37 +0100 Subject: [PATCH] Allow searching for number-number on two lines When a dash separates two digits, it's very likely to not be a hyphen inserted to split a word into two lines (e.g. "par\n-ser"), but rather either a minus sign, a range, or a date. For example, in the tracemonkey PDF there is `2008-02` (a date) split across two lines. Preserving the dash, similarly to how we do for compound words, allows searches for "2008-02" to find a match. --- test/unit/pdf_find_controller_spec.js | 34 ++++++++++++ web/pdf_find_controller.js | 76 ++++++++++++++++----------- 2 files changed, 79 insertions(+), 31 deletions(-) diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index ad5dd49f81a5c..76ac0485fd5bf 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -1104,6 +1104,40 @@ describe("pdf_find_controller", function () { }); }); + it("performs a search with a dash between two digits", async () => { + const { eventBus, pdfFindController } = await initPdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "2008-02", + }, + matchesPerPage: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], + selectedMatch: { + pageIndex: 13, + matchIndex: 0, + }, + pageMatches: [[], [], [], [], [], [], [], [], [], [], [], [], [], [314]], + pageMatchesLength: [ + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [7], + ], + }); + }); + describe("custom matcher", () => { it("calls to the matcher with the right arguments", async () => { const QUERY = "Foo bar"; diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index fc6250c65c854..0b46117e9f986 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -117,10 +117,12 @@ function normalize(text) { } } + const hasSyllables = syllablePositions.length > 0; + let normalizationRegex; - if (syllablePositions.length === 0 && noSyllablesRegExp) { + if (!hasSyllables && noSyllablesRegExp) { normalizationRegex = noSyllablesRegExp; - } else if (syllablePositions.length > 0 && withSyllablesRegExp) { + } else if (hasSyllables && withSyllablesRegExp) { normalizationRegex = withSyllablesRegExp; } else { // Compile the regular expression for text normalization once. @@ -131,22 +133,33 @@ function normalize(text) { // 30A0-30FF: Katakana const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])"; const HKDiacritics = "(?:\u3099|\u309A)"; - const CompoundWord = "\\p{Ll}-\\n\\p{Lu}"; - const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(${CompoundWord})|(\\S-\\n)|(${CJK}\\n)|(\\n)`; - - if (syllablePositions.length === 0) { - // Most of the syllables belong to Hangul so there are no need - // to search for them in a non-Hangul document. - // We use the \0 in order to have the same number of groups. - normalizationRegex = noSyllablesRegExp = new RegExp( - regexp + "|(\\u0000)", - "gum" - ); + const BrokenWord = `\\p{Ll}-\\n(?=\\p{Ll})|\\p{Lu}-\\n(?=\\p{L})`; + + const regexps = [ + /* p1 */ `[${replace}]`, + /* p2 */ `[${toNormalizeWithNFKC}]`, + /* p3 */ `${HKDiacritics}\\n`, + /* p4 */ "\\p{M}+(?:-\\n)?", + /* p5 */ `${BrokenWord}`, + /* p6 */ "\\S-\\n", + /* p7 */ `${CJK}\\n`, + /* p8 */ "\\n", + /* p9 */ hasSyllables + ? FIRST_CHAR_SYLLABLES_REG_EXP + : // Most of the syllables belong to Hangul so there are no need + // to search for them in a non-Hangul document. + // We use the \0 in order to have the same number of groups. + "\\u0000", + ]; + normalizationRegex = new RegExp( + regexps.map(r => `(${r})`).join("|"), + "gum" + ); + + if (hasSyllables) { + withSyllablesRegExp = normalizationRegex; } else { - normalizationRegex = withSyllablesRegExp = new RegExp( - regexp + `|(${FIRST_CHAR_SYLLABLES_REG_EXP})`, - "gum" - ); + noSyllablesRegExp = normalizationRegex; } } @@ -281,26 +294,27 @@ function normalize(text) { } if (p5) { - // Compound word with a line break after the hyphen. - // Since the \n isn't in the original text, o = 3 and n = 3. - shiftOrigin += 1; - eol += 1; - return p5.replace("\n", ""); - } - - if (p6) { - // "X-\n" is removed because an hyphen at the end of a line - // with not a space before is likely here to mark a break - // in a word. + // In "X-\ny", "-\n" is removed because an hyphen at the end of a line + // between two letters is likely here to mark a break in a word. // If X is encoded with UTF-32 then it can have a length greater than 1. // The \n isn't in the original text so here y = i, n = X.len - 2 and // o = X.len - 1. - const len = p6.length - 2; + const len = p5.length - 2; positions.push(i - shift + len, 1 + shift); shift += 1; shiftOrigin += 1; eol += 1; - return p6.slice(0, -2); + return p5.slice(0, -2); + } + + if (p6) { + // A - following a non-space character that is not detected as the + // hyphen breaking a word in two lines needs to be preserved. It could + // be, for example, in a compound word or in a date. + // Only remove the newline. + shiftOrigin += 1; + eol += 1; + return p6.slice(0, -1); } if (p7) { @@ -324,7 +338,7 @@ function normalize(text) { return " "; } - // p8 + // p9 if (i + eol === syllablePositions[syllableIndex]?.[1]) { // A syllable (1 char) is replaced with several chars (n) so // newCharsLen = n - 1.