Skip to content

Commit

Permalink
Fixes and performance improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
mrtcode committed May 29, 2024
1 parent 97a8acd commit 849a4e2
Show file tree
Hide file tree
Showing 15 changed files with 171 additions and 146 deletions.
58 changes: 57 additions & 1 deletion src/core/evaluator.js
Original file line number Diff line number Diff line change
Expand Up @@ -3026,6 +3026,62 @@ class PartialEvaluator {

let diagonal = rotation % 90 !== 0;

/**
 * Normalizes the text of a glyph for text extraction.
 *
 * Compatibility characters (ligatures such as U+FB01, presentation forms,
 * etc.) are expanded to their plain equivalents, while base letters followed
 * by combining marks are kept (or put back) in their precomposed form, so
 * that an accented letter stays a single character.
 *
 * @param {string} text - Unicode string produced for a single glyph.
 * @returns {string} The normalized string.
 */
function normalizeText(text) {
  // NFKC is "compatibility decomposition (the NFKD step) followed by
  // canonical composition". Doing the recomposition through the Unicode
  // algorithm instead of a hand-maintained table means that:
  //  - every precomposed letter is restored, not only a fixed list of
  //    lowercase Latin vowels (uppercase accented letters and Hangul
  //    syllables were previously left decomposed);
  //  - no lookup table has to be rebuilt and scanned for every glyph.
  // Characters whose decomposition is a compatibility mapping (for example
  // U+00B4 -> space + U+0301) are not recomposed, exactly as before.
  return text.normalize('NFKC');
}

if (
glyph.unicode !== ' ' &&
fontSize !== 0 &&
Expand All @@ -3034,7 +3090,7 @@ class PartialEvaluator {
) {
textChunk.chars.push({
// Decomposed ligatures, normalized Arabic characters
c: glyphUnicode,
c: normalizeText(glyphUnicode),
// Normalizes Arabic characters and other characters whose length remains 1, but preserves
// ligatures and more importantly avoids 'e\u00b4' being converted into 'e \u0301'
// which is quite common in Spanish author names and because of the space prevents
Expand Down
2 changes: 1 addition & 1 deletion src/core/module/content-rect.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { distance } from './lib/levenstein.js';
import { getPageLabel } from './page-label.js';
import { getCenterRect, getClusters, getRectCenter } from './utilities.js';
import { getCenterRect, getClusters, getRectCenter } from './util.js';

// TODO: Take into account horizontal pages

Expand Down
7 changes: 4 additions & 3 deletions src/core/module/link/annotation-overlays.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import { overlayDestinationsEqual } from './util.js';
import {
getPositionFromDestination,
overlayDestinationsEqual
} from './util.js';
import { getRectCenter, getSortIndex } from '../utilities.js';
getRectCenter,
getSortIndex,
} from '../util.js';

async function _getLinkAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex){
let overlays = [];
Expand Down
2 changes: 1 addition & 1 deletion src/core/module/link/link.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { getParsedOverlays } from './parsed-overlays.js';
import { getAnnotationOverlays } from './annotation-overlays.js';
import { getMatchedOverlays } from './matched-overlays.js';
import { overlaysIntersect } from '../utilities.js';
import { overlaysIntersect } from '../util.js';

export async function getRegularLinkOverlays(pdfDocument, structuredCharsProvider, pageIndex) {
let annotationOverlays = await getAnnotationOverlays(pdfDocument, structuredCharsProvider, pageIndex);
Expand Down
2 changes: 1 addition & 1 deletion src/core/module/link/matched-overlays.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { getSortIndex } from '../utilities.js';
import { getSortIndex } from '../util.js';

let labels = [
['figure', 'fig'],
Expand Down
2 changes: 1 addition & 1 deletion src/core/module/link/parsed-overlays.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { getRangeRects, getSortIndex } from '../utilities.js';
import { getRangeRects, getSortIndex } from '../util.js';

export async function getParsedOverlays(pdfDocument, structuredCharsProvider, pageIndex) {
let chars = await structuredCharsProvider(pageIndex);
Expand Down
63 changes: 1 addition & 62 deletions src/core/module/link/util.js
Original file line number Diff line number Diff line change
@@ -1,65 +1,4 @@
import { basicDeepEqual } from '../utilities.js';

/**
 * Resolves a PDF destination to a page index and a zero-size rect located at
 * the destination's target point.
 *
 * @param {Object} pdfDocument - Document exposing `pdfManager.ensureCatalog`
 *   and `getPage`.
 * @param {string|Array} dest - Either a named destination (string) that is
 *   looked up through the catalog, or an explicit destination array of the
 *   form `[pageRef, { name: fitType }, ...args]`.
 * @returns {Promise<{pageIndex: number, rects: number[][]}|undefined>}
 *   The position, or `undefined` when the document or destination is
 *   missing, the named destination cannot be resolved, or the destination
 *   is malformed or of an unknown fit type.
 */
export async function getPositionFromDestination(pdfDocument, dest) {
  if (!pdfDocument || !dest) {
    // No PDF document available or invalid destination provided.
    return;
  }

  let destArray = dest;
  // If the destination is a string, it's a named destination.
  // We'll need to resolve it to get the actual destination array.
  if (typeof dest === 'string') {
    destArray = await pdfDocument.pdfManager.ensureCatalog("getDestination", [dest]);
  }

  // Covers both an unresolved named destination and a malformed array
  // (no fit-type entry), which would otherwise throw on `.name` below.
  if (!Array.isArray(destArray) || !destArray[1]) {
    return;
  }

  const fitType = destArray[1].name;

  const pageIndex = await pdfDocument.pdfManager.ensureCatalog("getPageIndex", [destArray[0]]);
  const { rotate, view } = await pdfDocument.getPage(pageIndex);
  const width = view[2] - view[0];
  const height = view[3] - view[1];

  // For pages rotated by 90/270 degrees the visible height is the width.
  const changeOrientation = rotate % 180 !== 0;
  const pageHeight = changeOrientation ? width : height;

  // A coordinate may be `null` ("keep current value") or absent altogether
  // when the array is shorter than expected; both fall back to the default.
  const coordinate = (value, fallback) =>
    (typeof value === 'number' ? value : fallback);

  let x = 0;
  let y = 0;

  switch (fitType) {
    case "XYZ":
      x = coordinate(destArray[2], 0);
      y = coordinate(destArray[3], pageHeight);
      break;
    case "Fit":
    case "FitB":
      break;
    case "FitH":
    case "FitBH":
      y = coordinate(destArray[2], pageHeight);
      break;
    case "FitV":
    case "FitBV":
      x = coordinate(destArray[2], 0);
      break;
    case "FitR":
      // [left, bottom, right, top] -> use the top-left corner.
      x = coordinate(destArray[2], 0);
      y = coordinate(destArray[5], pageHeight);
      break;
    default:
      // Not a valid destination type.
      return;
  }

  return {
    pageIndex,
    rects: [[x, y, x, y]],
  };
}
import { basicDeepEqual } from '../util.js';

export function overlayDestinationsEqual(a, b) {
return (
Expand Down
24 changes: 20 additions & 4 deletions src/core/module/module.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@ import { extractReferences } from './reference-extractor.js';
import { getExistingOutline } from './outline-reader.js';
import { extractOutline } from './outline-extractor.js';
import { getContentRect } from './content-rect.js';
import { intersectRects, overlaysIntersect } from './utilities.js';
import { intersectRects, overlaysIntersect } from './util.js';

export class Module {
constructor(pdfDocument) {
this._pdfDocument = pdfDocument;
this._structuredCharsCache = new Map();
this._temporaryStructuredCharsCache = new Map();
this._initializePromise = new Promise((resolve) => {
this._initializePromiseResolve = resolve;
});
Expand All @@ -26,6 +27,22 @@ export class Module {
if (cached) {
return cached;
}

cached = this._temporaryStructuredCharsCache.get(pageIndex);
if (cached) {
if (this._contentRect) {
let chars = cached;
for (let char of chars) {
if (!intersectRects(this._contentRect, char.rect)) {
char.isolated = true;
}
}
this._structuredCharsCache.set(pageIndex, chars)
this._temporaryStructuredCharsCache.delete(pageIndex);
}
return cached;
}

let page = await this._pdfDocument.getPage(pageIndex);
let task = {
name: 'dummy-task',
Expand Down Expand Up @@ -71,10 +88,9 @@ export class Module {
char.isolated = true;
}
}
}

if (this._initialized && this._enableCache) {
this._structuredCharsCache.set(pageIndex, chars);
} else {
this._temporaryStructuredCharsCache.set(pageIndex, chars);
}

return chars;
Expand Down
2 changes: 1 addition & 1 deletion src/core/module/outline-extractor.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import {
getClosestDistance,
getSortIndex,
printOutline,
} from "./utilities.js";
} from "./util.js";

function charsToText(chars) {
let text = [];
Expand Down
71 changes: 7 additions & 64 deletions src/core/module/outline-reader.js
Original file line number Diff line number Diff line change
@@ -1,69 +1,12 @@
import { getSortIndex } from './utilities.js';
import { getPositionFromDestination, getSortIndex } from './util.js';

async function getPositionFromDestination(pdfDocument, dest) {
if (!pdfDocument || !dest) {
throw new Error("No PDF document available or invalid destination provided.");
}

let destArray;

// If the destination is a string, it's a named destination.
// We'll need to resolve it to get the actual destination array.
if (typeof dest === 'string') {
destArray = await pdfDocument.pdfManager.ensureCatalog("getDestination", [dest]);
if (!destArray) {
throw new Error(`Unable to resolve named destination: "${dest}"`);
}
} else {
destArray = dest;
}

const ref = destArray[0];
const pageNumber = await pdfDocument.pdfManager.ensureCatalog("getPageIndex", [ref]) + 1;
let { rotate, view } = await pdfDocument.getPage(pageNumber - 1);
let width = view[2] - view[0];
let height = view[3] - view[1];

let x = 0, y = 0;
const changeOrientation = rotate % 180 !== 0;
const pageHeight = (changeOrientation ? width : height);

switch (destArray[1].name) {
case "XYZ":
x = destArray[2] !== null ? destArray[2] : 0;
y = destArray[3] !== null ? destArray[3] : pageHeight;
break;
case "Fit":
case "FitB":
break;
case "FitH":
case "FitBH":
y = destArray[2] !== null ? destArray[2] : pageHeight;
break;
case "FitV":
case "FitBV":
x = destArray[2] !== null ? destArray[2] : 0;
break;
case "FitR":
x = destArray[2];
y = destArray[5];
break;
default:
console.error(`"${destArray[1].name}" is not a valid destination type.`);
return;
async function getSortIndexFromTitle(pdfDocument, structuredCharsProvider, title, dest){
let position = await getPositionFromDestination(pdfDocument, dest);
if (!position) {
return '';
}

return {
pageIndex: pageNumber - 1,
x,
y,
};
}

async function getSortIndexFromTitle(pdfDocument, structuredCharsProvider, title, dest){
let pos = await getPositionFromDestination(pdfDocument, dest);
// TODO: Optimize this because there is no need to get the structure in this case
let chars = await structuredCharsProvider(pos.pageIndex);
let chars = await structuredCharsProvider(position.pageIndex);

title = title.split('').filter(x => x !== ' ');

Expand All @@ -81,7 +24,7 @@ async function getSortIndexFromTitle(pdfDocument, structuredCharsProvider, title
}
}

return getSortIndex(pos.pageIndex, offset, 0);
return getSortIndex(position.pageIndex, offset, 0);
}

export async function getExistingOutline(pdfDocument, structuredCharsProvider) {
Expand Down
2 changes: 1 addition & 1 deletion src/core/module/page-label.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { getBoundingRect, getRectCenter } from './utilities.js';
import { getBoundingRect, getRectCenter } from './util.js';

function romanToInteger(str) {
// Check if the string is empty or mixed case
Expand Down
2 changes: 1 addition & 1 deletion src/core/module/reference-extractor.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import {
getCenterRect,
intersectRects,
getClusters
} from './utilities.js';
} from './util.js';
import { getRegularLinkOverlays } from './link/link.js';

function removeASCIISymbolsAndNumbers(inputString) {
Expand Down
2 changes: 1 addition & 1 deletion src/core/module/reference-matcher.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { getRectsFromChars, getSortIndex } from './utilities.js';
import { getRectsFromChars, getSortIndex } from './util.js';

function getPositionFromRects(chars, pageIndex) {
let chars1 = [];
Expand Down
Loading

0 comments on commit 849a4e2

Please sign in to comment.