Skip to content

Commit

Permalink
Fixed bug with source PDF data being edited
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Jan 6, 2025
1 parent e665648 commit a9b5ffc
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 4 deletions.
3 changes: 3 additions & 0 deletions js/containers/imageContainer.js
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ export class ImageCache {
/** @type {Array<ImageProperties>} */
static binaryProps = [];

/** @type {?ArrayBuffer} */
static pdfData = null;

/**
* @param {ImagePropertiesRequest} props
* @param {ImageWrapper} inputImage
Expand Down
9 changes: 7 additions & 2 deletions js/export/export.js
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,13 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
// An earlier version handled this by deleting the text in the source document;
// however, this produced output the user did not expect (a visual element disappeared).
try {
// The `save` function modifies the original PDF, so we need a new PDF object to avoid modifying the original.
const basePdfDataCopy = structuredClone(ImageCache.pdfData);
const basePdf = await w.openDocument(basePdfDataCopy, 'document.pdf');
// Make a new PDF with invisible text removed to avoid duplication.
// Making a new PDF object is also required as the `overlayDocuments` function modifies the input PDF in place.
const basePdfNoInvisData = await w.save({
doc1: w.pdfDoc, minpage: minPage, maxpage: maxPage, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, humanReadable: opt.humanReadablePDF, skipTextInvis: true,
doc1: basePdf, minpage: minPage, maxpage: maxPage, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, humanReadable: opt.humanReadablePDF, skipTextInvis: true,
});
const basePdfNoInvis = await w.openDocument(basePdfNoInvisData, 'document.pdf');
if (minPage > 0 || maxPage < inputData.pageCount - 1) {
Expand All @@ -91,6 +94,8 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
content = await w.save({
doc1: basePdfNoInvis, minpage: minPage, maxpage: maxPage, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, humanReadable: opt.humanReadablePDF,
});
w.freeDocument(basePdf);
w.freeDocument(basePdfNoInvis);
} catch (error) {
console.error('Failed to insert contents into input PDF, creating new PDF from rendered images instead.');
console.error(error);
Expand Down Expand Up @@ -138,7 +143,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
content = await w.save({
doc1: pdfBase, minpage: minPage, maxpage: maxPage, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, humanReadable: opt.humanReadablePDF,
});

w.freeDocument(pdfBase);
// Otherwise, there is only OCR data and not image data.
} else if (!insertInputPDF) {
content = await w.save({
Expand Down
4 changes: 2 additions & 2 deletions js/import/import.js
Original file line number Diff line number Diff line change
Expand Up @@ -285,10 +285,10 @@ export async function importFiles(files) {
// Start loading mupdf workers as soon as possible, without waiting for `pdfFile.arrayBuffer` (which can take a while).
ImageCache.getMuPDFScheduler();

const pdfFileData = pdfFile instanceof ArrayBuffer ? pdfFile : await pdfFile.arrayBuffer();
ImageCache.pdfData = pdfFile instanceof ArrayBuffer ? pdfFile : await pdfFile.arrayBuffer();

// If no XML data is provided, page sizes are calculated using muPDF alone
await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText);
await ImageCache.openMainPDF(ImageCache.pdfData, opt.omitNativeText);

pageCountImage = ImageCache.pageCount;
ImageCache.loadCount = ImageCache.pageCount;
Expand Down

0 comments on commit a9b5ffc

Please sign in to comment.