Node.js core modules offer a memory-efficient way to parse PDF documents using streams:
import * as fs from 'fs';
import { promisify } from 'util';
import * as zlib from 'zlib';
const unzipAsync = promisify(zlib.unzip);
// this regular expression finds the compressed data between the stream and endstream keywords
const pageRegExp = /(?<=>>stream\n).+?(?=\nendstream)/gsu;
const parsePDF = async (path) => {
  // cache data across chunks in case a compressed stream straddles the
  // boundary between two chunks
  let data = '';
  for await (const chunk of fs.createReadStream(path)) {
    // convert the chunk to a 'binary' (latin1) string so the regular
    // expression can search it without corrupting the byte values
    data += chunk.toString('binary');
    let sliceIndex = 0;
    for (const match of data.matchAll(pageRegExp)) {
      sliceIndex = match.index + match[0].length;
      // unzip the matched data and convert it to the correct encoding
      console.log(
        (await unzipAsync(Buffer.from(match[0], 'binary'))).toString('utf8')
      );
    }
    // drop the data we've already processed before reading the next chunk;
    // matchAll would otherwise match streams we've already seen
    if (sliceIndex) data = data.slice(sliceIndex);
  }
};
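A minimal usage sketch (the file name below is just a placeholder): since the unzipping happens inside parsePDF, any stream that is not actually deflate-compressed causes unzipAsync to reject and the error surfaces in the catch handler.

// minimal usage sketch; 'example.pdf' is a placeholder path
parsePDF('example.pdf')
  .then(() => console.log('done parsing'))
  // unzipAsync rejects on streams that are not deflate-compressed
  .catch((err) => console.error('failed to parse PDF:', err));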
This was inspired by an original gist written in Python.