From a3cd5bf5f80d4be9e87c187e9088e687ded6fc0c Mon Sep 17 00:00:00 2001
From: Yavor Ivanov
Date: Fri, 22 Mar 2024 15:23:15 +0200
Subject: [PATCH] feat: Add html parser

---
 src/detectors/transpilers/html/transpiler.ts | 83 ++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 src/detectors/transpilers/html/transpiler.ts

diff --git a/src/detectors/transpilers/html/transpiler.ts b/src/detectors/transpilers/html/transpiler.ts
new file mode 100644
index 000000000..b2c48867a
--- /dev/null
+++ b/src/detectors/transpilers/html/transpiler.ts
@@ -0,0 +1,83 @@
+import type { ReadStream } from "node:fs";
+import { Detail, SaxEventType, SAXParser, Tag as SaxTag } from "sax-wasm";
+import { finished } from "node:stream/promises";
+import fs from "node:fs/promises";
+import { createRequire } from "node:module";
+const require = createRequire(import.meta.url);
+
+// Cached bytes of the sax-wasm binary, filled in by initSaxWasm()
+let saxWasmBuffer: Buffer;
+
+/**
+ * Reads the sax-wasm WebAssembly binary shipped with the "sax-wasm" package.
+ *
+ * Once a read has completed, later calls return the cached buffer.
+ *
+ * @returns The content of "sax-wasm/lib/sax-wasm.wasm"
+ */
+async function initSaxWasm() {
+	if (!saxWasmBuffer) {
+		const saxPath = require.resolve("sax-wasm/lib/sax-wasm.wasm");
+		saxWasmBuffer = await fs.readFile(saxPath);
+	}
+
+	return saxWasmBuffer;
+}
+
+/**
+ * Streams HTML content through a sax-wasm parser and reports tag events.
+ *
+ * Only OpenTag and CloseTag events are requested from the parser.
+ *
+ * @param contentStream Readable stream providing the HTML content
+ * @param parseHandler Callback the parser invokes for every reported event
+ * @throws {Error} If the WASM module can not be prepared, or if reading or
+ *   parsing the content stream fails
+ */
+export async function parseHtml(contentStream: ReadStream, parseHandler: (type: SaxEventType, tag: Detail) => void) {
+	const options = { highWaterMark: 32 * 1024 }; // 32k chunks
+	const wasmBinary = await initSaxWasm();
+	const saxParser = new SAXParser(SaxEventType.OpenTag | SaxEventType.CloseTag, options);
+
+	saxParser.eventHandler = parseHandler;
+
+	// Instantiate and prepare the wasm for parsing
+	if (!await saxParser.prepareWasm(wasmBinary)) {
+		throw new Error("Unknown error during WASM Initialization");
+	}
+
+	// Feed every chunk of the content stream into the parser
+	contentStream.on("data", (chunk: Uint8Array) => {
+		try {
+			saxParser.write(chunk);
+		} catch (err) {
+			// Destroy the content stream so that the failure rejects the
+			// finished() promise below and bubbles up to our callers.
+			// Non-Error values are wrapped instead of re-thrown: a throw from
+			// inside this listener would not reach the awaiting caller.
+			contentStream.destroy(err instanceof Error ? err : new Error(String(err)));
+		}
+	});
+	await finished(contentStream);
+	saxParser.end();
+}
+
+/**
+ * Parses the given HTML content and logs the value of every opening and
+ * closing tag to the console.
+ *
+ * NOTE: Despite its name, this does not filter for script tags and does not
+ * return the tags it finds.
+ *
+ * @param contentStream Readable stream providing the HTML content
+ */
+export async function extractScriptTags(contentStream: ReadStream) {
+	await parseHtml(contentStream, (event, tag) => {
+		if (!(tag instanceof SaxTag)) {
+			return;
+		}
+		if (event === SaxEventType.OpenTag || event === SaxEventType.CloseTag) {
+			console.log(tag.value);
+		}
+	});
+}
\ No newline at end of file