From bca6ba4a2c7a84c839584c92cb508a35a4114f3d Mon Sep 17 00:00:00 2001
From: angyonghaseyo
Date: Thu, 10 Oct 2024 09:41:12 +0800
Subject: [PATCH 1/8] Use isProcessibleUrl in preNavigationHooks and check
 contentType and contentDisposition in isProcessibleUrl

---
 package-lock.json           | 10 ++---
 package.json                |  2 +
 src/crawlers/crawlDomain.ts | 81 ++++++++++++++++++++++++++++---------
 3 files changed, 70 insertions(+), 23 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 251dafd9..6c6d8e9c 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -16,11 +16,13 @@
         "cheerio": "^1.0.0-rc.12",
         "crawlee": "^3.11.1",
         "ejs": "^3.1.9",
+        "file-type": "^19.5.0",
         "fs-extra": "^11.2.0",
         "glob": "^10.3.10",
         "inquirer": "^9.2.12",
         "jsdom": "^21.1.2",
         "lodash": "^4.17.21",
+        "mime-types": "^2.1.35",
         "minimatch": "^9.0.3",
         "pdfjs-dist": "github:veraPDF/pdfjs-dist#v2.14.305-taggedPdf-0.1.11",
         "playwright": "^1.44.1",
@@ -5513,10 +5515,9 @@
       }
     },
     "node_modules/file-type": {
-      "version": "19.4.1",
-      "resolved": "https://registry.npmjs.org/file-type/-/file-type-19.4.1.tgz",
-      "integrity": "sha512-RuWzwF2L9tCHS76KR/Mdh+DwJZcFCzrhrPXpOw6MlEfl/o31fjpTikzcKlYuyeV7e7ftdCGVJTNOCzkYD/aLbw==",
-      "license": "MIT",
+      "version": "19.5.0",
+      "resolved": "https://registry.npmjs.org/file-type/-/file-type-19.5.0.tgz",
+      "integrity": "sha512-dMuq6WWnP6BpQY0zYJNpTtQWgeCImSMG0BTIzUBXvxbwc1HWP/E7AE4UWU9XSCOPGJuOHda0HpDnwM2FW+d90A==",
       "dependencies": {
         "get-stream": "^9.0.1",
         "strtok3": "^8.1.0",
@@ -8087,7 +8088,6 @@
       "version": "2.1.35",
       "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
       "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
-      "license": "MIT",
       "dependencies": {
         "mime-db": "1.52.0"
       },
diff --git a/package.json b/package.json
index 29a79837..2e989ca9 100644
--- a/package.json
+++ b/package.json
@@ -11,11 +11,13 @@
     "cheerio": "^1.0.0-rc.12",
     "crawlee": "^3.11.1",
     "ejs": "^3.1.9",
+    "file-type": "^19.5.0",
     "fs-extra": "^11.2.0",
     "glob": "^10.3.10",
     "inquirer": "^9.2.12",
     "jsdom": "^21.1.2",
     "lodash": "^4.17.21",
+    "mime-types": "^2.1.35",
     "minimatch": "^9.0.3",
     "pdfjs-dist": "github:veraPDF/pdfjs-dist#v2.14.305-taggedPdf-0.1.11",
     "playwright": "^1.44.1",
diff --git a/src/crawlers/crawlDomain.ts b/src/crawlers/crawlDomain.ts
index 0798f731..b1b02d33 100644
--- a/src/crawlers/crawlDomain.ts
+++ b/src/crawlers/crawlDomain.ts
@@ -35,6 +35,8 @@ import { ViewportSettingsClass } from '../combine.js';
 import type { EnqueueLinksOptions, RequestOptions } from 'crawlee';
 import type { BatchAddRequestsResult } from '@crawlee/types';
 import axios from 'axios';
+import { fileTypeFromBuffer } from 'file-type';
+import mime from 'mime-types';
 
 const isBlacklisted = (url: string) => {
   const blacklistedPatterns = getBlackListedPatterns(null);
@@ -151,43 +153,76 @@ const crawlDomain = async (
       silentLogger.info('cache hit', url, httpHeadCache.get(url));
       return false; // return false to avoid processing the url again
     }
-
+
     try {
-      const response = await axios.head(url, { headers: { Authorization: authHeader } });
-      const contentType = response.headers['content-type'] || '';
-
-      if (!contentType.includes('text/html') && !contentType.includes('application/pdf')) {
-        silentLogger.info(`Skipping MIME type ${contentType} at URL ${url}`);
+      // Send a HEAD request to check headers without downloading the file
+      const headResponse = await axios.head(url, { headers: { Authorization: authHeader } });
+      const contentType = headResponse.headers['content-type'] || '';
+      const contentDisposition = headResponse.headers['content-disposition'] || '';
+
+      // Check if the response suggests it's a downloadable file based on the Content-Disposition header
+      if (contentDisposition.includes('attachment')) {
+        silentLogger.info(`Skipping URL due to attachment header: ${url}`);
         httpHeadCache.set(url, false);
         return false;
       }
-
-      // further check for zip files where the url ends with .zip
+
+      // Check if the MIME type suggests it's a downloadable file; PDFs remain processible
+      if ((contentType.startsWith('application/') && !contentType.includes('application/pdf')) || contentType.includes('octet-stream')) {
+        silentLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
+        httpHeadCache.set(url, false);
+        return false;
+      }
+
+      // Use the mime-types library to normalise the header and ensure the content is processible (e.g., HTML, plain text or PDF)
+      const mimeType = mime.contentType(contentType);
+      if (mimeType && !mimeType.startsWith('text/') && !mimeType.includes('application/pdf')) {
+        silentLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
+        httpHeadCache.set(url, false);
+        return false;
+      }
+
+      // Additional check for zip files by their magic number (PK\x03\x04)
       if (url.endsWith('.zip')) {
         silentLogger.info(`Checking for zip file magic number at URL ${url}`);
-        // download first 4 bytes of file to check the magic number
-        const response = await axios.get(url, {
+
+        // Download the first few bytes of the file to check for the magic number
+        const byteResponse = await axios.get(url, {
           headers: { Range: 'bytes=0-3', Authorization: authHeader },
+          responseType: 'arraybuffer'
         });
-        // check using startsWith because some server does not handle Range header and returns the whole file
-        if (response.data.startsWith('PK\x03\x04')) {
-          // PK\x03\x04 is the magic number for zip files
+
+        const magicNumber = byteResponse.data.toString('hex');
+        // Use startsWith because some servers ignore the Range header and return the whole file
+        if (magicNumber.startsWith('504b0304')) {
           silentLogger.info(`Skipping zip file at URL ${url}`);
           httpHeadCache.set(url, false);
           return false;
         } else {
-          // print out the hex value of the first 4 bytes
-          silentLogger.info(
-            `Not skipping ${url} as it has magic number: ${response.data.slice(0, 4).toString('hex')}`,
-          );
+          silentLogger.info(`Not skipping ${url}, magic number does not match ZIP file: ${magicNumber.slice(0, 8)}`);
         }
       }
+
+      // As a deeper check, download the first bytes of the content and let the file-type package detect the type from the bytes themselves
+      const response = await axios.get(url, {
+        headers: { Range: 'bytes=0-4100', Authorization: authHeader },
+        responseType: 'arraybuffer'
+      });
+
+      const fileType = await fileTypeFromBuffer(response.data);
+      if (fileType && fileType.mime !== 'application/pdf' && !fileType.mime.startsWith('text/')) {
+        silentLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
+        httpHeadCache.set(url, false);
+        return false;
+      }
+
     } catch (e) {
       silentLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
-      // when failing to check the MIME type (e.g. need to go through proxy), let crawlee handle the request
+      // If an error occurs (e.g., a network issue), assume the URL is processible
       httpHeadCache.set(url, true);
       return true;
     }
+
+    // If none of the conditions to skip are met, allow processing of the URL
     httpHeadCache.set(url, true);
     return true;
   };
@@ -511,11 +546,21 @@ const crawlDomain = async (
           Authorization: authHeader,
           ...extraHTTPHeaders,
         });
+        const processible = await isProcessibleUrl(request.url);
+        if (!processible) {
+          request.skipNavigation = true;
+          return;
+        }
       },
     ]
   : [
       async ({ request }) => {
         preNavigationHooks(extraHTTPHeaders);
+        const processible = await isProcessibleUrl(request.url);
+        if (!processible) {
+          request.skipNavigation = true;
+          return;
+        }
       },
     ],
 requestHandlerTimeoutSecs: 90, // Allow each page up to 90 seconds to be processed, up from the default 60

From ac0e153c3e1594384033252961e85ca860f4753a Mon Sep 17 00:00:00 2001
From: angyonghaseyo
Date: Thu, 10 Oct 2024 11:19:33 +0800
Subject: [PATCH 2/8] Add httpsAgent

---
 package-lock.json           | 6 ++++++
 package.json                | 1 +
 src/crawlers/crawlDomain.ts | 7 ++++++-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/package-lock.json b/package-lock.json
index 6c6d8e9c..fd2bb065 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -19,6 +19,7 @@
         "file-type": "^19.5.0",
         "fs-extra": "^11.2.0",
         "glob": "^10.3.10",
+        "https": "^1.0.0",
         "inquirer": "^9.2.12",
         "jsdom": "^21.1.2",
         "lodash": "^4.17.21",
@@ -6295,6 +6296,11 @@
         "node": ">=10.19.0"
       }
     },
+    "node_modules/https": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/https/-/https-1.0.0.tgz",
+      "integrity": "sha512-4EC57ddXrkaF0x83Oj8sM6SLQHAWXw90Skqu2M4AEWENZ3F02dFJE/GARA8igO79tcgYqGrD7ae4f5L3um2lgg=="
+    },
     "node_modules/https-proxy-agent": {
       "version": "5.0.1",
       "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
diff --git a/package.json b/package.json
index 2e989ca9..dc983bdd 100644
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
     "file-type": "^19.5.0",
     "fs-extra": "^11.2.0",
     "glob": "^10.3.10",
+    "https": "^1.0.0",
     "inquirer": "^9.2.12",
     "jsdom": "^21.1.2",
     "lodash": "^4.17.21",
diff --git a/src/crawlers/crawlDomain.ts b/src/crawlers/crawlDomain.ts
index b1b02d33..363cb6ef 100644
--- a/src/crawlers/crawlDomain.ts
+++ b/src/crawlers/crawlDomain.ts
@@ -37,6 +37,7 @@ import type { BatchAddRequestsResult } from '@crawlee/types';
 import axios from 'axios';
 import { fileTypeFromBuffer } from 'file-type';
 import mime from 'mime-types';
+import https from 'https';
 
 const isBlacklisted = (url: string) => {
   const blacklistedPatterns = getBlackListedPatterns(null);
@@ -101,6 +102,8 @@ const crawlDomain = async (
   const { playwrightDeviceDetailsObject } = viewportSettings;
   const isBlacklistedUrl = isBlacklisted(url);
 
+  const httpsAgent = new https.Agent({ rejectUnauthorized: false });
+
   if (isBlacklistedUrl) {
     guiInfoLog(guiInfoStatusTypes.SKIPPED, {
       numScanned: urlsCrawled.scanned.length,
@@ -205,7 +209,8 @@ const crawlDomain = async (
       // As a deeper check, download the first bytes of the content and let the file-type package detect the type from the bytes themselves
       const response = await axios.get(url, {
         headers: { Range: 'bytes=0-4100', Authorization: authHeader },
-        responseType: 'arraybuffer'
+        responseType: 'arraybuffer',
+        httpsAgent
       });

From 250d254594bcdd4d3e6faea57e2e8a5d1dcc6b65 Mon Sep 17 00:00:00 2001
From: angyonghaseyo
Date: Thu, 10 Oct 2024 11:24:35 +0800
Subject: [PATCH 3/8] Add httpsAgent for byteResponse

---
 src/crawlers/crawlDomain.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/crawlers/crawlDomain.ts b/src/crawlers/crawlDomain.ts
index 363cb6ef..8c6d8b7b 100644
--- a/src/crawlers/crawlDomain.ts
+++ b/src/crawlers/crawlDomain.ts
@@ -193,7 +193,8 @@ const crawlDomain = async (
         // Download the first few bytes of the file to check for the magic number
         const byteResponse = await axios.get(url, {
           headers: { Range: 'bytes=0-3', Authorization: authHeader },
-          responseType: 'arraybuffer'
+          responseType: 'arraybuffer',
+          httpsAgent
         });

From d75fc3be0948dd283359f8be92e992dbea17ab10 Mon Sep 17 00:00:00 2001
From: angyonghaseyo
Date: Thu, 10 Oct 2024 11:43:00 +0800
Subject: [PATCH 4/8] Add httpsAgent for headResponse

---
 src/crawlers/crawlDomain.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/crawlers/crawlDomain.ts b/src/crawlers/crawlDomain.ts
index 8c6d8b7b..e29a9afd 100644
--- a/src/crawlers/crawlDomain.ts
+++ b/src/crawlers/crawlDomain.ts
@@ -160,7 +160,7 @@ const crawlDomain = async (
 
     try {
       // Send a HEAD request to check headers without downloading the file
-      const headResponse = await axios.head(url, { headers: { Authorization: authHeader } });
+      const headResponse = await axios.head(url, { headers: { Authorization: authHeader }, httpsAgent });
       const contentType = headResponse.headers['content-type'] || '';
       const contentDisposition = headResponse.headers['content-disposition'] || '';

From b11d0840d0fb85a61ca68cff4c408ef829acf368 Mon Sep 17 00:00:00 2001
From: Wilson WeeSheng Khoo
Date: Thu, 10 Oct 2024 17:58:27 +0800
Subject: [PATCH 5/8] Disable error logging for failed MIME type checks in URL
 processing

---
 src/crawlers/crawlDomain.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/crawlers/crawlDomain.ts b/src/crawlers/crawlDomain.ts
index e29a9afd..58d28c91 100644
--- a/src/crawlers/crawlDomain.ts
+++ b/src/crawlers/crawlDomain.ts
@@ -222,7 +222,7 @@ const crawlDomain = async (
     } catch (e) {
-      silentLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
+      // silentLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
       // If an error occurs (e.g., a network issue), assume the URL is processible
       httpHeadCache.set(url, true);
       return true;

From c6459866e69b287675fe15f9b11d5437382f3da5 Mon Sep 17 00:00:00 2001
From: younglim
Date: Mon, 14 Oct 2024 17:36:12 +0800
Subject: [PATCH 6/8] Bump package version

---
 package-lock.json | 4 ++--
 package.json      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index fd2bb065..ac749a49 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@govtechsg/purple-hats",
-  "version": "0.10.15",
+  "version": "0.10.16",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@govtechsg/purple-hats",
-      "version": "0.10.15",
+      "version": "0.10.16",
       "license": "MIT",
       "dependencies": {
         "@json2csv/node": "^7.0.3",
diff --git a/package.json b/package.json
index dc983bdd..070b6387 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@govtechsg/purple-hats",
   "main": "dist/npmIndex.js",
-  "version": "0.10.15",
+  "version": "0.10.16",
   "type": "module",
   "dependencies": {
     "@json2csv/node": "^7.0.3",

From 28e73d12321d5ad42d4601cf950b192431912ede Mon Sep 17 00:00:00 2001
From: younglim
Date: Mon, 14 Oct 2024 17:49:54 +0800
Subject: [PATCH 7/8] Increase max memory allocation to 6GB to support very
 large scans

---
 package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package.json b/package.json
index 070b6387..3ca9ee80 100644
--- a/package.json
+++ b/package.json
@@ -74,7 +74,7 @@
     "build:watch": "npm run build -- --watch",
     "copyfiles": "node ./scripts/copyFiles.js src/static/ejs dist/static && node ./scripts/copyFiles.js src/constants/errorMeta.json dist/constants && node ./scripts/copyFiles.js exclusions.txt dist",
     "start": "node dist/index.js",
-    "cli": "node dist/cli.js",
+    "cli": "node --max-old-space-size=6144 dist/cli.js",
     "test": "node --experimental-vm-modules ./node_modules/.bin/jest",
     "lint": "eslint . --report-unused-disable-directives --max-warnings 0",
     "lint:fix": "eslint . --fix --report-unused-disable-directives --max-warnings 0"

From d4c3b94e293b667aa62b2cf70148ace3c812eb00 Mon Sep 17 00:00:00 2001
From: younglim
Date: Mon, 14 Oct 2024 17:51:31 +0800
Subject: [PATCH 8/8] Large scan support for npm start as well

---
 package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package.json b/package.json
index 3ca9ee80..c1e17a1f 100644
--- a/package.json
+++ b/package.json
@@ -73,7 +73,7 @@
     "build": "npm run copyfiles && tsc",
     "build:watch": "npm run build -- --watch",
     "copyfiles": "node ./scripts/copyFiles.js src/static/ejs dist/static && node ./scripts/copyFiles.js src/constants/errorMeta.json dist/constants && node ./scripts/copyFiles.js exclusions.txt dist",
-    "start": "node dist/index.js",
+    "start": "node --max-old-space-size=6144 dist/index.js",
     "cli": "node --max-old-space-size=6144 dist/cli.js",
     "test": "node --experimental-vm-modules ./node_modules/.bin/jest",
     "lint": "eslint . --report-unused-disable-directives --max-warnings 0",
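
Taken together, patches 1 through 4 gate every URL through isProcessibleUrl before navigation: a HEAD request screens the Content-Disposition and Content-Type headers, a ranged GET sniffs the leading bytes of suspicious URLs, and file-type classifies anything still ambiguous. The sketch below distils those checks into a standalone TypeScript helper for reference; it is not part of any patch, the name isLikelyProcessible is hypothetical, and it assumes only the axios, file-type, and Node https modules the series itself relies on.

import axios from 'axios';
import https from 'https';
import { fileTypeFromBuffer } from 'file-type';

// Mirrors the agent created in patch 2: self-signed certificates are tolerated.
const httpsAgent = new https.Agent({ rejectUnauthorized: false });

// Hypothetical standalone version of the checks in isProcessibleUrl.
const isLikelyProcessible = async (url: string): Promise<boolean> => {
  try {
    // Header-only screening: a HEAD request downloads no body.
    const head = await axios.head(url, { httpsAgent });
    const contentType = String(head.headers['content-type'] ?? '');
    const contentDisposition = String(head.headers['content-disposition'] ?? '');
    if (contentDisposition.includes('attachment')) return false; // explicit download
    if (contentType.startsWith('application/') && !contentType.includes('pdf')) return false;

    // Content sniffing: the bytes=0-4100 window matches what the patches
    // request, enough for file-type to identify common binary formats.
    const probe = await axios.get(url, {
      headers: { Range: 'bytes=0-4100' },
      responseType: 'arraybuffer',
      httpsAgent,
    });
    const detected = await fileTypeFromBuffer(probe.data);
    return !detected || detected.mime === 'application/pdf' || detected.mime.startsWith('text/');
  } catch {
    // On a network error, let the crawler attempt the URL, as the patches do.
    return true;
  }
};

As in the preNavigationHooks wiring added by patch 1, a caller would set request.skipNavigation = true when this returns false, so Crawlee never opens a browser page for the URL.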