diff --git a/package-lock.json b/package-lock.json
index 59a049fc..d4e04529 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -16,11 +16,14 @@
         "cheerio": "^1.0.0-rc.12",
         "crawlee": "^3.11.1",
         "ejs": "^3.1.9",
+        "file-type": "^19.5.0",
         "fs-extra": "^11.2.0",
         "glob": "^10.3.10",
+        "https": "^1.0.0",
         "inquirer": "^9.2.12",
         "jsdom": "^21.1.2",
         "lodash": "^4.17.21",
+        "mime-types": "^2.1.35",
         "minimatch": "^9.0.3",
         "pdfjs-dist": "github:veraPDF/pdfjs-dist#v2.14.305-taggedPdf-0.1.11",
         "playwright": "^1.44.1",
@@ -5513,10 +5516,9 @@
       }
     },
     "node_modules/file-type": {
-      "version": "19.4.1",
-      "resolved": "https://registry.npmjs.org/file-type/-/file-type-19.4.1.tgz",
-      "integrity": "sha512-RuWzwF2L9tCHS76KR/Mdh+DwJZcFCzrhrPXpOw6MlEfl/o31fjpTikzcKlYuyeV7e7ftdCGVJTNOCzkYD/aLbw==",
-      "license": "MIT",
+      "version": "19.5.0",
+      "resolved": "https://registry.npmjs.org/file-type/-/file-type-19.5.0.tgz",
+      "integrity": "sha512-dMuq6WWnP6BpQY0zYJNpTtQWgeCImSMG0BTIzUBXvxbwc1HWP/E7AE4UWU9XSCOPGJuOHda0HpDnwM2FW+d90A==",
       "dependencies": {
         "get-stream": "^9.0.1",
         "strtok3": "^8.1.0",
@@ -6294,6 +6296,11 @@
         "node": ">=10.19.0"
       }
     },
+    "node_modules/https": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/https/-/https-1.0.0.tgz",
+      "integrity": "sha512-4EC57ddXrkaF0x83Oj8sM6SLQHAWXw90Skqu2M4AEWENZ3F02dFJE/GARA8igO79tcgYqGrD7ae4f5L3um2lgg=="
+    },
     "node_modules/https-proxy-agent": {
       "version": "5.0.1",
       "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
@@ -8087,7 +8094,6 @@
       "version": "2.1.35",
       "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
       "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
-      "license": "MIT",
       "dependencies": {
         "mime-db": "1.52.0"
       },
diff --git a/package.json b/package.json
index 07a93d8a..1256c8a1 100644
--- a/package.json
+++ b/package.json
@@ -11,11 +11,14 @@
     "cheerio": "^1.0.0-rc.12",
     "crawlee": "^3.11.1",
     "ejs": "^3.1.9",
+    "file-type": "^19.5.0",
     "fs-extra": "^11.2.0",
     "glob": "^10.3.10",
+    "https": "^1.0.0",
     "inquirer": "^9.2.12",
     "jsdom": "^21.1.2",
     "lodash": "^4.17.21",
+    "mime-types": "^2.1.35",
     "minimatch": "^9.0.3",
     "pdfjs-dist": "github:veraPDF/pdfjs-dist#v2.14.305-taggedPdf-0.1.11",
     "playwright": "^1.44.1",
@@ -70,8 +73,8 @@
     "build": "npm run copyfiles && tsc",
     "build:watch": "npm run build -- --watch",
     "copyfiles": "node ./scripts/copyFiles.js src/static/ejs dist/static && node ./scripts/copyFiles.js src/constants/errorMeta.json dist/constants && node ./scripts/copyFiles.js exclusions.txt dist",
-    "start": "node dist/index.js",
-    "cli": "node dist/cli.js",
+    "start": "node --max-old-space-size=6144 dist/index.js",
+    "cli": "node --max-old-space-size=6144 dist/cli.js",
     "test": "node --experimental-vm-modules ./node_modules/.bin/jest",
     "lint": "eslint . --report-unused-disable-directives --max-warnings 0",
     "lint:fix": "eslint . --fix --report-unused-disable-directives --max-warnings 0"
diff --git a/src/crawlers/crawlDomain.ts b/src/crawlers/crawlDomain.ts
index 0798f731..58d28c91 100644
--- a/src/crawlers/crawlDomain.ts
+++ b/src/crawlers/crawlDomain.ts
@@ -35,6 +35,9 @@ import { ViewportSettingsClass } from '../combine.js';
 import type { EnqueueLinksOptions, RequestOptions } from 'crawlee';
 import type { BatchAddRequestsResult } from '@crawlee/types';
 import axios from 'axios';
+import { fileTypeFromBuffer } from 'file-type';
+import mime from 'mime-types';
+import https from 'https';
 
 const isBlacklisted = (url: string) => {
   const blacklistedPatterns = getBlackListedPatterns(null);
@@ -99,6 +102,9 @@
   const { playwrightDeviceDetailsObject } = viewportSettings;
   const isBlacklistedUrl = isBlacklisted(url);
 
+  const httpsAgent = new https.Agent({ rejectUnauthorized: false });
+
+
   if (isBlacklistedUrl) {
     guiInfoLog(guiInfoStatusTypes.SKIPPED, {
       numScanned: urlsCrawled.scanned.length,
@@ -151,43 +157,78 @@
       silentLogger.info('cache hit', url, httpHeadCache.get(url));
       return false; // return false to avoid processing the url again
     }
-
+
     try {
-      const response = await axios.head(url, { headers: { Authorization: authHeader } });
-      const contentType = response.headers['content-type'] || '';
-
-      if (!contentType.includes('text/html') && !contentType.includes('application/pdf')) {
-        silentLogger.info(`Skipping MIME type ${contentType} at URL ${url}`);
+      // Send a HEAD request to check headers without downloading the file
+      const headResponse = await axios.head(url, { headers: { Authorization: authHeader }, httpsAgent });
+      const contentType = headResponse.headers['content-type'] || '';
+      const contentDisposition = headResponse.headers['content-disposition'] || '';
+
+      // Check if the response suggests it's a downloadable file based on Content-Disposition header
+      if (contentDisposition.includes('attachment')) {
+        silentLogger.info(`Skipping URL due to attachment header: ${url}`);
         httpHeadCache.set(url, false);
         return false;
       }
-
-      // further check for zip files where the url ends with .zip
+
+      // Check if the MIME type suggests it's a downloadable file
+      if (contentType.startsWith('application/') || contentType.includes('octet-stream')) {
+        silentLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
+        httpHeadCache.set(url, false);
+        return false;
+      }
+
+      // Use the mime-types library to ensure it's processible content (e.g., HTML or plain text)
+      const mimeType = mime.lookup(contentType);
+      if (mimeType && !mimeType.startsWith('text/html') && !mimeType.startsWith('text/')) {
+        silentLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
+        httpHeadCache.set(url, false);
+        return false;
+      }
+
+      // Additional check for zip files by their magic number (PK\x03\x04)
       if (url.endsWith('.zip')) {
         silentLogger.info(`Checking for zip file magic number at URL ${url}`);
-        // download first 4 bytes of file to check the magic number
-        const response = await axios.get(url, {
+
+        // Download the first few bytes of the file to check for the magic number
+        const byteResponse = await axios.get(url, {
           headers: { Range: 'bytes=0-3', Authorization: authHeader },
+          responseType: 'arraybuffer',
+          httpsAgent
         });
-        // check using startsWith because some server does not handle Range header and returns the whole file
-        if (response.data.startsWith('PK\x03\x04')) {
-          // PK\x03\x04 is the magic number for zip files
+
+        const magicNumber = byteResponse.data.toString('hex');
+        if (magicNumber === '504b0304') {
           silentLogger.info(`Skipping zip file at URL ${url}`);
           httpHeadCache.set(url, false);
           return false;
         } else {
-          // print out the hex value of the first 4 bytes
-          silentLogger.info(
-            `Not skipping ${url} as it has magic number: ${response.data.slice(0, 4).toString('hex')}`,
-          );
+          silentLogger.info(`Not skipping ${url}, magic number does not match ZIP file: ${magicNumber}`);
         }
       }
+
+      // If you want more robust checks, you can download a portion of the content and use the file-type package to detect file types by content
+      const response = await axios.get(url, {
+        headers: { Range: 'bytes=0-4100', Authorization: authHeader },
+        responseType: 'arraybuffer',
+        httpsAgent
+      });
+
+      const fileType = await fileTypeFromBuffer(response.data);
+      if (fileType && !fileType.mime.startsWith('text/html') && !fileType.mime.startsWith('text/')) {
+        silentLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
+        httpHeadCache.set(url, false);
+        return false;
+      }
+
     } catch (e) {
-      silentLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
-      // when failing to check the MIME type (e.g. need to go through proxy), let crawlee handle the request
+      // silentLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
+      // If an error occurs (e.g., a network issue), assume the URL is processible
       httpHeadCache.set(url, true);
       return true;
     }
+
+    // If none of the conditions to skip are met, allow processing of the URL
     httpHeadCache.set(url, true);
     return true;
   };
@@ -511,11 +552,21 @@
               Authorization: authHeader,
               ...extraHTTPHeaders,
             });
+            const processible = await isProcessibleUrl(request.url);
+            if (!processible) {
+              request.skipNavigation = true;
+              return null;
+            }
           },
         ]
       : [
           async ({ request }) => {
            preNavigationHooks(extraHTTPHeaders);
+            const processible = await isProcessibleUrl(request.url);
+            if (!processible) {
+              request.skipNavigation = true;
+              return null;
+            }
          },
        ],
    requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
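For reference, a minimal standalone sketch of the pre-crawl check that the crawlDomain.ts changes wire into isProcessibleUrl. The function name looksProcessible and its parameters are illustrative assumptions, not exports of this repository; it only mirrors the header, magic-number, and file-type probes added in the patch.

```ts
import axios from 'axios';
import https from 'https';
import { fileTypeFromBuffer } from 'file-type';

// Tolerate self-signed certificates, matching the httpsAgent introduced in the patch.
const httpsAgent = new https.Agent({ rejectUnauthorized: false });

// Returns false when the URL looks like a downloadable file (attachment, ZIP, other binary), true otherwise.
async function looksProcessible(url: string, authHeader = ''): Promise<boolean> {
  try {
    // 1. HEAD request: inspect Content-Type / Content-Disposition without downloading the body.
    const head = await axios.head(url, { headers: { Authorization: authHeader }, httpsAgent });
    const contentType = head.headers['content-type'] || '';
    const contentDisposition = head.headers['content-disposition'] || '';
    if (contentDisposition.includes('attachment')) return false;
    if (contentType.startsWith('application/') || contentType.includes('octet-stream')) return false;

    // 2. Ranged GET: sniff the first ~4 KB so file-type can identify binary formats by magic number.
    const probe = await axios.get(url, {
      headers: { Range: 'bytes=0-4100', Authorization: authHeader },
      responseType: 'arraybuffer',
      httpsAgent,
    });
    const buffer = Buffer.from(probe.data);
    if (buffer.subarray(0, 4).equals(Buffer.from('PK\x03\x04', 'binary'))) return false; // ZIP magic number
    const detected = await fileTypeFromBuffer(buffer);
    if (detected && !detected.mime.startsWith('text/')) return false;

    return true;
  } catch {
    // On network or proxy errors, fall through and let the crawler handle the request.
    return true;
  }
}
```

In the patch itself this check runs inside the preNavigationHooks, which set request.skipNavigation when the URL is judged not processible.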