diff --git a/package-lock.json b/package-lock.json index 50e889af..660df3d4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@govtechsg/purple-hats", - "version": "0.10.4", + "version": "0.10.5", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@govtechsg/purple-hats", - "version": "0.10.4", + "version": "0.10.5", "license": "MIT", "dependencies": { "@json2csv/node": "^7.0.3", diff --git a/package.json b/package.json index ffc1e847..9a6c12b6 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@govtechsg/purple-hats", "main": "dist/npmIndex.js", - "version": "0.10.4", + "version": "0.10.5", "type": "module", "imports": { "#root/*.js": "./dist/*.js" diff --git a/src/constants/common.ts b/src/constants/common.ts index 9c407465..92c5cf74 100644 --- a/src/constants/common.ts +++ b/src/constants/common.ts @@ -988,7 +988,7 @@ export const getLinksFromSitemap = async ( if (isLimitReached()) { break; } - if (childSitemapUrlText.endsWith('.xml')) { + if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) { await fetchUrls(childSitemapUrlText); // Recursive call for nested sitemaps } else { addToUrlList(childSitemapUrlText); // Add regular URLs to the list @@ -1020,7 +1020,7 @@ export const getLinksFromSitemap = async ( } const requestList = Object.values(urls); - + return requestList; }; @@ -1786,7 +1786,12 @@ function isValidHttpUrl(urlString) { } export const isFilePath = (url: string): boolean => { - return url.startsWith('file://') || url.startsWith('/'); + const driveLetterPattern = /^[A-Z]:/i; + const backslashPattern = /\\/; + return url.startsWith('file://') || + url.startsWith('/') || + driveLetterPattern.test(url) || + backslashPattern.test(url); }; export function convertLocalFileToPath(url: string): string { diff --git a/src/crawlers/crawlLocalFile.ts b/src/crawlers/crawlLocalFile.ts index 6478732e..ecfb45ce 100644 --- a/src/crawlers/crawlLocalFile.ts +++ 
b/src/crawlers/crawlLocalFile.ts @@ -7,7 +7,6 @@ import { failedRequestHandler, isUrlPdf, } from './commonCrawlerFunc.js'; - import constants, { guiInfoStatusTypes, basicAuthRegex } from '../constants/constants.js'; import { getLinksFromSitemap, @@ -74,9 +73,8 @@ const crawlLocalFile = async ( convertLocalFileToPath(sitemapUrl); // XML Files - if (!sitemapUrl.match(/\.xml$/i)) { + if (!(sitemapUrl.match(/\.xml$/i) || sitemapUrl.match(/\.txt$/i))) { linksFromSitemap = [new Request({ url: sitemapUrl })]; - // Non XML file } else { const username = ''; @@ -145,23 +144,13 @@ const crawlLocalFile = async ( uuidToPdfMapping[pdfFileName] = trimmedUrl; if (!isUrlPdf(request.url)) { - let browserUsed; - // Playwright only supports chromium,firefox and webkit thus hardcoded to chromium - if (browser === 'chromium') { - browserUsed = await playwright.chromium.launch(); - } else if (browser === 'firefox') { - browserUsed = await playwright.firefox.launch(); - } else if (browser === 'webkit') { - browserUsed = await playwright.webkit.launch(); - } else if (browser === 'chrome') { - browserUsed = await playwright.chromium.launch(); //chrome not supported, default to chromium - } else { - console.log('Browser not supported, please use chrome, chromium, firefox, webkit'); - console.log(' '); - return; - } - const context = await browserUsed.newContext(); - const page = await context.newPage(); + + const browserContext = await constants.launcher.launchPersistentContext('', { headless: process.env.CRAWLEE_HEADLESS === '1', ...getPlaywrightLaunchOptions(browser), }); + + const page = await browserContext.newPage(); request.url = convertPathToLocalFile(request.url); await page.goto(request.url); const results = await runAxeScript(includeScreenshots, page, randomToken, null);