From 3a7abd1524e46130798d8355858225f3753f71fa Mon Sep 17 00:00:00 2001
From: Ang Yong <89739997+angyonghaseyo@users.noreply.github.com>
Date: Wed, 19 Jun 2024 17:41:40 +0800
Subject: [PATCH] Fix sitemap scan bugs, add localFileScan, move report
 directory up one level (#364)

* Allow sitemap scans to recurse through all child sitemaps (previously only the last child sitemap's files were collected, instead of all of the files)
* Allow pdfScanFunc to handle both file paths and URLs (it uses fs for file paths and got for URLs)
* Prevent sitemap parsing from recursing infinitely
* Check whether each entry is a file path or a URL, and skip it otherwise without stopping the scan
* Added crawlLocalFile (-c 5)
* Added results and log folder names based on the local file name given
* Added try/catch around all of the JSON parsing for sitemaps
* Fix bug affecting scans of files with dot separators within the file name
* Allow all types of files to be scanned (VeraPDF for PDF files, axeScript for non-PDF files)
* Added typing for crawlLocalFile.ts
* Move reports directory to parent directory

---------

Co-authored-by: younglim
---
 INTEGRATION.md                           |   2 +-
 __tests__/mergeAxeResults.test.ts        |   2 +-
 gitlab-pipeline-template.yml             |  12 +-
 src/cli.ts                               |   6 +-
 src/combine.ts                           |  39 ++++-
 src/constants/cliFunctions.ts            |   6 +-
 src/constants/common.ts                  | 109 +++++++++---
 src/constants/constants.ts               |   3 +-
 src/constants/questions.ts               |   5 +-
 src/crawlers/commonCrawlerFunc.ts        |   9 +-
 src/crawlers/crawlDomain.ts              |   4 +-
 src/crawlers/crawlIntelligentSitemap.ts  |   1 +
 src/crawlers/crawlLocalFile.ts           | 201 +++++++++++++++++++++++
 src/crawlers/crawlSitemap.ts             | 154 +++++++++--------
 src/crawlers/pdfScanFunc.ts              |  31 +++-
 src/index.ts                             |   4 +-
 src/mergeAxeResults.ts                   |  18 +-
 src/screenshotFunc/htmlScreenshotFunc.ts |   2 +-
 src/screenshotFunc/pdfScreenshotFunc.ts  |   4 +
 src/utils.ts                             |   2 +-
 20 files changed, 477 insertions(+), 137 deletions(-)
 create mode 100644 src/crawlers/crawlLocalFile.ts

diff --git a/INTEGRATION.md b/INTEGRATION.md
index 6ea72ddd..619d404a 100644
--- a/INTEGRATION.md
+++ b/INTEGRATION.md
@@ -197,7 +197,7 @@ Create cypress.config.js with the following contents, and change yo
         return await purpleA11y.pushScanResults(res, metadata, elementsToClick);
       },
       returnResultsDir() {
-        return `results/${purpleA11y.randomToken}_${purpleA11y.scanDetails.urlsCrawled.scanned.length}pages/reports/report.html`;
+        return `results/${purpleA11y.randomToken}_${purpleA11y.scanDetails.urlsCrawled.scanned.length}pages/report.html`;
       },
       finishPurpleA11yTestCase() {
         purpleA11y.testThresholds();
diff --git a/__tests__/mergeAxeResults.test.ts b/__tests__/mergeAxeResults.test.ts
index 0a64ef3a..5f1110a6 100644
--- a/__tests__/mergeAxeResults.test.ts
+++ b/__tests__/mergeAxeResults.test.ts
@@ -56,7 +56,7 @@ beforeEach(() => {
   // Reports storagePath, expected report and compiled result files
   htmlFilename = 'report';
-  expectedHTMLFilename = `${expectedStoragePath}/reports/${htmlFilename}.html`;
+  expectedHTMLFilename = `${expectedStoragePath}/${htmlFilename}.html`;
 
   // Mock the JSON result generated from the issues
   dateTimeStamp = getFormattedTime();
 
diff --git a/gitlab-pipeline-template.yml b/gitlab-pipeline-template.yml
index f8ee8da3..2899d0dc 100644
--- a/gitlab-pipeline-template.yml
+++ b/gitlab-pipeline-template.yml
@@ -42,13 +42,13 @@ a11y-scan:
   artifacts:
     paths:
       # Stores the report CSV, HTML, summary PDF only to save storage space
-      - artifacts/reports/report.csv
-      - artifacts/reports/report.html
-      - artifacts/reports/scanDetails.csv
-      - artifacts/reports/summary.pdf
+      - artifacts/report.csv
+      - 
artifacts/report.html + - artifacts/scanDetails.csv + - artifacts/summary.pdf # Include screenhots folder - # - artifacts/reports/elemScreenshots/ + # - artifacts/elemScreenshots/ # Stores the reports folder so it can be accessed through Browse - # - artifacts/reports + # - artifacts/ # Uploads the results as zipped file # - $A11Y_SCAN_ARTIFACT_NAME diff --git a/src/cli.ts b/src/cli.ts index a12e19cc..91acf3a1 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -268,7 +268,7 @@ const scanInit = async (argvs: Answers): Promise => { printMessage([statuses.systemError.message], messageOptions); process.exit(res.status); case statuses.invalidUrl.code: - if (argvs.scanner !== ScannerTypes.SITEMAP) { + if (argvs.scanner !== ScannerTypes.SITEMAP && argvs.scanner !== ScannerTypes.LOCALFILE) { printMessage([statuses.invalidUrl.message], messageOptions); process.exit(res.status); } @@ -277,7 +277,7 @@ const scanInit = async (argvs: Answers): Promise => { file is a sitemap */ const finalFilePath = getFileSitemap(argvs.url); if (finalFilePath) { - argvs.isLocalSitemap = true; + argvs.isLocalFileScan = true; argvs.finalUrl = finalFilePath; if (process.env.VALIDATE_URL_PH_GUI) { console.log('Url is valid'); @@ -367,7 +367,7 @@ const optionsAnswer: Answers = { followRobots: options['followRobots'], customFlowLabel: options['customFlowLabel'], viewportWidth: options['viewportWidth'], - isLocalSitemap: options['isLocalSitemap'], + isLocalFileScan: options['isLocalFileScan'], exportDirectory: options['exportDirectory'], clonedBrowserDataDir: options['clonedBrowserDataDir'], specifiedMaxConcurrency: options['specifiedMaxConcurrency'], diff --git a/src/combine.ts b/src/combine.ts index e5916eca..5d8d5c11 100644 --- a/src/combine.ts +++ b/src/combine.ts @@ -1,6 +1,7 @@ import printMessage from 'print-message'; import crawlSitemap from './crawlers/crawlSitemap.js'; import crawlDomain from './crawlers/crawlDomain.js'; +import crawlLocalFile from './crawlers/crawlLocalFile.js'; import crawlIntelligentSitemap from './crawlers/crawlIntelligentSitemap.js'; import { generateArtifacts } from './mergeAxeResults.js'; import { getHost, createAndUpdateResultsFolders, createDetailsAndLogs } from './utils.js'; @@ -10,6 +11,7 @@ import { consoleLogger, silentLogger } from './logs.js'; import runCustom from './crawlers/runCustom.js'; import { alertMessageOptions } from './constants/cliFunctions.js'; import { Data } from './index.js'; +import { fileURLToPath, pathToFileURL } from 'url'; // Class exports @@ -42,7 +44,7 @@ const combineRun = async (details:Data, deviceToScan:string) => { viewportWidth, playwrightDeviceDetailsObject, maxRequestsPerCrawl, - isLocalSitemap, + isLocalFileScan, browser, userDataDirectory, strategy, @@ -60,7 +62,11 @@ const combineRun = async (details:Data, deviceToScan:string) => { process.env.CRAWLEE_LOG_LEVEL = 'ERROR'; process.env.CRAWLEE_STORAGE_DIR = randomToken; - const host = type === ScannerTypes.SITEMAP && isLocalSitemap ? '' : getHost(url); + const host = + (type === ScannerTypes.SITEMAP && isLocalFileScan) || + (type === ScannerTypes.LOCALFILE && isLocalFileScan) + ? '' + : getHost(url); let blacklistedPatterns:string[] | null = null; try { @@ -72,7 +78,10 @@ const combineRun = async (details:Data, deviceToScan:string) => { } // remove basic-auth credentials from URL - let finalUrl = urlWithoutAuth(url); + let finalUrl = (!(type === ScannerTypes.SITEMAP && isLocalFileScan || type === ScannerTypes.LOCALFILE && isLocalFileScan)) ? 
urlWithoutAuth(url) : new URL(pathToFileURL(url)); + + //Use the string version of finalUrl to reduce logic at submitForm + let finalUrlString = finalUrl.toString(); const scanDetails = { startTime: new Date(), @@ -80,7 +89,6 @@ const combineRun = async (details:Data, deviceToScan:string) => { crawlType: type, requestUrl: finalUrl, urlsCrawled: new UrlsCrawled(), - }; const viewportSettings:ViewportSettingsClass = new ViewportSettingsClass( @@ -119,6 +127,23 @@ const combineRun = async (details:Data, deviceToScan:string) => { ); break; + case ScannerTypes.LOCALFILE: + urlsCrawledObj = await crawlLocalFile( + url, + randomToken, + host, + viewportSettings, + maxRequestsPerCrawl, + browser, + userDataDirectory, + specifiedMaxConcurrency, + fileTypes, + blacklistedPatterns, + includeScreenshots, + extraHTTPHeaders, + ); + break; + case ScannerTypes.INTELLIGENT: urlsCrawledObj = await crawlIntelligentSitemap( url, @@ -168,6 +193,7 @@ const combineRun = async (details:Data, deviceToScan:string) => { scanDetails.endTime = new Date(); scanDetails.urlsCrawled = urlsCrawledObj; await createDetailsAndLogs(randomToken); + if (scanDetails.urlsCrawled) { if (scanDetails.urlsCrawled.scanned.length > 0) { await createAndUpdateResultsFolders(randomToken); const pagesNotScanned = [ @@ -192,7 +218,7 @@ const combineRun = async (details:Data, deviceToScan:string) => { browser, userDataDirectory, url, // scannedUrl - finalUrl.href, //entryUrl + new URL(finalUrlString).href, //entryUrl type, email, name, @@ -202,7 +228,8 @@ const combineRun = async (details:Data, deviceToScan:string) => { pagesNotScanned.length, metadata, ); - } else { + } +}else { printMessage([`No pages were scanned.`], alertMessageOptions); } }; diff --git a/src/constants/cliFunctions.ts b/src/constants/cliFunctions.ts index 1b69603d..696c487b 100644 --- a/src/constants/cliFunctions.ts +++ b/src/constants/cliFunctions.ts @@ -16,10 +16,10 @@ export const alertMessageOptions = { export const cliOptions: { [key: string]: Options } = { c: { alias: 'scanner', - describe: 'Type of scan, 1) sitemap, 2) website crawl, 3) custom flow, 4) intelligent', + describe: 'Type of scan, 1) sitemap, 2) website crawl, 3) custom flow, 4) intelligent 5) local file', requiresArg: true, coerce: option => { - const choices = ['sitemap', 'website', 'custom', 'intelligent']; + const choices = ['sitemap', 'website', 'custom', 'intelligent', 'localfile']; if (typeof option === 'number') { // Will also allow integer choices if (Number.isInteger(option) && option > 0 && option <= choices.length) { @@ -34,6 +34,8 @@ export const cliOptions: { [key: string]: Options } = { return ScannerTypes.WEBSITE; case 'custom': return ScannerTypes.CUSTOM; + case 'localfile': + return ScannerTypes.LOCALFILE; case 'intelligent': return ScannerTypes.INTELLIGENT; default: diff --git a/src/constants/common.ts b/src/constants/common.ts index 5b25c894..9c407465 100644 --- a/src/constants/common.ts +++ b/src/constants/common.ts @@ -15,7 +15,7 @@ import * as https from 'https'; import os from 'os'; import { minimatch } from 'minimatch'; import { globSync } from 'glob'; -import { LaunchOptions, devices, webkit } from 'playwright'; +import { LaunchOptions, devices, request, webkit } from 'playwright'; import printMessage from 'print-message'; import constants, { getDefaultChromeDataDir, @@ -30,6 +30,7 @@ import { silentLogger } from '../logs.js'; import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js'; import { randomThreeDigitNumberString } from '../utils.js'; import { Answers, Data } from 
'#root/index.js'; +import { fileURLToPath, pathToFileURL } from 'url'; // validateDirPath validates a provided directory path // returns null if no error @@ -228,8 +229,8 @@ export const getFileSitemap = (filePath: string): string | null => { } const file = fs.readFileSync(filePath, 'utf8'); - const isLocalSitemap = isSitemapContent(file); - return isLocalSitemap ? filePath : null; + const isLocalFileScan = isSitemapContent(file); + return isLocalFileScan || (file != undefined) ? filePath : null; }; export const getUrlMessage = (scanner: ScannerTypes): string => { @@ -239,7 +240,8 @@ export const getUrlMessage = (scanner: ScannerTypes): string => { return 'Please enter URL of website: '; case ScannerTypes.SITEMAP: return 'Please enter URL or file path to sitemap, or drag and drop a sitemap file here: '; - + case ScannerTypes.LOCALFILE: + return 'Please enter file path: '; default: return 'Invalid option'; } @@ -525,7 +527,10 @@ export const checkUrl = async ( } } - if (res.status === constants.urlCheckStatuses.success.code && scanner === ScannerTypes.SITEMAP) { + if ( + (res.status === constants.urlCheckStatuses.success.code && scanner === ScannerTypes.SITEMAP) || + (res.status === constants.urlCheckStatuses.success.code && scanner === ScannerTypes.LOCALFILE) + ) { const isSitemap = isSitemapContent(res.content); if (!isSitemap) { @@ -551,7 +556,7 @@ export const prepareData = async (argv: Answers): Promise => { playwrightDeviceDetailsObject, maxpages, strategy, - isLocalSitemap, + isLocalFileScan, finalUrl, browserToRun, nameEmail, @@ -568,7 +573,7 @@ export const prepareData = async (argv: Answers): Promise => { // construct filename for scan results const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' '); - const domain = argv.isLocalSitemap ? 'custom' : new URL(argv.url).hostname; + const domain = argv.isLocalFileScan ? path.basename(argv.url) : new URL(argv.url).hostname; const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : ''; let resultFilename: string; const randomThreeDigitNumber = randomThreeDigitNumberString(); @@ -594,7 +599,7 @@ export const prepareData = async (argv: Answers): Promise => { playwrightDeviceDetailsObject, maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl, strategy, - isLocalSitemap, + isLocalFileScan, browser: browserToRun, nameEmail, customFlowLabel, @@ -740,6 +745,7 @@ export const getLinksFromSitemap = async ( username: string, password: string, ) => { + const scannedSitemaps = new Set(); const urls = {}; // dictionary of requests to urls to be scanned const isLimitReached = () => Object.keys(urls).length >= maxLinksCount; @@ -753,7 +759,14 @@ export const getLinksFromSitemap = async ( ? 
(url = addBasicAuthCredentials(url, username, password)) : url; - const request = new Request({ url: url }); + url = convertPathToLocalFile(url); + + let request; + try { + request = new Request({ url: url }); + } catch (e) { + console.log('Error creating request', e); + } if (isUrlPdf(url)) { request.skipNavigation = true; } @@ -837,17 +850,41 @@ export const getLinksFromSitemap = async ( let sitemapType; let isBasicAuth = false; - const parsedUrl = new URL(url); let username = ''; let password = ''; - if (parsedUrl.username !== '' && parsedUrl.password !== '') { - isBasicAuth = true; - username = decodeURIComponent(parsedUrl.username); - password = decodeURIComponent(parsedUrl.password); - parsedUrl.username = ''; - parsedUrl.password = ''; - } + let parsedUrl; + + if (scannedSitemaps.has(url)) { + // Skip processing if the sitemap has already been scanned + return; + } + + scannedSitemaps.add(url); + + // Convert file if its not local file path + url = convertLocalFileToPath(url) + + // Check whether its a file path or a URL + if (isFilePath(url)) { + if (!fs.existsSync(url)) { + return; + } + parsedUrl = url; + } else if(isValidHttpUrl(url)){ + parsedUrl = new URL(url); + + if (parsedUrl.username !== '' && parsedUrl.password !== '') { + isBasicAuth = true; + username = decodeURIComponent(parsedUrl.username); + password = decodeURIComponent(parsedUrl.password); + parsedUrl.username = ''; + parsedUrl.password = ''; + } + } else{ + printMessage([`Invalid Url/Filepath: ${url}`], messageOptions); + return; + } const getDataUsingPlaywright = async () => { const browserContext = await constants.launcher.launchPersistentContext( @@ -859,9 +896,7 @@ export const getLinksFromSitemap = async ( ); const page = await browserContext.newPage(); - await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 }); - if (constants.launcher === webkit) { data = await page.locator('body').innerText(); } else { @@ -904,7 +939,11 @@ export const getLinksFromSitemap = async ( password: password, }, }); + try{ data = await (await instance.get(url, { timeout: 80000 })).data; + } catch(error){ + return; //to skip the error + } } catch (error) { if (error.code === 'ECONNABORTED') { await getDataUsingPlaywright(); @@ -912,6 +951,7 @@ export const getLinksFromSitemap = async ( } } } else { + url = convertLocalFileToPath(url); data = fs.readFileSync(url, 'utf8'); } const $ = cheerio.load(data, { xml: true }); @@ -944,11 +984,15 @@ export const getLinksFromSitemap = async ( case constants.xmlSitemapTypes.xmlIndex: silentLogger.info(`This is a XML format sitemap index.`); for (const childSitemapUrl of $('loc')) { + const childSitemapUrlText = $(childSitemapUrl).text(); if (isLimitReached()) { break; } - - await fetchUrls($(childSitemapUrl).text()); + if (childSitemapUrlText.endsWith('.xml')) { + await fetchUrls(childSitemapUrlText); // Recursive call for nested sitemaps + } else { + addToUrlList(childSitemapUrlText); // Add regular URLs to the list + } } break; case constants.xmlSitemapTypes.xml: @@ -1735,3 +1779,26 @@ export const waitForPageLoaded = async (page, timeout = 10000) => { new Promise((resolve) => setTimeout(resolve, timeout)) ]); } + +function isValidHttpUrl(urlString) { + const pattern = /^(http|https):\/\/[^ "]+$/; + return pattern.test(urlString); +} + +export const isFilePath = (url: string): boolean => { + return url.startsWith('file://') || url.startsWith('/'); +}; + +export function convertLocalFileToPath(url: string): string { + if (url.startsWith('file://')) { + url = fileURLToPath(url); + } + 
return url; +} + +export function convertPathToLocalFile(filePath: string): string { + if (filePath.startsWith("/")){ + filePath = pathToFileURL(filePath).toString(); + } + return filePath; +} \ No newline at end of file diff --git a/src/constants/constants.ts b/src/constants/constants.ts index 36671090..940026c2 100644 --- a/src/constants/constants.ts +++ b/src/constants/constants.ts @@ -34,7 +34,7 @@ export const blackListedFileExtensions = [ export const getIntermediateScreenshotsPath = (datasetsPath: string): string => `${datasetsPath}/screenshots`; export const destinationPath = (storagePath: string): string => - `${storagePath}/reports/screenshots`; + `${storagePath}/screenshots`; /** Get the path to Default Profile in the Chrome Data Directory * as per https://chromium.googlesource.com/chromium/src/+/master/docs/user_data_dir.md @@ -210,6 +210,7 @@ export enum ScannerTypes { WEBSITE = 'Website', CUSTOM = 'Custom', INTELLIGENT = 'Intelligent', + LOCALFILE = 'LocalFile', } export const guiInfoStatusTypes = { diff --git a/src/constants/questions.ts b/src/constants/questions.ts index 108301fa..2d971e9b 100644 --- a/src/constants/questions.ts +++ b/src/constants/questions.ts @@ -29,6 +29,7 @@ const startScanQuestions = [ { name: 'Website', value: ScannerTypes.WEBSITE }, { name: 'Custom', value: ScannerTypes.CUSTOM }, { name: 'Intelligent', value: ScannerTypes.INTELLIGENT }, + { name: 'Localfile', value: ScannerTypes.LOCALFILE}, ], }, { @@ -104,7 +105,7 @@ const startScanQuestions = [ case statuses.systemError.code: return statuses.systemError.message; case statuses.invalidUrl.code: - if (answers.scanner !== ScannerTypes.SITEMAP) { + if (answers.scanner !== (ScannerTypes.SITEMAP || ScannerTypes.LOCALFILE)) { return statuses.invalidUrl.message; } @@ -113,7 +114,7 @@ const startScanQuestions = [ file is a sitemap */ const finalFilePath = getFileSitemap(url); if (finalFilePath) { - answers.isLocalSitemap = true; + answers.isLocalFileScan = true; answers.finalUrl = finalFilePath; return true; } else { diff --git a/src/crawlers/commonCrawlerFunc.ts b/src/crawlers/commonCrawlerFunc.ts index f8f63f3c..1d57d916 100644 --- a/src/crawlers/commonCrawlerFunc.ts +++ b/src/crawlers/commonCrawlerFunc.ts @@ -5,6 +5,7 @@ import axe, { resultGroups } from 'axe-core'; import { axeScript, guiInfoStatusTypes, saflyIconSelector } from '../constants/constants.js'; import { guiInfoLog } from '../logs.js'; import { takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js'; +import { isFilePath } from '../constants/common.js'; // types type RuleDetails = { @@ -221,8 +222,12 @@ export const failedRequestHandler = async ({ request }) => { }; export const isUrlPdf = url => { - const parsedUrl = new URL(url); - return /\.pdf($|\?|#)/i.test(parsedUrl.pathname) || /\.pdf($|\?|#)/i.test(parsedUrl.href); + if(isFilePath(url)) { + return /\.pdf$/i.test(url); + } else { + const parsedUrl = new URL(url); + return /\.pdf($|\?|#)/i.test(parsedUrl.pathname) || /\.pdf($|\?|#)/i.test(parsedUrl.href); + } }; diff --git a/src/crawlers/crawlDomain.ts b/src/crawlers/crawlDomain.ts index 40a46dde..b3954f11 100644 --- a/src/crawlers/crawlDomain.ts +++ b/src/crawlers/crawlDomain.ts @@ -318,7 +318,7 @@ const crawlDomain = async ( launchContext: { launcher: constants.launcher, launchOptions: getPlaywrightLaunchOptions(browser), - // Bug in Chrome which causes brwoser pool crash when userDataDirectory is set in non-headless mode + // Bug in Chrome which causes browser pool crash when userDataDirectory is set in 
non-headless mode userDataDir: userDataDirectory ? process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory @@ -356,7 +356,7 @@ const crawlDomain = async ( preNavigationHooks(extraHTTPHeaders); }, ], - requestHandlerTimeoutSecs: 90, // Alow each page to be processed by up from default 60 seconds + requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds requestHandler: async ({ page, request, diff --git a/src/crawlers/crawlIntelligentSitemap.ts b/src/crawlers/crawlIntelligentSitemap.ts index 78204ecb..fe8f484f 100644 --- a/src/crawlers/crawlIntelligentSitemap.ts +++ b/src/crawlers/crawlIntelligentSitemap.ts @@ -128,6 +128,7 @@ import {chromium} from 'playwright'; url, dataset, //for crawlSitemap to add on to urlsCrawled, //for crawlSitemap to add on to + false, ) if (urlsCrawled.scanned.length < maxRequestsPerCrawl){ diff --git a/src/crawlers/crawlLocalFile.ts b/src/crawlers/crawlLocalFile.ts new file mode 100644 index 00000000..6478732e --- /dev/null +++ b/src/crawlers/crawlLocalFile.ts @@ -0,0 +1,201 @@ +import crawlee, { Request, RequestList } from 'crawlee'; +import printMessage from 'print-message'; +import { + createCrawleeSubFolders, + preNavigationHooks, + runAxeScript, + failedRequestHandler, + isUrlPdf, +} from './commonCrawlerFunc.js'; + +import constants, { guiInfoStatusTypes, basicAuthRegex } from '../constants/constants.js'; +import { + getLinksFromSitemap, + getPlaywrightLaunchOptions, + messageOptions, + isSkippedUrl, + isFilePath, + convertLocalFileToPath, + convertPathToLocalFile, +} from '../constants/common.js'; +import { areLinksEqual, isWhitelistedContentType } from '../utils.js'; +import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js'; +import fs from 'fs'; +import { guiInfoLog } from '../logs.js'; +import playwright from 'playwright'; +import path from 'path'; +import crawlSitemap from './crawlSitemap.js'; + +const crawlLocalFile = async ( + sitemapUrl: string, + randomToken: string, + host: string, + viewportSettings: any, + maxRequestsPerCrawl: number, + browser: string, + userDataDirectory: string, + specifiedMaxConcurrency: number, + fileTypes: string, + blacklistedPatterns: string[], + includeScreenshots: boolean, + extraHTTPHeaders: any, + fromCrawlIntelligentSitemap: boolean = false, //optional + userUrlInputFromIntelligent: any = null, //optional + datasetFromIntelligent: any = null, //optional + urlsCrawledFromIntelligent: any = null, //optional +) => { + let dataset: any; + let urlsCrawled: any; + let linksFromSitemap = []; + + // Boolean to omit axe scan for basic auth URL + let isBasicAuth: boolean; + let basicAuthPage: number = 0; + let finalLinks: Request[] = []; + + if (fromCrawlIntelligentSitemap) { + dataset = datasetFromIntelligent; + urlsCrawled = urlsCrawledFromIntelligent; + } else { + ({ dataset } = await createCrawleeSubFolders(randomToken)); + urlsCrawled = { ...constants.urlsCrawledObj }; + + if (!fs.existsSync(randomToken)) { + fs.mkdirSync(randomToken); + } + } + + // Check if the sitemapUrl is a local file and if it exists + if (!(isFilePath(sitemapUrl)) || !fs.existsSync(sitemapUrl)) { + return; + } + + // Checks if its in the right file format, and change it before placing into linksFromSitemap + convertLocalFileToPath(sitemapUrl); + + // XML Files + if (!sitemapUrl.match(/\.xml$/i)) { + linksFromSitemap = [new Request({ url: sitemapUrl })]; + + // Non XML file + } else { + const username = ''; + const password = ''; + + // Put it to crawlSitemap function to handle xml 
files + const updatedUrlsCrawled = await crawlSitemap( + sitemapUrl, + randomToken, + host, + viewportSettings, + maxRequestsPerCrawl, + browser, + userDataDirectory, + specifiedMaxConcurrency, + fileTypes, + blacklistedPatterns, + includeScreenshots, + extraHTTPHeaders, + (fromCrawlIntelligentSitemap = false), //optional + (userUrlInputFromIntelligent = null), //optional + (datasetFromIntelligent = null), //optional + (urlsCrawledFromIntelligent = null), //optional + true, + ); + + urlsCrawled = { ...urlsCrawled, ...updatedUrlsCrawled }; + return urlsCrawled; + } + + try { + sitemapUrl = encodeURI(sitemapUrl); + } catch (e) { + console.log(e); + } + + if (basicAuthRegex.test(sitemapUrl)) { + isBasicAuth = true; + // request to basic auth URL to authenticate for browser session + finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` })); + const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`; + // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned + finalLinks.push(new Request({ url: finalUrl })); + basicAuthPage = -2; + } + + let uuidToPdfMapping: Record = {}; //key and value of string type + const isScanHtml: boolean = ['all', 'html-only'].includes(fileTypes); + + printMessage(['Fetching URLs. This might take some time...'], { border: false }); + + finalLinks = [...finalLinks, ...linksFromSitemap]; + + const requestList = await RequestList.open({ + sources: finalLinks, + }); + + printMessage(['Fetch URLs completed. Beginning scan'], messageOptions); + + const request = linksFromSitemap[0]; + const pdfFileName = path.basename(request.url); + const trimmedUrl: string = request.url; + const destinationFilePath: string = `${randomToken}/${pdfFileName}`; + const data: Buffer = fs.readFileSync(trimmedUrl); + fs.writeFileSync(destinationFilePath, data); + uuidToPdfMapping[pdfFileName] = trimmedUrl; + + if (!isUrlPdf(request.url)) { + let browserUsed; + // Playwright only supports chromium,firefox and webkit thus hardcoded to chromium + if (browser === 'chromium') { + browserUsed = await playwright.chromium.launch(); + } else if (browser === 'firefox') { + browserUsed = await playwright.firefox.launch(); + } else if (browser === 'webkit') { + browserUsed = await playwright.webkit.launch(); + } else if (browser === 'chrome') { + browserUsed = await playwright.chromium.launch(); //chrome not supported, default to chromium + } else { + console.log('Browser not supported, please use chrome, chromium, firefox, webkit'); + console.log(' '); + return; + } + const context = await browserUsed.newContext(); + const page = await context.newPage(); + request.url = convertPathToLocalFile(request.url); + await page.goto(request.url); + const results = await runAxeScript(includeScreenshots, page, randomToken, null); + + guiInfoLog(guiInfoStatusTypes.SCANNED, { + numScanned: urlsCrawled.scanned.length, + urlScanned: request.url, + }); + + urlsCrawled.scanned.push({ + url: request.url, + pageTitle: results.pageTitle, + actualUrl: request.loadedUrl, // i.e. actualUrl + }); + + urlsCrawled.scannedRedirects.push({ + fromUrl: request.url, + toUrl: request.loadedUrl, // i.e. 
actualUrl + }); + + results.url = request.url; + // results.actualUrl = request.loadedUrl; + + await dataset.pushData(results); + } else { + urlsCrawled.scanned.push({ url: trimmedUrl, pageTitle: pdfFileName }); + + await runPdfScan(randomToken); + // transform result format + const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping); + + // push results for each pdf document to key value store + await Promise.all(pdfResults.map(result => dataset.pushData(result))); + } + return urlsCrawled; +}; +export default crawlLocalFile; \ No newline at end of file diff --git a/src/crawlers/crawlSitemap.ts b/src/crawlers/crawlSitemap.ts index aa110847..485c51b6 100644 --- a/src/crawlers/crawlSitemap.ts +++ b/src/crawlers/crawlSitemap.ts @@ -1,4 +1,4 @@ -import crawlee, { Request,RequestList } from 'crawlee'; +import crawlee, { Request, RequestList } from 'crawlee'; import printMessage from 'print-message'; import { createCrawleeSubFolders, @@ -16,6 +16,7 @@ import { isSkippedUrl, urlWithoutAuth, waitForPageLoaded, + isFilePath, } from '../constants/common.js'; import { areLinksEqual, isWhitelistedContentType } from '../utils.js'; import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js'; @@ -27,7 +28,7 @@ const crawlSitemap = async ( randomToken, host, viewportSettings, - maxRequestsPerCrawl, + maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, @@ -39,71 +40,86 @@ const crawlSitemap = async ( userUrlInputFromIntelligent = null, //optional datasetFromIntelligent = null, //optional urlsCrawledFromIntelligent = null, //optional - + crawledFromLocalFile = false, //optional ) => { let dataset; let urlsCrawled; - let linksFromSitemap + let linksFromSitemap; - // Boolean to omit axe scan for basic auth URL let isBasicAuth; let basicAuthPage = 0; - let finalLinks = []; - let authHeader = ""; - - if (fromCrawlIntelligentSitemap){ - dataset=datasetFromIntelligent; + let finalLinks = []; + let authHeader = ''; + + if (fromCrawlIntelligentSitemap) { + dataset = datasetFromIntelligent; urlsCrawled = urlsCrawledFromIntelligent; - } else { ({ dataset } = await createCrawleeSubFolders(randomToken)); urlsCrawled = { ...constants.urlsCrawledObj }; - + if (!fs.existsSync(randomToken)) { fs.mkdirSync(randomToken); } } - const parsedUrl = new URL(sitemapUrl); - let username = "" - let password = ""; - if (parsedUrl.username !=="" && parsedUrl.password !=="") { - isBasicAuth = true; - username = decodeURIComponent(parsedUrl.username); - password = decodeURIComponent(parsedUrl.password); + let parsedUrl; + let username = ''; + let password = ''; - // Create auth header - authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`; + if (!crawledFromLocalFile && isFilePath(sitemapUrl)) { + console.log('Local file crawling not supported for sitemap. 
Please provide a valid URL.'); + return; + } + + if (isFilePath(sitemapUrl)) { + parsedUrl = sitemapUrl; + } else { + parsedUrl = new URL(sitemapUrl); + if (parsedUrl.username !== '' && parsedUrl.password !== '') { + isBasicAuth = true; + username = decodeURIComponent(parsedUrl.username); + password = decodeURIComponent(parsedUrl.password); - parsedUrl.username = ""; - parsedUrl.password = ""; + // Create auth header + authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`; + parsedUrl.username = ''; + parsedUrl.password = ''; + } } - linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, username, password) - + linksFromSitemap = await getLinksFromSitemap( + sitemapUrl, + maxRequestsPerCrawl, + browser, + userDataDirectory, + userUrlInputFromIntelligent, + fromCrawlIntelligentSitemap, + username, + password, + ); /** * Regex to match http://username:password@hostname.com * utilised in scan strategy to ensure subsequent URLs within the same domain are scanned. * First time scan with original `url` containing credentials is strictly to authenticate for browser session * subsequent URLs are without credentials. * basicAuthPage is set to -1 for basic auth URL to ensure it is not counted towards maxRequestsPerCrawl - */ + */ + + sitemapUrl = encodeURI(sitemapUrl); - sitemapUrl = encodeURI(sitemapUrl) - if (isBasicAuth) { // request to basic auth URL to authenticate for browser session finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` })); const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`; - + // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned finalLinks.push(new Request({ url: finalUrl })); basicAuthPage = -2; - } - - + } + let pdfDownloads = []; let uuidToPdfMapping = {}; const isScanHtml = ['all', 'html-only'].includes(fileTypes); @@ -111,10 +127,7 @@ const crawlSitemap = async ( const { playwrightDeviceDetailsObject } = viewportSettings; const { maxConcurrency } = constants; - - printMessage(['Fetching URLs. This might take some time...'], { border: false }); - finalLinks = [...finalLinks, ...linksFromSitemap]; @@ -127,8 +140,12 @@ const crawlSitemap = async ( launchContext: { launcher: constants.launcher, launchOptions: getPlaywrightLaunchOptions(browser), - // Bug in Chrome which causes brwoser pool crash when userDataDirectory is set in non-headless mode - userDataDir: userDataDirectory ? (process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '') : '', + // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode + userDataDir: userDataDirectory + ? process.env.CRAWLEE_HEADLESS !== '0' + ? userDataDirectory + : '' + : '', }, retryOnBlocked: true, browserPoolOptions: { @@ -147,36 +164,34 @@ const crawlSitemap = async ( requestList, preNavigationHooks: isBasicAuth ? 
[ - async ({ page, request }) => { - await page.setExtraHTTPHeaders({ - Authorization: authHeader, - ...extraHTTPHeaders, - }); - }, - ] + async ({ page, request }) => { + await page.setExtraHTTPHeaders({ + Authorization: authHeader, + ...extraHTTPHeaders, + }); + }, + ] : [ - async ({ page, request }) => { - preNavigationHooks(extraHTTPHeaders) - //insert other code here - }, - ], + async ({ page, request }) => { + preNavigationHooks(extraHTTPHeaders); + //insert other code here + }, + ], requestHandlerTimeoutSecs: 90, requestHandler: async ({ page, request, response, sendRequest }) => { - await waitForPageLoaded(page, 10000); // Set basic auth header if needed if (isBasicAuth) { await page.setExtraHTTPHeaders({ - 'Authorization': authHeader + Authorization: authHeader, }); const currentUrl = new URL(request.url); currentUrl.username = username; currentUrl.password = password; request.url = currentUrl.href; } - - + const actualUrl = request.loadedUrl || request.url; if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) { @@ -241,13 +256,13 @@ const crawlSitemap = async ( numScanned: urlsCrawled.scanned.length, urlScanned: request.url, }); - + const isRedirected = !areLinksEqual(request.loadedUrl, request.url); if (isRedirected) { const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some( item => (item.actualUrl || item.url) === request.loadedUrl, ); - + if (isLoadedUrlInCrawledUrls) { urlsCrawled.notScannedRedirects.push({ fromUrl: request.url, @@ -255,22 +270,25 @@ const crawlSitemap = async ( }); return; } - + urlsCrawled.scanned.push({ url: urlWithoutAuth(request.url), pageTitle: results.pageTitle, actualUrl: request.loadedUrl, // i.e. actualUrl }); - + urlsCrawled.scannedRedirects.push({ fromUrl: urlWithoutAuth(request.url), toUrl: request.loadedUrl, // i.e. actualUrl }); - + results.url = request.url; results.actualUrl = request.loadedUrl; } else { - urlsCrawled.scanned.push({ url: urlWithoutAuth(request.url), pageTitle: results.pageTitle }); + urlsCrawled.scanned.push({ + url: urlWithoutAuth(request.url), + pageTitle: results.pageTitle, + }); } await dataset.pushData(results); } else { @@ -278,22 +296,23 @@ const crawlSitemap = async ( numScanned: urlsCrawled.scanned.length, urlScanned: request.url, }); - + isScanHtml && urlsCrawled.invalid.push(actualUrl); } } }, failedRequestHandler: async ({ request }) => { - - if (isBasicAuth){ - request.url ? request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}` : null; + if (isBasicAuth) { + request.url + ? 
(request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`) + : null; } // check if scanned pages have reached limit due to multi-instances of handler running if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) { return; } - + guiInfoLog(guiInfoStatusTypes.ERROR, { numScanned: urlsCrawled.scanned.length, urlScanned: request.url, @@ -309,9 +328,6 @@ const crawlSitemap = async ( await requestList.isFinished(); - - - if (pdfDownloads.length > 0) { // wait for pdf downloads to complete await Promise.all(pdfDownloads); @@ -333,13 +349,11 @@ const crawlSitemap = async ( await Promise.all(pdfResults.map(result => dataset.pushData(result))); } - - if (!fromCrawlIntelligentSitemap){ + if (!fromCrawlIntelligentSitemap) { guiInfoLog(guiInfoStatusTypes.COMPLETED, {}); } return urlsCrawled; - }; export default crawlSitemap; diff --git a/src/crawlers/pdfScanFunc.ts b/src/crawlers/pdfScanFunc.ts index 3f48aef5..100c0604 100644 --- a/src/crawlers/pdfScanFunc.ts +++ b/src/crawlers/pdfScanFunc.ts @@ -8,6 +8,7 @@ import { createRequire } from 'module'; import os from 'os'; import path from 'path'; import { getPageFromContext } from '../screenshotFunc/pdfScreenshotFunc.js'; +import { isFilePath } from '../constants/common.js'; const require = createRequire(import.meta.url); @@ -144,10 +145,18 @@ export const handlePdfDownload = (randomToken, pdfDownloads, request, sendReques pdfDownloads.push( new Promise(async resolve => { - const pdfResponse = await sendRequest({ responseType: 'buffer', isStream: true }); - pdfResponse.setEncoding('binary'); - - const bufs = []; // to check for pdf validity + let bufs = []; + let pdfResponse; + + if (isFilePath(url)) { + // Read the file from the file system + const filePath = new URL(url).pathname; + pdfResponse = fs.createReadStream(filePath, { encoding: 'binary' }); + } else { + // Send HTTP/HTTPS request + pdfResponse = await sendRequest({ responseType: 'buffer', isStream: true }); + pdfResponse.setEncoding('binary'); + } const downloadFile = fs.createWriteStream(`${randomToken}/${pdfFileName}.pdf`, { flags: 'a', }); @@ -216,17 +225,24 @@ export const mapPdfScanResults = async (randomToken, uuidToUrlMapping) => { const intermediateFolder = randomToken; const intermediateResultPath = `${intermediateFolder}/${constants.pdfScanResultFileName}`; - const rawdata = fs.readFileSync(intermediateResultPath); - const output = JSON.parse(rawdata.toString()); + const rawdata = fs.readFileSync(intermediateResultPath, 'utf-8'); + + let parsedJsonData; + try { + parsedJsonData = JSON.parse(rawdata); + } catch (err) { + consoleLogger.log(err); + } const errorMeta = require('../constants/errorMeta.json'); const resultsList = []; + if (parsedJsonData) { // jobs: files that are scanned const { report: { jobs }, - } = output; + } = parsedJsonData; // loop through all jobs for (let jobIdx = 0; jobIdx < jobs.length; jobIdx++) { @@ -277,6 +293,7 @@ export const mapPdfScanResults = async (randomToken, uuidToUrlMapping) => { resultsList.push(translated); } +} return resultsList; }; diff --git a/src/index.ts b/src/index.ts index d47d76b5..22d1f228 100644 --- a/src/index.ts +++ b/src/index.ts @@ -39,7 +39,7 @@ export type Answers = { metadata: string; maxpages: number; strategy: string; - isLocalSitemap: boolean; + isLocalFileScan: boolean; finalUrl: string; customFlowLabel: string; specifiedMaxConcurrency: number; @@ -63,7 +63,7 @@ export type Data = { playwrightDeviceDetailsObject: Object; maxRequestsPerCrawl: number; strategy: string; - isLocalSitemap: boolean; + 
isLocalFileScan: boolean; browser: string; nameEmail: string; customFlowLabel: string; diff --git a/src/mergeAxeResults.ts b/src/mergeAxeResults.ts index bbeb10b0..d9a8ec36 100644 --- a/src/mergeAxeResults.ts +++ b/src/mergeAxeResults.ts @@ -106,7 +106,7 @@ const parseContentToJson = async rPath => const writeCsv = async (allIssues, storagePath) => { - const csvOutput = createWriteStream(`${storagePath}/reports/report.csv`, { encoding: 'utf8' }); + const csvOutput = createWriteStream(`${storagePath}/report.csv`, { encoding: 'utf8' }); const formatPageViolation = pageNum => { if (pageNum < 0) return 'Document'; return `Page ${pageNum}`; @@ -201,7 +201,7 @@ const writeHTML = async (allIssues, storagePath, htmlFilename = 'report') => { filename: path.join(__dirname, './static/ejs/report.ejs'), }); const html = template(allIssues); - fs.writeFileSync(`${storagePath}/reports/${htmlFilename}.html`, html); + fs.writeFileSync(`${storagePath}/${htmlFilename}.html`, html); }; const writeSummaryHTML = async (allIssues, storagePath, htmlFilename = 'summary') => { @@ -210,7 +210,7 @@ const writeSummaryHTML = async (allIssues, storagePath, htmlFilename = 'summary' filename: path.join(__dirname, './static/ejs/summary.ejs'), }); const html = template(allIssues); - fs.writeFileSync(`${storagePath}/reports/${htmlFilename}.html`, html); + fs.writeFileSync(`${storagePath}/${htmlFilename}.html`, html); }; // Proper base64 encoding function using Buffer @@ -230,7 +230,7 @@ const writeBase64 = async (allIssues, storagePath, htmlFilename = 'report.html') const encodedScanItems = base64Encode(items); const encodedScanData = base64Encode(rest); - const filePath = path.join(storagePath, 'reports', 'scanDetails.csv'); + const filePath = path.join(storagePath, 'scanDetails.csv'); const directoryPath = path.dirname(filePath); if (!fs.existsSync(directoryPath)) { @@ -239,7 +239,7 @@ const writeBase64 = async (allIssues, storagePath, htmlFilename = 'report.html') await fs.promises.writeFile(filePath, `scanData_base64,scanItems_base64\n${encodedScanData},${encodedScanItems}`); - const htmlFilePath = path.join(storagePath, 'reports', htmlFilename); + const htmlFilePath = path.join(storagePath, htmlFilename); let htmlContent = fs.readFileSync(htmlFilePath, 'utf8'); const allIssuesJson = JSON.stringify(allIssues); @@ -282,8 +282,8 @@ if (os.platform() === 'linux') { } const writeSummaryPdf = async (storagePath, filename = 'summary') => { - const htmlFilePath = `${storagePath}/reports/${filename}.html`; - const fileDestinationPath = `${storagePath}/reports/${filename}.pdf`; + const htmlFilePath = `${storagePath}/${filename}.html`; + const fileDestinationPath = `${storagePath}/${filename}.pdf`; const browser = await chromium.launch({ headless: true, channel: browserChannel, @@ -468,7 +468,7 @@ const createRuleIdJson = allIssues => { const moveElemScreenshots = (randomToken, storagePath) => { const currentScreenshotsPath = `${randomToken}/elemScreenshots`; - const resultsScreenshotsPath = `${storagePath}/reports/elemScreenshots`; + const resultsScreenshotsPath = `${storagePath}/elemScreenshots`; if (fs.existsSync(currentScreenshotsPath)) { fs.moveSync(currentScreenshotsPath, resultsScreenshotsPath); } @@ -490,7 +490,7 @@ export const generateArtifacts = async ( const storagePath = getStoragePath(randomToken); - urlScanned = urlWithoutAuth(urlScanned); + urlScanned = (scanType === ScannerTypes.SITEMAP || scanType === ScannerTypes.LOCALFILE) ? 
urlScanned : urlWithoutAuth(urlScanned); const formatAboutStartTime = dateString => { const utcStartTimeDate = new Date(dateString); diff --git a/src/screenshotFunc/htmlScreenshotFunc.ts b/src/screenshotFunc/htmlScreenshotFunc.ts index e8a20667..2812722c 100644 --- a/src/screenshotFunc/htmlScreenshotFunc.ts +++ b/src/screenshotFunc/htmlScreenshotFunc.ts @@ -156,7 +156,7 @@ const saveImageBufferToFile = (buffer, fileName) => { // export const takeScreenshotForHTMLElements = async (screenshotData, storagePath, browserToRun) => { -// const screenshotDir = `${storagePath}/reports/screenshots`; +// const screenshotDir = `${storagePath}/screenshots`; // let screenshotItems = []; // let randomToken = `cloned-${Date.now()}`; // const clonedDir = getClonedProfilesWithRandomToken(browserToRun, randomToken); diff --git a/src/screenshotFunc/pdfScreenshotFunc.ts b/src/screenshotFunc/pdfScreenshotFunc.ts index 3eaaf4d1..d6a32795 100644 --- a/src/screenshotFunc/pdfScreenshotFunc.ts +++ b/src/screenshotFunc/pdfScreenshotFunc.ts @@ -337,6 +337,7 @@ export const getSelectedPageByLocation = bboxLocation => { }; export const getPageFromContext = async (context, pdfFilePath) => { + try{ const loadingTask = pdfjs.getDocument({ url: pdfFilePath, // canvasFactory, @@ -348,6 +349,9 @@ export const getPageFromContext = async (context, pdfFilePath) => { const structureTree = await pdf._pdfInfo.structureTree; const page = getBboxPage({ location: context }, structureTree); return page; +} catch (error){ + // Error handling +} }; export const getBboxPages = (bboxes, structure) => { diff --git a/src/utils.ts b/src/utils.ts index 0a59d29f..1cc079e5 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -122,7 +122,7 @@ export const writeToUserDataTxt = async (key, value) => { export const createAndUpdateResultsFolders = async randomToken => { const storagePath = getStoragePath(randomToken); - await fs.ensureDir(`${storagePath}/reports`); + await fs.ensureDir(`${storagePath}`); const intermediatePdfResultsPath = `${randomToken}/${constants.pdfScanResultFileName}`;