From d2aa6b121c5bcf9c199252e7bb663438f7497a65 Mon Sep 17 00:00:00 2001 From: Georgetxm <52128673+Georgetxm@users.noreply.github.com> Date: Fri, 30 Jun 2023 12:19:13 +0800 Subject: [PATCH] Browser based scans and scanning in proxy environment (#132) * Ability to scan for basic-authenticated sites * Ability to scan cookie-authenticated sites with Chrome/Edge profiles. * Browser-based scans and concurrent browser-based scans * "-b" flag in cli to specify the browser to be used * "-s" flag in cli to specify the crawling strategy to be domain or hostname * Updated paths as per the new Purple-HATS directory structure * Network settings on Playwright for proxy environments with server-side rendering * Rename passed_items.json to passed_items.json.txt * Updated README.md --- README.md | 15 +- cli.js | 208 ++- combine.js | 29 +- constants/cliFunctions.js | 22 +- constants/common.js | 1163 ++++++++++++----- constants/constants.js | 361 +++-- constants/questions.js | 2 +- crawlers/commonCrawlerFunc.js | 37 +- crawlers/crawlDomain.js | 69 +- crawlers/crawlSitemap.js | 52 +- index.js | 27 +- mergeAxeResults.js | 2 +- package-lock.json | 4 +- package.json | 4 +- playwrightAxeGenerator.js | 1064 ++++++++------- static/ejs/partials/scripts/ruleOffcanvas.ejs | 2 +- utils.js | 35 +- 17 files changed, 2026 insertions(+), 1070 deletions(-) diff --git a/README.md b/README.md index 0f02216d..9c5bfa4d 100644 --- a/README.md +++ b/README.md @@ -203,11 +203,12 @@ Options: ailable in website and sitemap scans [number] -h, --headless Whether to run the scan in headless mode. Defaults to y es. [string] [choices: "yes", "no"] [default: "yes"] - --reportbreakdown Will break down the main report according to impact - [boolean] [default: false] - --warn Track for issues of target impact level - [choices: "critical", "serious", "moderate", "minor", "none"] [default: "none" - ] + -b, --browserToRun Browser to run the scan on: 1) Chromium, 2) Chrome, 3) Ed + ge. Defaults to Chromium. 
+ [choices: "chrome", "edge", "chromium"] [default: "chromium"] + -s, --strategy Strategy to choose which links to crawl in a website scan + . Defaults to "same-domain". + [choices: "same-domain", "same-hostname"] Examples: To scan sitemap of website:', 'node cli.js -c [ 1 | Sitemap ] -d -u @@ -218,10 +219,12 @@ Examples: -w ``` -### Mobile Device Options +### Device Options
Click here for list of device options supported +- "Desktop" (defaults to a 1280x720 viewport) +- "Mobile" (defaults to iPhone 11 viewport) - "Desktop Chrome HiDPI" - "Desktop Edge HiDPI" - "Desktop Firefox HiDPI" diff --git a/cli.js b/cli.js index f91bd426..d8f0f265 100644 --- a/cli.js +++ b/cli.js @@ -1,24 +1,29 @@ #!/usr/bin/env node +/* eslint-disable no-fallthrough */ /* eslint-disable no-undef */ /* eslint-disable no-param-reassign */ import fs from 'fs-extra'; import _yargs from 'yargs'; import { hideBin } from 'yargs/helpers'; import printMessage from 'print-message'; +import { devices } from 'playwright'; +import { cleanUp, zipResults, setHeadlessMode, getVersion, getStoragePath } from './utils.js'; import { - cleanUp, - zipResults, - setHeadlessMode, - setThresholdLimits, - getVersion, - getStoragePath, -} from './utils.js'; -import { checkUrl, prepareData, isFileSitemap } from './constants/common.js'; + checkUrl, + prepareData, + isFileSitemap, + cloneChromeProfiles, + cloneEdgeProfiles, + deleteClonedChromeProfiles, + deleteClonedEdgeProfiles, +} from './constants/common.js'; import { cliOptions, messageOptions } from './constants/cliFunctions.js'; -import constants from './constants/constants.js'; +import constants, { + getDefaultChromeDataDir, + getDefaultEdgeDataDir, +} from './constants/constants.js'; import combineRun from './combine.js'; import playwrightAxeGenerator from './playwrightAxeGenerator.js'; -import { devices } from 'playwright'; import { silentLogger } from './logs.js'; const appVersion = getVersion(); @@ -34,13 +39,13 @@ Usage: node cli.js -c -d -w -u OPTIONS`, .options(cliOptions) .example([ [ - `To scan sitemap of website:', 'node cli.js -c [ 1 | ${constants.scannerTypes.sitemap} ] -d -u -w `, + `To scan sitemap of website:', 'node cli.js -c [ 1 | sitemap ] -u [ -d | -w ]`, ], [ - `To scan a website', 'node cli.js -c [ 2 | ${constants.scannerTypes.website} ] -d -u -w `, + `To scan a website', 'node cli.js -c [ 2 | website ] -u [ -d 
| -w ]`, ], [ - `To start a custom flow scan', 'node cli.js -c [ 3 | ${constants.scannerTypes.custom} ] -d -u -w `, + `To start a custom flow scan', 'node cli.js -c [ 3 | custom ] -u [ -d | -w ]`, ], ]) .coerce('c', option => { @@ -65,7 +70,7 @@ Usage: node cli.js -c -d -w -u OPTIONS`, }) .coerce('d', option => { const device = devices[option]; - if (option != 'Desktop' && !device) { + if (!device && option !== 'Desktop' && option !== 'Mobile') { printMessage( [`Invalid device. Please provide an existing device to start the scan.`], messageOptions, @@ -97,9 +102,34 @@ Usage: node cli.js -c -d -w -u OPTIONS`, } return option; }) + .coerce('b', option => { + const { choices } = cliOptions.b; + if (typeof option === 'number') { + if (Number.isInteger(option) && option > 0 && option <= choices.length) { + option = choices[option - 1]; + } else { + printMessage( + [ + 'Invalid option', + `Please enter an integer (1 to ${choices.length}) or keywords (${choices.join(', ')}).`, + ], + messageOptions, + ); + process.exit(1); + } + } + + return option; + }) .check(argvs => { if (argvs.scanner === 'custom' && argvs.maxpages) { - throw new Error('-p or --maxpages is only available in website and sitemap scans'); + throw new Error('-p or --maxpages is only available in website and sitemap scans.'); + } + return true; + }) + .check(argvs => { + if (argvs.scanner !== 'website' && argvs.strategy) { + throw new Error('-s or --strategy is only available in website scans.'); } return true; }) @@ -109,13 +139,114 @@ Usage: node cli.js -c -d -w -u OPTIONS`, const scanInit = async argvs => { argvs.scanner = constants.scannerTypes[argvs.scanner]; argvs.headless = argvs.headless === 'yes'; + argvs.browserToRun = constants.browserTypes[argvs.browserToRun]; + + let useChrome = false; + let useEdge = false; + let chromeDataDir = null; + let edgeDataDir = null; + // Empty string for profile directory will use incognito mode in playwright + let clonedDataDir = ''; - const res = await 
checkUrl(argvs.scanner, argvs.url); + if (argvs.browserToRun === constants.browserTypes.chrome) { + chromeDataDir = getDefaultChromeDataDir(); + clonedDataDir = cloneChromeProfiles(); + if (chromeDataDir && clonedDataDir) { + argvs.browserToRun = constants.browserTypes.chrome; + useChrome = true; + } else { + printMessage(['Unable to use Chrome, falling back to Edge browser...'], messageOptions); + edgeDataDir = getDefaultEdgeDataDir(); + clonedDataDir = cloneEdgeProfiles(); + if (edgeDataDir && clonedDataDir) { + useEdge = true; + argvs.browserToRun = constants.browserTypes.edge; + } else { + printMessage( + ['Unable to use both Chrome and Edge, falling back to Chromium...'], + messageOptions, + ); + argvs.browserToRun = constants.browserTypes.chromium; + clonedDataDir = ''; + } + } + } else if (argvs.browserToRun === constants.browserTypes.edge) { + edgeDataDir = getDefaultEdgeDataDir(); + clonedDataDir = cloneEdgeProfiles(); + if (edgeDataDir && clonedDataDir) { + useEdge = true; + argvs.browserToRun = constants.browserTypes.edge; + } else { + printMessage(['Unable to use Edge, falling back to Chrome browser...'], messageOptions); + chromeDataDir = getDefaultChromeDataDir(); + clonedDataDir = cloneChromeProfiles(); + if (chromeDataDir && clonedDataDir) { + useChrome = true; + argvs.browserToRun = constants.browserTypes.chrome; + } else { + printMessage( + ['Unable to use both Chrome and Edge, falling back to Chromium...'], + messageOptions, + ); + argvs.browserToRun = constants.browserTypes.chromium; + clonedDataDir = ''; + } + } + } else { + argvs.browserToRun = constants.browserTypes.chromium; + clonedDataDir = ''; + } + + if (argvs.customDevice === 'Desktop' || argvs.customDevice === 'Mobile') { + argvs.deviceChosen = argvs.customDevice; + delete argvs.customDevice; + } + + // Creating the playwrightDeviceDetailObject + // for use in crawlDomain & crawlSitemap's preLaunchHook + if (argvs.deviceChosen === 'Mobile' || argvs.customDevice === 'iPhone 11') { + 
argvs.playwrightDeviceDetailsObject = devices['iPhone 11']; + } else if (argvs.customDevice === 'Samsung Galaxy S9+') { + argvs.playwrightDeviceDetailsObject = devices['Galaxy S9+']; + } else if (argvs.viewportWidth) { + argvs.playwrightDeviceDetailsObject = { + viewport: { width: Number(argvs.viewportWidth), height: 720 }, + }; + } else if (argvs.customDevice) { + argvs.playwrightDeviceDetailsObject = devices[argvs.customDevice.replace('_', / /g)]; + } else { + argvs.playwrightDeviceDetailsObject = {}; + } + + const res = await checkUrl( + argvs.scanner, + argvs.url, + argvs.browserToRun, + clonedDataDir, + argvs.playwrightDeviceDetailsObject, + ); + + if (argvs.scanner === constants.scannerTypes.website && !argvs.strategy) { + argvs.strategy = 'same-domain'; + } const statuses = constants.urlCheckStatuses; + + // File clean up after url check + // files will clone a second time below if url check passes + if (useChrome) { + deleteClonedChromeProfiles(); + } else if (useEdge) { + deleteClonedEdgeProfiles(); + } + + // eslint-disable-next-line default-case switch (res.status) { case statuses.success.code: argvs.finalUrl = res.url; break; + case statuses.unauthorised.code: + printMessage([statuses.unauthorised.message], messageOptions); + process.exit(res.status); case statuses.cannotBeResolved.code: printMessage([statuses.cannotBeResolved.message], messageOptions); process.exit(res.status); @@ -127,7 +258,6 @@ const scanInit = async argvs => { printMessage([statuses.invalidUrl.message], messageOptions); process.exit(res.status); } - /* if sitemap scan is selected, treat this URL as a filepath isFileSitemap will tell whether the filepath exists, and if it does, whether the file is a sitemap */ @@ -140,6 +270,8 @@ const scanInit = async argvs => { case statuses.notASitemap.code: printMessage([statuses.notASitemap.message], messageOptions); process.exit(res.status); + default: + break; } const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, 
'').split(' '); @@ -152,12 +284,14 @@ const scanInit = async argvs => { let screenToScan; - if (!argvs.customDevice && !argvs.viewportWidth) { - screenToScan = 'Desktop'; + if (argvs.deviceChosen) { + screenToScan = argvs.deviceChosen; } else if (argvs.customDevice) { screenToScan = argvs.customDevice; - } else { + } else if (argvs.viewportWidth) { screenToScan = `CustomWidth_${argvs.viewportWidth}px`; + } else { + screenToScan = 'Desktop'; } data.randomToken = `PHScan_${domain}_${date}_${time}_${argvs.scanner.replaceAll( @@ -165,6 +299,27 @@ const scanInit = async argvs => { '_', )}_${screenToScan.replaceAll(' ', '_')}`; + /** + * Cloning a second time with random token for parallel browser sessions + * Also To mitigate agaisnt known bug where cookies are + * overriden after each browser session - i.e. logs user out + * after checkingUrl and unable to utilise same cookie for scan + * */ + if (useChrome) { + clonedDataDir = cloneChromeProfiles(data.randomToken); + data.browser = constants.browserTypes.chrome; + data.userDataDirectory = clonedDataDir; + } else if (useEdge) { + clonedDataDir = cloneEdgeProfiles(data.randomToken); + data.browser = constants.browserTypes.edge; + data.userDataDirectory = clonedDataDir; + } + // Defaults to chromium by not specifying channels in Playwright, if no browser is found + else { + data.browser = constants.browserTypes.chromium; + data.userDataDirectory = ''; + } + printMessage([`Purple HATS version: ${appVersion}`, 'Starting scan...'], messageOptions); if (argvs.scanner === constants.scannerTypes.custom) { @@ -181,8 +336,14 @@ const scanInit = async argvs => { await combineRun(data, screenToScan); } + // Delete cloned directory + if (useChrome) { + deleteClonedChromeProfiles(); + } else if (useEdge) { + deleteClonedEdgeProfiles(); + } // Delete dataset and request queues - cleanUp(data.randomToken); + await cleanUp(data.randomToken); return getStoragePath(data.randomToken); }; @@ -195,8 +356,8 @@ scanInit(options).then(async 
storagePath => { await fs .ensureDir(storagePath) - .then(async () => { - await zipResults(constants.cliZipFileName, storagePath); + .then(() => { + zipResults(constants.cliZipFileName, storagePath); const messageToDisplay = [ `Report of this run is at ${constants.cliZipFileName}`, `Results directory is at ${storagePath}`, @@ -208,6 +369,7 @@ scanInit(options).then(async storagePath => { ); } printMessage(messageToDisplay); + process.exit(0); }) .catch(error => { printMessage([`Error in zipping results: ${error}`]); diff --git a/combine.js b/combine.js index 8e8db41b..8e8f53c3 100644 --- a/combine.js +++ b/combine.js @@ -4,12 +4,8 @@ import crawlSitemap from './crawlers/crawlSitemap.js'; import crawlDomain from './crawlers/crawlDomain.js'; import { generateArtifacts } from './mergeAxeResults.js'; -import { - getHost, - createAndUpdateResultsFolders, - createDetailsAndLogs, -} from './utils.js'; -import constants from './constants/constants.js'; +import { getHost, createAndUpdateResultsFolders, createDetailsAndLogs } from './utils.js'; +import constants, { basicAuthRegex } from './constants/constants.js'; const combineRun = async (details, deviceToScan) => { const envDetails = { ...details }; @@ -22,25 +18,35 @@ const combineRun = async (details, deviceToScan) => { deviceChosen, customDevice, viewportWidth, + playwrightDeviceDetailsObject, maxRequestsPerCrawl, isLocalSitemap, + browser, + userDataDirectory, + strategy, } = envDetails; process.env.CRAWLEE_STORAGE_DIR = randomToken; - const host = - type === constants.scannerTypes.sitemap && isLocalSitemap ? '' : getHost(url); + const host = type === constants.scannerTypes.sitemap && isLocalSitemap ? 
'' : getHost(url); + + // remove basic-auth credentials from URL + let finalUrl = url; + if (basicAuthRegex.test(url)) { + finalUrl = `${url.split('://')[0]}://${url.split('@')[1]}`; + } const scanDetails = { startTime: new Date().getTime(), crawlType: type, - requestUrl: url, + requestUrl: finalUrl, }; const viewportSettings = { deviceChosen, customDevice, viewportWidth, + playwrightDeviceDetailsObject, }; let urlsCrawled; @@ -52,6 +58,8 @@ const combineRun = async (details, deviceToScan) => { host, viewportSettings, maxRequestsPerCrawl, + browser, + userDataDirectory, ); break; @@ -62,6 +70,9 @@ const combineRun = async (details, deviceToScan) => { host, viewportSettings, maxRequestsPerCrawl, + browser, + userDataDirectory, + strategy, ); break; diff --git a/constants/cliFunctions.js b/constants/cliFunctions.js index 622dd1ce..728b0c39 100644 --- a/constants/cliFunctions.js +++ b/constants/cliFunctions.js @@ -1,4 +1,4 @@ -import constants from "./constants.js"; +import constants from './constants.js'; export const messageOptions = { border: false, @@ -58,16 +58,20 @@ export const cliOptions = { default: 'yes', demandOption: false, }, - reportbreakdown: { - describe: 'Will break down the main report according to impact', - type: 'boolean', - default: false, + b: { + alias: 'browserToRun', + describe: 'Browser to run the scan on: 1) Chromium, 2) Chrome, 3) Edge. Defaults to Chromium.', + choices: Object.keys(constants.browserTypes), + requiresArg: true, + default: 'chromium', demandOption: false, }, - warn: { - describe: 'Track for issues of target impact level', - choices: ['critical', 'serious', 'moderate', 'minor', 'none'], - default: 'none', + s: { + alias: 'strategy', + describe: + 'Strategy to choose which links to crawl in a website scan. 
Defaults to "same-domain".', + choices: ['same-domain', 'same-hostname'], + requiresArg: true, demandOption: false, }, }; diff --git a/constants/common.js b/constants/common.js index 331c9c5d..3ed8a18a 100644 --- a/constants/common.js +++ b/constants/common.js @@ -1,334 +1,829 @@ -/* eslint-disable camelcase */ -/* eslint-disable no-use-before-define */ -import validator from 'validator'; -import axios from 'axios'; -import { JSDOM } from 'jsdom'; -import * as cheerio from 'cheerio'; -import crawlee from 'crawlee'; -import { parseString } from 'xml2js'; -import fs from 'fs'; -import constants from './constants.js'; -import { silentLogger } from '../logs.js'; -import * as https from 'https'; -import { devices } from 'playwright'; - -const document = new JSDOM('').window; - -const httpsAgent = new https.Agent({ - // Run in environments with custom certificates - rejectUnauthorized: false, -}); - -export const messageOptions = { - border: false, - marginTop: 2, - marginBottom: 2, -}; - -const urlOptions = { - protocols: ['http', 'https'], - require_protocol: true, - require_tld: false, -}; - -const queryCheck = s => document.createDocumentFragment().querySelector(s); -export const isSelectorValid = selector => { - try { - queryCheck(selector); - } catch (e) { - return false; - } - return true; -}; - -// Refer to NPM validator's special characters under sanitizers for escape() -const blackListCharacters = '\\<>&\'"'; - -export const isValidXML = async content => { - let status; - let parsedContent = ''; - parseString(content, (err, result) => { - if (result) { - status = true; - parsedContent = result; - } - if (err) { - status = false; - } - }); - return { status, parsedContent }; -}; - -export const isSkippedUrl = (page, whitelistedDomains) => { - const isWhitelisted = whitelistedDomains.filter(pattern => { - if (pattern) { - return new RegExp(pattern).test(page.url()); - } - return false; - }); - - const noMatch = Object.keys(isWhitelisted).every(key => { - return 
isWhitelisted[key].length === 0; - }); - - return !noMatch; -}; - -export const isFileSitemap = filePath => { - if (!fs.existsSync(filePath)) { - return false; - } - const file = fs.readFileSync(filePath, 'utf8'); - return isSitemapContent(file); -}; - -export const getUrlMessage = scanner => { - switch (scanner) { - case constants.scannerTypes.website: - case constants.scannerTypes.custom: - return 'Please enter URL of website: '; - case constants.scannerTypes.sitemap: - return 'Please enter URL or file path to sitemap, or drag and drop a sitemap file here: '; - - default: - return 'Invalid option'; - } -}; - -export const isInputValid = inputString => { - if (!validator.isEmpty(inputString)) { - const removeBlackListCharacters = validator.escape(inputString); - - if (validator.isAscii(removeBlackListCharacters)) { - return true; - } - } - - return false; -}; - -export const sanitizeUrlInput = url => { - // Sanitize that there is no blacklist characters - const sanitizeUrl = validator.blacklist(url, blackListCharacters); - const data = {}; - if (validator.isURL(sanitizeUrl, urlOptions)) { - data.isValid = true; - } else { - data.isValid = false; - } - - data.url = sanitizeUrl; - return data; -}; - -const checkUrlConnectivity = async url => { - const res = {}; - - const data = sanitizeUrlInput(url); - - if (data.isValid) { - // Validate the connectivity of URL if the string format is url format - // User-Agent is modified to emulate a browser to handle cases where some sites ban non browser agents, resulting in a 403 error - await axios - .get(data.url, { headers: { 'User-Agent': devices['Desktop Chrome HiDPI'].userAgent }, httpsAgent, timeout: 15000 }) - .then(async response => { - const redirectUrl = response.request.res.responseUrl; - res.status = constants.urlCheckStatuses.success.code; - - if (redirectUrl != null) { - res.url = redirectUrl; - } else { - res.url = url; - } - - res.content = response.data; - }) - .catch(error => { - if (error.response) { - // 
enters here if server responds with a status other than 2xx - // the scan should still proceed even if error codes are received, so that accessibility scans for error pages can be done too - res.status = constants.urlCheckStatuses.success.code; - res.url = url; - res.content = error.response.data; - return res; - } else if (error.request) { - // enters here if URL cannot be accessed - res.status = constants.urlCheckStatuses.cannotBeResolved.code; - } else { - res.status = constants.urlCheckStatuses.systemError.code; - } - silentLogger.error(error); - }); - } else { - // enters here if input is not a URL or not using http/https protocols - res.status = constants.urlCheckStatuses.invalidUrl.code; - } - - return res; -}; - -const isSitemapContent = async content => { - const { status: isValid } = await isValidXML(content); - if (!isValid) { - const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi'); - const regexForUrl = new RegExp('^.*(http|https):/{2}.*$', 'gmi'); - // Check that the page is not a HTML page but still contains website links - if (!String(content).match(regexForHtml) && String(content).match(regexForUrl)) { - silentLogger.info( - 'Sitemap URL provided is a Valid URL but it is not in XML sitemap, RSS, nor Atom formats.', - ); - return true; - } - silentLogger.info('Not a sitemap, is most likely a HTML page; Possibly a malformed sitemap.'); - return false; - } - - return true; -}; - -export const checkUrl = async (scanner, url) => { - const res = await checkUrlConnectivity(url); - - if ( - res.status === constants.urlCheckStatuses.success.code && - scanner === constants.scannerTypes.sitemap - ) { - const isSitemap = await isSitemapContent(res.content); - - if (!isSitemap) { - res.status = constants.urlCheckStatuses.notASitemap.code; - } - } - - return res; -}; - -const isEmptyObject = obj => !Object.keys(obj).length; - -export const prepareData = argv => { - if (isEmptyObject(argv)) { - throw Error('No inputs should be provided'); 
- } - const { - scanner, - headless, - url, - deviceChosen, - customDevice, - viewportWidth, - maxpages, - isLocalSitemap, - finalUrl, - } = argv; - - return { - type: scanner, - url: isLocalSitemap ? url : finalUrl, - isHeadless: headless, - deviceChosen, - customDevice, - viewportWidth, - maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl, - isLocalSitemap, - }; -}; - -export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount) => { - const urls = new Set(); // for HTML documents - - const isLimitReached = () => { - return urls.size >= maxLinksCount; - }; - - const processXmlSitemap = async ($, sitemapType, selector) => { - for (const urlElement of $(selector)) { - if (isLimitReached()) { - return; - } - let url; - if (sitemapType === constants.xmlSitemapTypes.atom) { - url = $(urlElement).prop('href'); - } else { - url = $(urlElement).text(); - } - urls.add(url); - } - }; - - const processNonStandardSitemap = data => { - const urlsFromData = crawlee.extractUrls({ string: data }).slice(0, maxLinksCount); - urlsFromData.forEach(url => urls.add(url)); - }; - - const fetchUrls = async url => { - let data; - if (validator.isURL(url, urlOptions)) { - const instance = axios.create({ - httpsAgent: new https.Agent({ - rejectUnauthorized: false, - }), - }); - - data = await (await instance.get(url)).data; - } else { - data = fs.readFileSync(url, 'utf8'); - } - const $ = cheerio.load(data, { xml: true }); - - // This case is when the document is not an XML format document - if ($(':root').length === 0) { - processNonStandardSitemap(data); - return; - } - - // Root element - const root = $(':root')[0]; - - const { xmlns } = root.attribs; - const xmlFormatNamespace = 'http://www.sitemaps.org/schemas/sitemap/0.9'; - - let sitemapType; - - if (root.name === 'urlset' && xmlns === xmlFormatNamespace) { - sitemapType = constants.xmlSitemapTypes.xml; - } else if (root.name === 'sitemapindex' && xmlns === xmlFormatNamespace) { - sitemapType = 
constants.xmlSitemapTypes.xmlIndex; - } else if (root.name === 'rss') { - sitemapType = constants.xmlSitemapTypes.rss; - } else if (root.name === 'feed') { - sitemapType = constants.xmlSitemapTypes.atom; - } else { - sitemapType = constants.xmlSitemapTypes.unknown; - } - - switch (sitemapType) { - case constants.xmlSitemapTypes.xmlIndex: - silentLogger.info(`This is a XML format sitemap index.`); - for (const childSitemapUrl of $('loc')) { - if (isLimitReached()) { - break; - } - await fetchUrls($(childSitemapUrl, false).text()); - } - break; - case constants.xmlSitemapTypes.xml: - silentLogger.info(`This is a XML format sitemap.`); - await processXmlSitemap($, sitemapType, 'loc'); - break; - case constants.xmlSitemapTypes.rss: - silentLogger.info(`This is a RSS format sitemap.`); - await processXmlSitemap($, sitemapType, 'link'); - break; - case constants.xmlSitemapTypes.atom: - silentLogger.info(`This is a Atom format sitemap.`); - await processXmlSitemap($, sitemapType, 'link'); - break; - default: - silentLogger.info(`This is an unrecognised XML sitemap format.`); - processNonStandardSitemap(data); - } - }; - - await fetchUrls(sitemapUrl); - return Array.from(urls); -}; +/* eslint-disable consistent-return */ +/* eslint-disable no-console */ +/* eslint-disable camelcase */ +/* eslint-disable no-use-before-define */ +import validator from 'validator'; +import axios from 'axios'; +import { JSDOM } from 'jsdom'; +import * as cheerio from 'cheerio'; +import crawlee, { constructRegExpObjectsFromPseudoUrls } from 'crawlee'; +import { parseString } from 'xml2js'; +import fs from 'fs'; +import path from 'path'; +import * as https from 'https'; +import os from 'os'; +import { globSync } from 'glob'; +import { chromium, devices } from 'playwright'; +import printMessage from 'print-message'; +import constants, { getDefaultChromeDataDir, getDefaultEdgeDataDir, proxy } from './constants.js'; +import { silentLogger } from '../logs.js'; + +const document = new 
JSDOM('').window; + +const httpsAgent = new https.Agent({ + // Run in environments with custom certificates + rejectUnauthorized: false, +}); + +export const messageOptions = { + border: false, + marginTop: 2, + marginBottom: 2, +}; + +const urlOptions = { + protocols: ['http', 'https'], + require_protocol: true, + require_tld: false, +}; + +const queryCheck = s => document.createDocumentFragment().querySelector(s); +export const isSelectorValid = selector => { + try { + queryCheck(selector); + } catch (e) { + return false; + } + return true; +}; + +// Refer to NPM validator's special characters under sanitizers for escape() +const blackListCharacters = '\\<>&\'"'; + +export const isValidXML = async content => { + // fs.writeFileSync('sitemapcontent.txt', content); + let status; + let parsedContent = ''; + parseString(content, (err, result) => { + if (result) { + status = true; + parsedContent = result; + } + if (err) { + status = false; + } + }); + return { status, parsedContent }; +}; + +export const isSkippedUrl = (page, whitelistedDomains) => { + const isWhitelisted = whitelistedDomains.filter(pattern => { + pattern = pattern.replace(/[\n\r]+/g, ''); + + if (pattern) { + return new RegExp(pattern).test(page.url()); + } + return false; + }); + + const noMatch = Object.keys(isWhitelisted).every(key => isWhitelisted[key].length === 0); + + return !noMatch; +}; + +export const isFileSitemap = filePath => { + if (!fs.existsSync(filePath)) { + return false; + } + const file = fs.readFileSync(filePath, 'utf8'); + return isSitemapContent(file); +}; + +export const getUrlMessage = scanner => { + switch (scanner) { + case constants.scannerTypes.website: + case constants.scannerTypes.custom: + return 'Please enter URL of website: '; + case constants.scannerTypes.sitemap: + return 'Please enter URL or file path to sitemap, or drag and drop a sitemap file here: '; + + default: + return 'Invalid option'; + } +}; + +export const isInputValid = inputString => { + if 
(!validator.isEmpty(inputString)) { + const removeBlackListCharacters = validator.escape(inputString); + + if (validator.isAscii(removeBlackListCharacters)) { + return true; + } + } + + return false; +}; + +export const sanitizeUrlInput = url => { + // Sanitize that there is no blacklist characters + const sanitizeUrl = validator.blacklist(url, blackListCharacters); + const data = {}; + if (validator.isURL(sanitizeUrl, urlOptions)) { + data.isValid = true; + } else { + data.isValid = false; + } + + data.url = sanitizeUrl; + return data; +}; + +const checkUrlConnectivity = async url => { + const res = {}; + + const data = sanitizeUrlInput(url); + + if (data.isValid) { + // Validate the connectivity of URL if the string format is url format + // User-Agent is modified to emulate a browser to handle cases where some sites ban non browser agents, resulting in a 403 error + await axios + .get(data.url, { + headers: { 'User-Agent': devices['Desktop Chrome HiDPI'].userAgent }, + httpsAgent, + timeout: 15000, + }) + .then(async response => { + const redirectUrl = response.request.res.responseUrl; + res.status = constants.urlCheckStatuses.success.code; + + if (redirectUrl != null) { + res.url = redirectUrl; + } else { + res.url = url; + } + + res.content = response.data; + }) + .catch(error => { + if (error.response) { + if (error.response.status === 401) { + // enters here if URL is protected by basic auth + res.status = constants.urlCheckStatuses.unauthorised.code; + } else { + // enters here if server responds with a status other than 2xx + // the scan should still proceed even if error codes are received, so that accessibility scans for error pages can be done too + res.status = constants.urlCheckStatuses.success.code; + } + res.url = url; + res.content = error.response.data; + return res; + } + if (error.request) { + // enters here if URL cannot be accessed + res.status = constants.urlCheckStatuses.cannotBeResolved.code; + } else { + res.status = 
constants.urlCheckStatuses.systemError.code; + } + silentLogger.error(error); + }); + } else { + // enters here if input is not a URL or not using http/https protocols + res.status = constants.urlCheckStatuses.invalidUrl.code; + } + + return res; +}; + +const checkUrlConnectivityWithBrowser = async ( + url, + browserToRun, + clonedDataDir, + playwrightDeviceDetailsObject, +) => { + const res = {}; + + let viewport = null; + let userAgent = null; + + if (Object.keys(playwrightDeviceDetailsObject).length > 0) { + if ('viewport' in playwrightDeviceDetailsObject) { + viewport = playwrightDeviceDetailsObject.viewport; + } + + if ('userAgent' in playwrightDeviceDetailsObject) { + userAgent = playwrightDeviceDetailsObject.userAgent; + } + } + + // Validate the connectivity of URL if the string format is url format + const data = sanitizeUrlInput(url); + + if (data.isValid) { + const browserContext = await chromium.launchPersistentContext(clonedDataDir, { + ...getPlaywrightLaunchOptions(browserToRun), + ...(viewport && { viewport }), + ...(userAgent && { userAgent }), + }); + // const context = await browser.newContext(); + const page = await browserContext.newPage(); + + // method will not throw an error when any valid HTTP status code is returned by the remote server, including 404 "Not Found" and 500 "Internal Server Error". + // navigation to about:blank or navigation to the same URL with a different hash, which would succeed and return null. 
+ try { + const response = await page.goto(url, { + timeout: 30000, + ...(proxy && { waitUntil: 'commit' }), + }); + + try { + await page.waitForLoadState('networkidle', { timeout: 10000 }); + } catch (e) { + silentLogger.info('Unable to detect networkidle'); + } + + if (response.status() === 401) { + res.status = constants.urlCheckStatuses.unauthorised.code; + } else { + res.status = constants.urlCheckStatuses.success.code; + } + + // Check for redirect link + const redirectUrl = await response.request().url(); + + if (redirectUrl != null) { + res.url = redirectUrl; + } else { + res.url = url; + } + + res.content = await page.content(); + } catch (error) { + // not sure what errors are thrown + console.log(error); + silentLogger.error(error); + res.status = constants.urlCheckStatuses.systemError.code; + } finally { + await browserContext.close(); + } + } else { + // enters here if input is not a URL or not using http/https protocols + res.status = constants.urlCheckStatuses.invalidUrl.code; + } + + return res; +}; + +export const isSitemapContent = async content => { + const { status: isValid } = await isValidXML(content); + if (isValid) { + return true; + } + + const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi'); + const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi'); + const regexForUrl = new RegExp('^.*(http|https):/{2}.*$', 'gmi'); + + if (String(content).match(regexForHtml) && String(content).match(regexForXmlSitemap)) { + // is an XML sitemap wrapped in a HTML document + return true; + } + if (!String(content).match(regexForHtml) && String(content).match(regexForUrl)) { + // treat this as a txt sitemap where all URLs will be extracted for crawling + return true; + } + // is HTML webpage + return false; +}; + +export const checkUrl = async ( + scanner, + url, + browser, + clonedDataDir, + playwrightDeviceDetailsObject, +) => { + let res; + + if (browser) { + res = await checkUrlConnectivityWithBrowser( + url, + 
browser, + clonedDataDir, + playwrightDeviceDetailsObject, + ); + } else { + res = await checkUrlConnectivity(url); + } + + if ( + res.status === constants.urlCheckStatuses.success.code && + scanner === constants.scannerTypes.sitemap + ) { + const isSitemap = await isSitemapContent(res.content); + + if (!isSitemap) { + res.status = constants.urlCheckStatuses.notASitemap.code; + } + } + + return res; +}; + +const isEmptyObject = obj => !Object.keys(obj).length; + +export const prepareData = argv => { + if (isEmptyObject(argv)) { + throw Error('No inputs should be provided'); + } + const { + scanner, + headless, + url, + deviceChosen, + customDevice, + viewportWidth, + playwrightDeviceDetailsObject, + maxpages, + strategy, + isLocalSitemap, + finalUrl, + browserBased, + } = argv; + + return { + type: scanner, + url: isLocalSitemap ? url : finalUrl, + isHeadless: headless, + isBrowserBased: browserBased, + deviceChosen, + customDevice, + viewportWidth, + playwrightDeviceDetailsObject, + maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl, + strategy, + isLocalSitemap, + }; +}; + +export const getLinksFromSitemap = async ( + sitemapUrl, + maxLinksCount, + browser, + userDataDirectory, +) => { + const urls = new Set(); // for HTML documents + + const isLimitReached = () => urls.size >= maxLinksCount; + + const processXmlSitemap = async ($, sitemapType, selector) => { + for (const urlElement of $(selector)) { + if (isLimitReached()) { + return; + } + let url; + if (sitemapType === constants.xmlSitemapTypes.atom) { + url = $(urlElement).prop('href'); + } else { + url = $(urlElement).text(); + } + urls.add(url); + } + }; + + const processNonStandardSitemap = data => { + const urlsFromData = crawlee.extractUrls({ string: data }).slice(0, maxLinksCount); + urlsFromData.forEach(url => urls.add(url)); + }; + + const fetchUrls = async url => { + let data; + let sitemapType; + if (validator.isURL(url, urlOptions)) { + if (browser) { + const browserContext = await 
chromium.launchPersistentContext(
+          userDataDirectory,
+          getPlaywrightLaunchOptions(browser),
+        );
+        // try/finally so the browser is closed even when navigation or evaluation throws
+        try {
+          const page = await browserContext.newPage();
+          await page.goto(url, { waitUntil: 'networkidle', timeout: 30000 });
+
+          const urlSet = page.locator('urlset');
+          const sitemapIndex = page.locator('sitemapindex');
+          const rss = page.locator('rss');
+          const feed = page.locator('feed');
+
+          const isRoot = async locator => (await locator.count()) > 0;
+
+          // Serialise whichever known sitemap root element the browser rendered
+          if (await isRoot(urlSet)) {
+            data = await urlSet.evaluate(elem => elem.outerHTML);
+          } else if (await isRoot(sitemapIndex)) {
+            data = await sitemapIndex.evaluate(elem => elem.outerHTML);
+          } else if (await isRoot(rss)) {
+            data = await rss.evaluate(elem => elem.outerHTML);
+          } else if (await isRoot(feed)) {
+            data = await feed.evaluate(elem => elem.outerHTML);
+          } else {
+            // No known root element (e.g. a plain-text sitemap): keep the raw page markup so
+            // `data` is always a string and the non-standard handling can extract the URLs
+            data = await page.content();
+          }
+        } finally {
+          await browserContext.close();
+        }
+      } else {
+        // NOTE(review): TLS certificate verification is disabled for this request
+        const instance = axios.create({
+          httpsAgent: new https.Agent({
+            rejectUnauthorized: false,
+          }),
+        });
+        data = (await instance.get(url)).data;
+      }
+    } else {
+      // Not a URL: the input is read as a path to a local sitemap file
+      data = fs.readFileSync(url, 'utf8');
+    }
+    const $ = cheerio.load(data, { xml: true });
+
+    // This case is when the document is not an XML format document
+    if ($(':root').length === 0) {
+      processNonStandardSitemap(data);
+      return;
+    }
+
+    // Root element
+    const root = $(':root')[0];
+
+    const { xmlns } = root.attribs;
+    const xmlFormatNamespace = 'http://www.sitemaps.org/schemas/sitemap/0.9';
+
+    // urlset / sitemapindex only count when they declare the sitemaps.org namespace
+    if (root.name === 'urlset' && xmlns === xmlFormatNamespace) {
+      sitemapType = constants.xmlSitemapTypes.xml;
+    } else if (root.name === 'sitemapindex' && xmlns === xmlFormatNamespace) {
+      sitemapType = constants.xmlSitemapTypes.xmlIndex;
+    } else if (root.name === 'rss') {
+      sitemapType = constants.xmlSitemapTypes.rss;
+    } else if (root.name === 'feed') {
+      sitemapType = constants.xmlSitemapTypes.atom;
+    } else {
+      sitemapType = constants.xmlSitemapTypes.unknown;
+    }
+
+    switch (sitemapType) {
+      case constants.xmlSitemapTypes.xmlIndex:
+ 
silentLogger.info(`This is a XML format sitemap index.`);
+        for (const childSitemapUrl of $('loc')) {
+          if (isLimitReached()) {
+            break;
+          }
+          await fetchUrls($(childSitemapUrl, false).text());
+        }
+        break;
+      case constants.xmlSitemapTypes.xml:
+        silentLogger.info(`This is a XML format sitemap.`);
+        await processXmlSitemap($, sitemapType, 'loc');
+        break;
+      case constants.xmlSitemapTypes.rss:
+        silentLogger.info(`This is a RSS format sitemap.`);
+        await processXmlSitemap($, sitemapType, 'link');
+        break;
+      case constants.xmlSitemapTypes.atom:
+        silentLogger.info(`This is a Atom format sitemap.`);
+        await processXmlSitemap($, sitemapType, 'link');
+        break;
+      default:
+        silentLogger.info(`This is an unrecognised XML sitemap format.`);
+        processNonStandardSitemap(data);
+    }
+  };
+
+  await fetchUrls(sitemapUrl);
+  return Array.from(urls);
+};
+
+/**
+ * Shared implementation behind the Chrome and Edge cookie cloning helpers.
+ * Cookies file per profile is located in .../User Data/<profile>/Network/Cookies for windows
+ * and .../<browser data dir>/<profile>/Cookies for mac.
+ * @param {*} options glob options object
+ * @param {string} destDir destination directory
+ * @param {string} browserName browser name used in the log messages ("Chrome" or "Edge")
+ * @param {RegExp} macProfileNamesRegex regex capturing the profile name from a mac cookies path
+ * @returns {boolean} true when cookie files were found and every copy succeeded
+ */
+const cloneProfileCookieFiles = (options, destDir, browserName, macProfileNamesRegex) => {
+  // Stays empty on platforms other than win32/darwin, so the "unable to find" path is taken
+  let cookieFiles = [];
+  let profileNamesRegex;
+  // Skip every cloned directory: both "Purple-HATS" and the token-suffixed "Purple-HATS-<token>"
+  const ignore = ['Purple-HATS*/**'];
+
+  if (os.platform() === 'win32') {
+    cookieFiles = globSync('**/Network/Cookies', { ...options, ignore });
+    profileNamesRegex = /User Data\\(.*?)\\Network/;
+  } else if (os.platform() === 'darwin') {
+    cookieFiles = globSync('**/Cookies', { ...options, ignore });
+    profileNamesRegex = macProfileNamesRegex;
+  }
+
+  if (cookieFiles.length === 0) {
+    silentLogger.warn(`Unable to find ${browserName} profile cookies file in the system.`);
+    printMessage(
+      [`Unable to find ${browserName} profile cookies file in the system.`],
+      messageOptions,
+    );
+    return false;
+  }
+
+  let success = true;
+  cookieFiles.forEach(cookieFile => {
+    // A path that does not match the expected layout has no profile folder to copy into
+    const profileName = cookieFile.match(profileNamesRegex)?.[1];
+    if (!profileName) {
+      return;
+    }
+
+    let destProfileDir = path.join(destDir, profileName);
+    if (os.platform() === 'win32') {
+      destProfileDir = path.join(destProfileDir, 'Network');
+    }
+    // Recursive true to create all parent directories (e.g. PbProfile/Default/Cookies)
+    if (!fs.existsSync(destProfileDir)) {
+      fs.mkdirSync(destProfileDir, { recursive: true });
+    }
+
+    // Prevents duplicate cookies file if the cookies already exist
+    const destCookieFile = path.join(destProfileDir, 'Cookies');
+    if (!fs.existsSync(destCookieFile)) {
+      try {
+        fs.copyFileSync(cookieFile, destCookieFile);
+      } catch (err) {
+        silentLogger.error(err);
+        printMessage([err], messageOptions);
+        success = false;
+      }
+    }
+  });
+  return success;
+};
+
+/**
+ * Clone the Chrome profile cookie files to the destination directory
+ * @param {*} options glob options object
+ * @param {*} destDir destination directory
+ * @returns boolean indicating whether the operation was successful
+ */
+const cloneChromeProfileCookieFiles = (options, destDir) =>
+  cloneProfileCookieFiles(options, destDir, 'Chrome', /Chrome\/(.*?)\/Cookies/);
+
+/**
+ * Clone the Edge profile cookie files to the destination directory
+ * @param {*} options glob options object
+ * @param {*} destDir destination directory
+ * @returns boolean indicating whether the operation was successful
+ */
+const cloneEdgeProfileCookieFiles = (options, destDir) =>
+  cloneProfileCookieFiles(options, destDir, 'Edge', /Microsoft Edge\/(.*?)\/Cookies/);
+
+/**
+ * Both Edge and Chrome Local State files are located in the .../User Data directory
+ * @param {*} options - glob options object
+ * @param {string} destDir - destination directory
+ * @returns boolean indicating whether the operation was successful
+ */
+const cloneLocalStateFile = (options, destDir) => {
+  const localState = globSync('**/*Local State', {
+    ...options,
+    maxDepth: 1,
+  });
+
+  if (localState.length > 0) {
+    let success = true;
+    localState.forEach(dir => {
+      try {
+        fs.copyFileSync(dir, path.join(destDir, 'Local State'));
+      } catch (err) {
+        silentLogger.error(err);
+        printMessage([err], messageOptions);
+        success = false;
+      }
+    });
+    return success;
+  }
+  silentLogger.warn('Unable to find local state file in the system.');
+  printMessage(['Unable to find local state file in the system.'], messageOptions);
+  return false;
+};
+
+/**
+ * Checks if the Chrome data directory exists and creates a clone
+ 
* of all profile within the Purple-HATS directory located in the
+ * .../User Data directory for Windows and
+ * .../Chrome directory for Mac.
+ * @param {string} randomToken - random token to append to the cloned directory
+ * @returns {string|null} cloned data directory, null if the Chrome data directory cannot be
+ * found or any of the sub files failed to copy
+ */
+export const cloneChromeProfiles = randomToken => {
+  const baseDir = getDefaultChromeDataDir();
+
+  if (!baseDir) {
+    console.warn('Unable to find Chrome data directory in the system.');
+    return null;
+  }
+
+  const destDir = path.join(baseDir, randomToken ? `Purple-HATS-${randomToken}` : 'Purple-HATS');
+
+  // A leftover clone triggers a cleanup. NOTE: the cleanup removes every Purple-HATS*
+  // directory in the data directory, not only destDir.
+  if (fs.existsSync(destDir)) {
+    deleteClonedChromeProfiles();
+  }
+
+  if (!fs.existsSync(destDir)) {
+    fs.mkdirSync(destDir);
+  }
+
+  const baseOptions = {
+    cwd: baseDir,
+    recursive: true,
+    absolute: true,
+    nodir: true,
+  };
+  // Both copies are attempted before the results are combined
+  const localStateCloned = cloneLocalStateFile(baseOptions, destDir);
+  if (cloneChromeProfileCookieFiles(baseOptions, destDir) && localStateCloned) {
+    return destDir;
+  }
+
+  return null;
+};
+
+/**
+ * Checks if the Edge data directory exists and creates a clone
+ * of all profile within the Purple-HATS directory located in the
+ * .../User Data directory for Windows and
+ * .../Microsoft Edge directory for Mac.
+ * @param {string} randomToken - random token to append to the cloned directory
+ * @returns {string|null} cloned data directory, null if the Edge data directory cannot be
+ * found or any of the sub files failed to copy
+ */
+export const cloneEdgeProfiles = randomToken => {
+  const baseDir = getDefaultEdgeDataDir();
+
+  if (!baseDir) {
+    console.warn('Unable to find Edge data directory in the system.');
+    return null;
+  }
+
+  const destDir = path.join(baseDir, randomToken ? `Purple-HATS-${randomToken}` : 'Purple-HATS');
+
+  // A leftover clone triggers a cleanup. NOTE: the cleanup removes every Purple-HATS*
+  // directory in the data directory, not only destDir.
+  if (fs.existsSync(destDir)) {
+    deleteClonedEdgeProfiles();
+  }
+
+  if (!fs.existsSync(destDir)) {
+    fs.mkdirSync(destDir);
+  }
+
+  const baseOptions = {
+    cwd: baseDir,
+    recursive: true,
+    absolute: true,
+    nodir: true,
+  };
+
+  // Both copies are attempted before the results are combined
+  const localStateCloned = cloneLocalStateFile(baseOptions, destDir);
+  if (cloneEdgeProfileCookieFiles(baseOptions, destDir) && localStateCloned) {
+    return destDir;
+  }
+
+  return null;
+};
+
+/**
+ * Deletes all the cloned Purple-HATS directories in the Chrome data directory
+ * @returns {void}
+ */
+export const deleteClonedChromeProfiles = () => {
+  const baseDir = getDefaultChromeDataDir();
+
+  if (!baseDir) {
+    console.warn(`Unable to find Chrome data directory in the system.`);
+    return;
+  }
+
+  // Find all the Purple-HATS directories in the Chrome data directory
+  const destDir = globSync('**/Purple-HATS*', {
+    cwd: baseDir,
+    recursive: true,
+    absolute: true,
+  });
+
+  if (destDir.length > 0) {
+    destDir.forEach(dir => {
+      // A nested match may already be gone once its parent clone has been removed
+      if (fs.existsSync(dir)) {
+        try {
+          fs.rmSync(dir, { recursive: true });
+        } catch (err) {
+          silentLogger.warn(`Unable to delete ${dir} folder in the Chrome data directory. ${err}`);
+          console.warn(`Unable to delete ${dir} folder in the Chrome data directory. ${err}`);
+        }
+      }
+    });
+    return;
+  }
+
+  silentLogger.warn('Unable to find Purple-HATS directory in the Chrome data directory.');
+  console.warn('Unable to find Purple-HATS directory in the Chrome data directory.');
+};
+
+/**
+ * Deletes all the cloned Purple-HATS directories in the Edge data directory
+ * @returns {void}
+ */
+export const deleteClonedEdgeProfiles = () => {
+  const baseDir = getDefaultEdgeDataDir();
+
+  if (!baseDir) {
+    console.warn(`Unable to find Edge data directory in the system.`);
+    return;
+  }
+
+  // Find all the Purple-HATS directories in the Edge data directory
+  const destDir = globSync('**/Purple-HATS*', {
+    cwd: baseDir,
+    recursive: true,
+    absolute: true,
+  });
+
+  if (destDir.length > 0) {
+    destDir.forEach(dir => {
+      // A nested match may already be gone once its parent clone has been removed
+      if (fs.existsSync(dir)) {
+        try {
+          fs.rmSync(dir, { recursive: true });
+        } catch (err) {
+          silentLogger.warn(`Unable to delete ${dir} folder in the Edge data directory. ${err}`);
+          console.warn(`Unable to delete ${dir} folder in the Edge data directory. ${err}`);
+        }
+      }
+    });
+  }
+};
+
+/**
+ * @param {string} browser browser channel from constants.browserTypes ("chrome", "msedge" or
+ * "chromium"); "chromium" and falsy values select the default Playwright browser
+ * @returns playwright launch options object. For more details: https://playwright.dev/docs/api/class-browsertype#browser-type-launch
+ */
+export const getPlaywrightLaunchOptions = browser => {
+  let channel;
+  if (browser === constants.browserTypes.chromium) {
+    channel = null;
+  } else {
+    channel = browser;
+  }
+  const options = {
+    // Drop the --use-mock-keychain flag to allow MacOS devices
+    // to use the cloned cookies.
+ ignoreDefaultArgs: ['--use-mock-keychain'], + args: constants.launchOptionsArgs, + ...(channel && { channel }), // Having no channel is equivalent to "chromium" + }; + if (proxy) { + options.headless = false; + options.slowMo = 1000; // To ensure server-side rendered proxy page is loaded + } + return options; +}; diff --git a/constants/constants.js b/constants/constants.js index e7c4f00b..4c6aa51b 100644 --- a/constants/constants.js +++ b/constants/constants.js @@ -1,115 +1,246 @@ -import path from 'path'; -import { fileURLToPath } from 'url'; -import fs from 'fs-extra'; -import { globSync } from 'glob'; -import which from 'which'; -import os from 'os'; -import { spawnSync } from 'child_process'; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); - -const maxRequestsPerCrawl = 100; - -export const intermediateScreenshotsPath = './screenshots'; -export const destinationPath = storagePath => `${storagePath}/screenshots`; - -export const removeQuarantineFlag = function (searchPath) { - if (os.platform() === 'darwin') { - let execPaths = globSync(searchPath, { absolute: true, recursive: true, nodir: true }); - if (execPaths.length > 0) { - execPaths.forEach(filePath => spawnSync('xattr', ['-d', 'com.apple.quarantine', filePath])); - } - } -}; - -export const getExecutablePath = function (dir, file) { - let execPaths = globSync(dir + '/' + file, { absolute: true, recursive: true, nodir: true }); - - if (execPaths.length === 0) { - let execInPATH = which.sync(file, { nothrow: true }); - - if (execInPATH) { - return fs.realpathSync(execInPATH); - } - return null; - } else { - removeQuarantineFlag(execPaths[0]); - return execPaths[0]; - } -}; - -// for crawlers -export const axeScript = 'node_modules/axe-core/axe.min.js'; - -const urlsCrawledObj = { - toScan: [], - scanned: [], - invalid: [], - outOfDomain: [], -}; - -const scannerTypes = { - sitemap: 'Sitemap', - website: 'Website', - custom: 'Custom', -}; - -// Check if 
running in docker container -let launchOptionsArgs = []; -if (fs.existsSync('/.dockerenv')) { - launchOptionsArgs = ['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage']; -} - -export const impactOrder = { - minor: 0, - moderate: 1, - serious: 2, - critical: 3, -}; - -const urlCheckStatuses = { - success: { code: 0 }, - invalidUrl: { code: 11, message: 'Invalid URL or URL is not using http or https.' }, - cannotBeResolved: { - code: 12, - message: - 'Provided URL cannot be accessed. Please verify your internet connectivity and the correctness of the domain.', - }, - errorStatusReceived: { // unused for now - code: 13, - message: 'Provided URL cannot be accessed. Server responded with code ', // append it with the response code received, - }, - systemError: { - code: 14, - message: 'Something went wrong when verifying the URL. Please try again later.', - }, - notASitemap: { code: 15, message: 'Provided URL or filepath is not a sitemap.' }, -}; - -const xmlSitemapTypes = { - xml: 0, - xmlIndex: 1, - rss: 2, - atom: 3, - unknown: 4, -}; - -export default { - allIssueFileName: 'all_issues', - cliZipFileName: 'a11y-scan-results.zip', - maxRequestsPerCrawl, - maxConcurrency: 50, - scannerTypes, - urlsCrawledObj, - impactOrder, - launchOptionsArgs: launchOptionsArgs, - xmlSitemapTypes, - urlCheckStatuses, -}; - -export const rootPath = __dirname; -export const wcagWebPage = 'https://www.w3.org/TR/WCAG21/'; -const latestAxeVersion = '4.4'; -export const axeVersion = latestAxeVersion; -export const axeWebPage = `https://dequeuniversity.com/rules/axe/${latestAxeVersion}/`; +import path from 'path'; +import { fileURLToPath } from 'url'; +import fs from 'fs-extra'; +import { globSync } from 'glob'; +import which from 'which'; +import os from 'os'; +import { spawnSync } from 'child_process'; +import { silentLogger } from '../logs.js'; +import { execSync } from 'child_process'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = 
path.dirname(__filename); + +const maxRequestsPerCrawl = 100; + +export const intermediateScreenshotsPath = './screenshots'; +export const destinationPath = storagePath => `${storagePath}/screenshots`; + +/** Get the path to Default Profile in the Chrome Data Directory + * as per https://chromium.googlesource.com/chromium/src/+/master/docs/user_data_dir.md + * @returns {string} path to Default Profile in the Chrome Data Directory + */ +export const getDefaultChromeDataDir = () => { + try { + let defaultChromeDataDir = null; + if (os.platform() === 'win32') { + defaultChromeDataDir = path.join( + os.homedir(), + 'AppData', + 'Local', + 'Google', + 'Chrome', + 'User Data', + ); + } else if (os.platform() === 'darwin') { + defaultChromeDataDir = path.join( + os.homedir(), + 'Library', + 'Application Support', + 'Google', + 'Chrome', + ); + } + if (defaultChromeDataDir && fs.existsSync(defaultChromeDataDir)) { + return defaultChromeDataDir; + } else { + return null; + } + } catch (error) { + console.error(`Error in getDefaultChromeDataDir(): ${error}`); + } +}; + +/** + * Get the path to Default Profile in the Edge Data Directory + * @returns {string} - path to Default Profile in the Edge Data Directory + */ +export const getDefaultEdgeDataDir = () => { + try { + let defaultEdgeDataDir = null; + if (os.platform() === 'win32') { + defaultEdgeDataDir = path.join( + os.homedir(), + 'AppData', + 'Local', + 'Microsoft', + 'Edge', + 'User Data', + ); + } else if (os.platform() === 'darwin') { + defaultEdgeDataDir = path.join( + os.homedir(), + 'Library', + 'Application Support', + 'Microsoft Edge', + ); + } + + if (defaultEdgeDataDir && fs.existsSync(defaultEdgeDataDir)) { + return defaultEdgeDataDir; + } else { + return null; + } + } catch (error) { + console.error(`Error in getDefaultEdgeDataDir(): ${error}`); + } +}; + +export const removeQuarantineFlag = function (searchPath) { + if (os.platform() === 'darwin') { + let execPaths = globSync(searchPath, { absolute: true, 
recursive: true, nodir: true });
+    if (execPaths.length > 0) {
+      execPaths.forEach(filePath => spawnSync('xattr', ['-d', 'com.apple.quarantine', filePath]));
+    }
+  }
+};
+
+export const getExecutablePath = function (dir, file) {
+  let execPaths = globSync(dir + '/' + file, { absolute: true, recursive: true, nodir: true });
+
+  if (execPaths.length === 0) {
+    let execInPATH = which.sync(file, { nothrow: true });
+
+    if (execInPATH) {
+      return fs.realpathSync(execInPATH);
+    }
+    return null;
+  } else {
+    removeQuarantineFlag(execPaths[0]);
+    return execPaths[0];
+  }
+};
+/**
+ * Matches the pattern user:password@domain.com
+ */
+export const basicAuthRegex = /^.*\/\/.*:.*@.*$/i;
+
+// for crawlers
+export const axeScript = path.join(__dirname, '../node_modules/axe-core/axe.min.js');
+
+const urlsCrawledObj = {
+  toScan: [],
+  scanned: [],
+  invalid: [],
+  outOfDomain: [],
+};
+
+const scannerTypes = {
+  sitemap: 'Sitemap',
+  website: 'Website',
+  custom: 'Custom',
+};
+
+let launchOptionsArgs = [];
+
+// Check if running in docker container
+if (fs.existsSync('/.dockerenv')) {
+  launchOptionsArgs = ['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage'];
+}
+
+/**
+ * Detects the proxy configured for the current Windows user by reading the
+ * "Internet Settings" registry key through PowerShell.
+ * @returns {{ type: string, url: string } | null} `autoConfig` (PAC URL) or `manualProxy`
+ * details; null when no proxy is enabled, the registry lookup fails, or the platform is not
+ * Windows
+ */
+export const getProxy = () => {
+  if (os.platform() === 'win32') {
+    let internetSettings;
+    try {
+      internetSettings = execSync(
+        'Get-ItemProperty -Path "Registry::HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings"',
+        { shell: 'powershell.exe' },
+      )
+        .toString()
+        .split('\n');
+    } catch (e) {
+      console.log(e.toString());
+      silentLogger.error(e.toString());
+      // Nothing to look up without the registry dump: report "no proxy" instead of
+      // throwing on `undefined.find` below (getProxy() is evaluated when this module loads)
+      return null;
+    }
+
+    const getSettingValue = settingName =>
+      internetSettings
+        .find(s => s.startsWith(settingName))
+        // split only once, with ':' as the delimiter
+        ?.split(/:(.*)/s)[1]
+        ?.trim();
+
+    // A PAC script takes precedence over a manually configured proxy server
+    if (getSettingValue('AutoConfigURL')) {
+      return { type: 'autoConfig', url: getSettingValue('AutoConfigURL') };
+    } else if (getSettingValue('ProxyEnable') === '1') {
+      return { type: 'manualProxy', url: getSettingValue('ProxyServer') };
+ } else { + return null; + } + } else { + // develop for mac + return null; + } +}; + +export const proxy = getProxy(); + +if (proxy && proxy.type === 'autoConfig') { + launchOptionsArgs.push(`--proxy-pac-url=${proxy.url}`); +} else if (proxy && proxy.type === 'manualProxy') { + launchOptionsArgs.push(`--proxy-server=${proxy.url}`); +} + +export const impactOrder = { + minor: 0, + moderate: 1, + serious: 2, + critical: 3, +}; + +const urlCheckStatuses = { + success: { code: 0 }, + invalidUrl: { code: 11, message: 'Invalid URL or URL is not using http or https.' }, + cannotBeResolved: { + code: 12, + message: + 'Provided URL cannot be accessed. Please verify your internet connectivity and the correctness of the domain.', + }, + errorStatusReceived: { + // unused for now + code: 13, + message: 'Provided URL cannot be accessed. Server responded with code ', // append it with the response code received, + }, + systemError: { + code: 14, + message: 'Something went wrong when verifying the URL. Please try again later.', + }, + notASitemap: { code: 15, message: 'Provided URL or filepath is not a sitemap.' }, + unauthorised: { code: 16, message: 'Provided URL needs basic authorisation.' 
}, +}; + +const browserTypes = { + chrome: 'chrome', + edge: 'msedge', + chromium: 'chromium', +}; + +const xmlSitemapTypes = { + xml: 0, + xmlIndex: 1, + rss: 2, + atom: 3, + unknown: 4, +}; + +export default { + allIssueFileName: 'all_issues', + cliZipFileName: 'a11y-scan-results.zip', + maxRequestsPerCrawl, + maxConcurrency: 50, + scannerTypes, + browserTypes, + urlsCrawledObj, + impactOrder, + launchOptionsArgs: launchOptionsArgs, + xmlSitemapTypes, + urlCheckStatuses, +}; + +export const rootPath = __dirname; +export const wcagWebPage = 'https://www.w3.org/TR/WCAG21/'; +const latestAxeVersion = '4.4'; +export const axeVersion = latestAxeVersion; +export const axeWebPage = `https://dequeuniversity.com/rules/axe/${latestAxeVersion}/`; + +export const saflyIconSelector = `#__safly_icon`; diff --git a/constants/questions.js b/constants/questions.js index ddd88430..8e19f0af 100644 --- a/constants/questions.js +++ b/constants/questions.js @@ -84,7 +84,7 @@ const questions = [ /* if sitemap scan is selected, treat this URL as a filepath isFileSitemap will tell whether the filepath exists, and if it does, whether the file is a sitemap */ - if (isFileSitemap(answers.url)) { + if (isFileSitemap(url)) { answers.isLocalSitemap = true; return true; } else { diff --git a/crawlers/commonCrawlerFunc.js b/crawlers/commonCrawlerFunc.js index 0b082388..bd12777d 100644 --- a/crawlers/commonCrawlerFunc.js +++ b/crawlers/commonCrawlerFunc.js @@ -2,7 +2,7 @@ /* eslint-disable no-param-reassign */ import crawlee from 'crawlee'; import axe from 'axe-core'; -import { axeScript } from '../constants/constants.js'; +import { axeScript, saflyIconSelector } from '../constants/constants.js'; export const filterAxeResults = (results, pageTitle) => { const { violations, incomplete, passes, url } = results; @@ -80,19 +80,24 @@ export const filterAxeResults = (results, pageTitle) => { export const runAxeScript = async (page, selectors = []) => { await crawlee.playwrightUtils.injectFile(page, 
axeScript); - const results = await page.evaluate(selectors => { - axe.configure({ - branding: { - application: 'purple-hats', - }, - }); - return axe.run(selectors, { - resultTypes: ['violations', 'passes', 'incomplete'], - }); - }, selectors); - const pageTitle = await page.evaluate(() => { - return document.title; - }); + const results = await page.evaluate( + async ({ selectors, saflyIconSelector }) => { + // remove so that axe does not scan + document.querySelector(saflyIconSelector)?.remove(); + + axe.configure({ + branding: { + application: 'purple-hats', + }, + }); + return axe.run(selectors, { + resultTypes: ['violations', 'passes', 'incomplete'], + }); + }, + { selectors, saflyIconSelector }, + ); + + const pageTitle = await page.evaluate(() => document.title); return filterAxeResults(results, pageTitle); }; @@ -104,10 +109,10 @@ export const createCrawleeSubFolders = async randomToken => { export const preNavigationHooks = [ async (_crawlingContext, gotoOptions) => { - gotoOptions = { waitUntil: 'domcontentloaded', timeout: 30000 }; + Object.assign(gotoOptions, { waitUntil: 'networkidle', timeout: 30000 }); /* mutate in place: the crawler hands this same object to page.goto(), so reassigning the parameter had no effect */ }, ]; export const failedRequestHandler = async ({ request }) => { crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`); -}; +}; \ No newline at end of file diff --git a/crawlers/crawlDomain.js b/crawlers/crawlDomain.js index eeaa920c..5640a4f2 100644 --- a/crawlers/crawlDomain.js +++ b/crawlers/crawlDomain.js @@ -1,44 +1,59 @@ import crawlee from 'crawlee'; -import { devices } from 'playwright'; - import { createCrawleeSubFolders, preNavigationHooks, runAxeScript, failedRequestHandler, } from './commonCrawlerFunc.js'; -import constants from '../constants/constants.js'; +import constants, { basicAuthRegex } from '../constants/constants.js'; +import { getPlaywrightLaunchOptions } from '../constants/common.js'; -const crawlDomain = async (url, randomToken, host, viewportSettings, maxRequestsPerCrawl) => { +const crawlDomain = async ( + url, 
+ randomToken, + host, + viewportSettings, + maxRequestsPerCrawl, + browser, + userDataDirectory, + strategy, +) => { const urlsCrawled = { ...constants.urlsCrawledObj }; const { maxConcurrency } = constants; - const { deviceChosen, customDevice, viewportWidth } = viewportSettings; + const { playwrightDeviceDetailsObject } = viewportSettings; const { dataset, requestQueue } = await createCrawleeSubFolders(randomToken); - await requestQueue.addRequest({ url }); + let finalUrl; + let pagesCrawled; + // Boolean to omit axe scan for basic auth URL + let isBasicAuth = false; + /** + * Regex to match http://username:password@hostname.com + * utilised in scan strategy to ensure subsequent URLs within the same domain are scanned. + * First time scan with original `url` containing credentials is strictly to authenticate for browser session + * subsequent URLs are without credentials. + * pagesCrawled is set to -1 for basic auth URL to ensure it is not counted towards maxRequestsPerCrawl + */ + + if (basicAuthRegex.test(url)) { + isBasicAuth = true; + // request to basic auth URL to authenticate for browser session + await requestQueue.addRequest({ url, uniqueKey: `auth:${url}` }); - // customDevice check for website scan - let device; - if (deviceChosen === 'Mobile' || customDevice === 'iPhone 11') { - device = devices['iPhone 11']; - } else if (customDevice === 'Samsung Galaxy S9+') { - device = devices['Galaxy S9+']; - } else if (viewportWidth) { - device = { viewport: { width: Number(viewportWidth), height: 720 } }; - } else if (customDevice) { - device = devices[customDevice.replace('_', / /g)]; + // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned + finalUrl = `${url.split('://')[0]}://${url.split('@')[1]}`; + await requestQueue.addRequest({ url: finalUrl }); + pagesCrawled = -1; } else { - device = {}; + await requestQueue.addRequest({ url }); + pagesCrawled = 0; } - let pagesCrawled = 0; - const crawler = new 
crawlee.PlaywrightCrawler({ launchContext: { - launchOptions: { - args: constants.launchOptionsArgs, - }, + launchOptions: getPlaywrightLaunchOptions(browser), + userDataDir: userDataDirectory || '', }, browserPoolOptions: { useFingerprints: false, @@ -48,7 +63,7 @@ const crawlDomain = async (url, randomToken, host, viewportSettings, maxRequests ...launchContext.launchOptions, bypassCSP: true, ignoreHTTPSErrors: true, - ...device, + ...playwrightDeviceDetailsObject, }; }, ], @@ -60,11 +75,13 @@ const crawlDomain = async (url, randomToken, host, viewportSettings, maxRequests return; } pagesCrawled++; - + const currentUrl = request.url; const location = await page.evaluate('location'); - if (location.host.includes(host)) { + if (isBasicAuth) { + isBasicAuth = false; + } else if (location.host.includes(host)) { const results = await runAxeScript(page); await dataset.pushData(results); urlsCrawled.scanned.push(currentUrl); @@ -72,7 +89,7 @@ const crawlDomain = async (url, randomToken, host, viewportSettings, maxRequests await enqueueLinks({ // set selector matches anchor elements with href but not contains # or starting with mailto: selector: 'a:not(a[href*="#"],a[href^="mailto:"])', - strategy: 'same-domain', + strategy, requestQueue, transformRequestFunction(req) { // ignore all links ending with `.pdf` diff --git a/crawlers/crawlSitemap.js b/crawlers/crawlSitemap.js index ebb1c26c..5b2c6c85 100644 --- a/crawlers/crawlSitemap.js +++ b/crawlers/crawlSitemap.js @@ -1,5 +1,4 @@ import crawlee from 'crawlee'; -import { devices } from 'playwright'; import printMessage from 'print-message'; import { createCrawleeSubFolders, @@ -9,7 +8,11 @@ import { } from './commonCrawlerFunc.js'; import constants from '../constants/constants.js'; -import { getLinksFromSitemap, messageOptions } from '../constants/common.js'; +import { + getLinksFromSitemap, + getPlaywrightLaunchOptions, + messageOptions, +} from '../constants/common.js'; import { isWhitelistedContentType } from 
'../utils.js'; const crawlSitemap = async ( @@ -18,56 +21,43 @@ const crawlSitemap = async ( host, viewportSettings, maxRequestsPerCrawl, + browser, + userDataDirectory, ) => { const urlsCrawled = { ...constants.urlsCrawledObj }; - const { deviceChosen, customDevice, viewportWidth } = viewportSettings; + const { playwrightDeviceDetailsObject } = viewportSettings; const { maxConcurrency } = constants; printMessage(['Fetching URLs. This might take some time...'], { border: false }); const requestList = new crawlee.RequestList({ - sources: await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl), + sources: await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory), }); await requestList.initialize(); printMessage(['Fetch URLs completed. Beginning scan'], messageOptions); const { dataset } = await createCrawleeSubFolders(randomToken); - - let device; - if (deviceChosen === 'Mobile' || customDevice === 'iPhone 11') { - device = devices['iPhone 11']; - } else if (customDevice === 'Samsung Galaxy S9+') { - device = devices['Galaxy S9+']; - } else if (viewportWidth) { - device = { viewport: { width: Number(viewportWidth), height: 720 }}; - } else if (customDevice) { - device = devices[customDevice.replace('_', / /g)]; - } else { - device = {}; - } const crawler = new crawlee.PlaywrightCrawler({ launchContext: { - launchOptions: { - args: constants.launchOptionsArgs, - }, + launchOptions: getPlaywrightLaunchOptions(browser), + userDataDir: userDataDirectory || '', }, browserPoolOptions: { useFingerprints: false, - preLaunchHooks: [async (pageId, launchContext) => { - - launchContext.launchOptions = { - ...launchContext.launchOptions, - bypassCSP: true, - ignoreHTTPSErrors: true, - ...device, - }; - - }], + preLaunchHooks: [ + async (pageId, launchContext) => { + launchContext.launchOptions = { + ...launchContext.launchOptions, + bypassCSP: true, + ignoreHTTPSErrors: true, + ...playwrightDeviceDetailsObject, + }; + }, + ], }, requestList, 
preNavigationHooks, requestHandler: async ({ page, request, response }) => { - const currentUrl = request.url; const contentType = response.headers()['content-type']; const status = response.status(); diff --git a/index.js b/index.js index 66f9e5bc..430560dc 100644 --- a/index.js +++ b/index.js @@ -10,6 +10,7 @@ import questions from './constants/questions.js'; import combineRun from './combine.js'; import playwrightAxeGenerator from './playwrightAxeGenerator.js'; import constants from './constants/constants.js'; +import { devices } from 'playwright'; printMessage( [ @@ -26,20 +27,35 @@ printMessage( ); inquirer.prompt(questions).then(async answers => { - const data = prepareData(answers); - - setHeadlessMode(data.isHeadless); - let screenToScan; + let playwrightDeviceDetailsObject = {}; if (answers.deviceChosen !== 'Custom') { screenToScan = answers.deviceChosen; + if (answers.deviceChosen === 'Mobile') { + playwrightDeviceDetailsObject = devices['iPhone 11']; + } } else if (answers.customDevice !== 'Specify viewport') { screenToScan = answers.customDevice; - } else { + // Only iPhone 11 & Samsung Galaxy S9+ are selectable + if (answers.customDevice === 'Samsung Galaxy S9+') { + playwrightDeviceDetailsObject = devices['Galaxy S9+']; + } else { + playwrightDeviceDetailsObject = devices[answers.customDevice]; + } + } else if (answers.viewportWidth) { screenToScan = `CustomWidth_${answers.viewportWidth}px`; + playwrightDeviceDetailsObject = { + viewport: { width: Number(answers.viewportWidth), height: 720 }, + }; } + answers.playwrightDeviceDetailsObject = playwrightDeviceDetailsObject; + + const data = prepareData(answers); + + setHeadlessMode(data.isHeadless); + const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' '); const domain = answers.isLocalSitemap ? 
'custom' : new URL(answers.url).hostname; @@ -58,4 +74,5 @@ inquirer.prompt(questions).then(async answers => { } // Delete dataset and request queues cleanUp(data.randomToken); + process.exit(0); }); diff --git a/mergeAxeResults.js b/mergeAxeResults.js index 195b1a8a..aa58657d 100644 --- a/mergeAxeResults.js +++ b/mergeAxeResults.js @@ -52,7 +52,7 @@ const writeResults = async (allissues, storagePath, jsonFilename = 'compiledResu try { await fs.writeFile(`${storagePath}/reports/${jsonFilename}.json`, finalResultsInJson); await fs.writeFile( - `${storagePath}/reports/passed_items.json`, + `${storagePath}/reports/passed_items.json.txt`, JSON.stringify(passedItemsJson, null, 4), ); } catch (writeResultsError) { diff --git a/package-lock.json b/package-lock.json index 7ad53497..a0205c70 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@govtechsg/purple-hats", - "version": "0.0.16-alpha", + "version": "0.9.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "@govtechsg/purple-hats", - "version": "0.0.16-alpha", + "version": "0.9.0", "license": "MIT", "dependencies": { "axe-core": "^4.6.2", diff --git a/package.json b/package.json index 7da80aa5..8d1fa6e4 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@govtechsg/purple-hats", "main": "npmIndex.js", - "version": "0.0.16-alpha", + "version": "0.9.0", "type": "module", "imports": { "#root/*.js": "./*.js" @@ -53,4 +53,4 @@ "url": "https://github.com/GovTechSG/purple-hats-dev/issues" }, "homepage": "https://github.com/GovTechSG/purple-hats-dev#readme" -} +} \ No newline at end of file diff --git a/playwrightAxeGenerator.js b/playwrightAxeGenerator.js index 98efee8e..579a381f 100644 --- a/playwrightAxeGenerator.js +++ b/playwrightAxeGenerator.js @@ -1,471 +1,593 @@ -import { execSync } from 'child_process'; -import fs from 'fs'; -import os from 'os'; -import path from 'path'; -import readline from 'readline'; -import safe from 'safe-regex'; -import 
{ devices } from 'playwright'; -import { consoleLogger, silentLogger } from './logs.js'; - -const playwrightAxeGenerator = async (domain, data) => { - const blacklistedPatternsFilename = 'exclusions.txt'; - let blacklistedPatterns = null; - - if (fs.existsSync(blacklistedPatternsFilename)) { - blacklistedPatterns = fs.readFileSync(blacklistedPatternsFilename).toString().split('\n'); - - let unsafe = blacklistedPatterns.filter(function (pattern) { - return !safe(pattern); - }); - - if (unsafe.length > 0) { - let unsafeExpressionsError = - "Unsafe expressions detected: '" + - unsafe + - "' Please revise " + - blacklistedPatternsFilename; - consoleLogger.error(unsafeExpressionsError); - silentLogger.error(unsafeExpressionsError); - process.exit(1); - } - } - - let { isHeadless, randomToken, deviceChosen, customDevice, viewportWidth } = data; - const block1 = `import { chromium, devices, webkit } from 'playwright'; - import { createCrawleeSubFolders, runAxeScript } from '#root/crawlers/commonCrawlerFunc.js'; - import { generateArtifacts } from '#root/mergeAxeResults.js'; - import { createAndUpdateResultsFolders, createDetailsAndLogs, createScreenshotsFolder } from '#root/utils.js'; - import constants, { intermediateScreenshotsPath, getExecutablePath, removeQuarantineFlag } from '#root/constants/constants.js'; - import fs from 'fs'; - import path from 'path'; - import { isSkippedUrl } from '#root/constants/common.js'; - import { spawnSync } from 'child_process'; - import safe from 'safe-regex'; - import { consoleLogger, silentLogger } from '#root/logs.js'; - const blacklistedPatternsFilename = 'exclusions.txt'; - -process.env.CRAWLEE_STORAGE_DIR = '${randomToken}'; -const compareExe = getExecutablePath('**/ImageMagick*/bin','compare'); - -if (!compareExe) { - let ImagMagickNotFoundError = "Could not find ImageMagick compare. 
Please ensure ImageMagick is installed at current directory."; - consoleLogger.error(ImagMagickNotFoundError); - silentLogger.error(ImagMagickNotFoundError); - process.exit(1); -} - -removeQuarantineFlag('**/ImageMagick*/lib/*.dylib'); -const ImageMagickPath = path.resolve(compareExe, '../../'); -process.env.MAGICK_HOME = ImageMagickPath; -process.env.DYLD_LIBRARY_PATH = ImageMagickPath + '/lib/'; - -const scanDetails = { - startTime: new Date().getTime(), - crawlType: 'Custom Flow', - requestUrl: '${domain}', -}; - -const urlsCrawled = { ...constants.urlsCrawledObj }; -const { dataset } = await createCrawleeSubFolders( - '${randomToken}', -); - -let blacklistedPatterns = null; - -if (fs.existsSync(blacklistedPatternsFilename)) { - blacklistedPatterns = fs.readFileSync(blacklistedPatternsFilename).toString().split('\\n'); - - let unsafe = blacklistedPatterns.filter(function (pattern) { - return !safe(pattern); - }); - - if (unsafe.length > 0) { - let unsafeExpressionsError = - "Unsafe expressions detected: '" + - unsafe + - "' Please revise " + - blacklistedPatternsFilename; - consoleLogger.error(unsafeExpressionsError); - silentLogger.error(unsafeExpressionsError); - process.exit(1); - } -} - -var index = 1; -var urlImageDictionary = {}; -let pageUrl; - -const checkIfScanRequired = async page => { - const imgPath = './screenshots/PHScan-screenshot' + index.toString() + '.png'; - - index += 1; - - const fullPageSize = await page.evaluate(() => { - return { - width: Math.max( - document.body.scrollWidth, - document.documentElement.scrollWidth, - document.body.offsetWidth, - document.documentElement.offsetWidth, - document.body.clientWidth, - document.documentElement.clientWidth, - ), - height: Math.max( - document.body.scrollHeight, - document.documentElement.scrollHeight, - document.body.offsetHeight, - document.documentElement.offsetHeight, - document.body.clientHeight, - document.documentElement.clientHeight, - ), - }; - }); - - const originalSize = 
page.viewportSize(); - await page.setViewportSize(fullPageSize); - const usesInfiniteScroll = async () => { - const prevHeight = await page.evaluate(() => document.body.scrollHeight); - - await page.evaluate(() => { - window.scrollTo(0, document.body.scrollHeight); - }); - - const isLoadMoreContent = async () => { - return new Promise((resolve) => { - setTimeout(async () => { - await page.waitForLoadState('domcontentloaded'); - - const result = await page.evaluate((prevHeight) => { - const currentHeight = document.body.scrollHeight; - return (currentHeight > prevHeight); - }, prevHeight); - - resolve(result); - }, 5000); - }); - } - - const result = await isLoadMoreContent(); - return result; - }; - - if (await usesInfiniteScroll()){ - pageUrl = page.url(); - await page.screenshot({ - path: imgPath, - clip: { - x: 0, - y: 0, - width: fullPageSize.width, - height: 5400 - }, - fullPage: true, - }); - } else { - pageUrl = page.url(); - await page.screenshot({ path: imgPath, fullPage: true }); - } - await page.setViewportSize(originalSize); - - var isSimilarPage = false; - - if (!urlImageDictionary[pageUrl]) { - urlImageDictionary[pageUrl] = [imgPath]; - return true; - } else { - try { - var currImg = imgPath; - var currImgCanny = currImg.replace(/.[^/.]+$/, '') + '-canny.png'; - spawnSync('convert', [currImg, '-canny', '0x1+10%+30%', currImgCanny]); - - for (const prevImg of urlImageDictionary[pageUrl]) { - var prevImgCanny = prevImg.replace(/.[^/.]+$/, '') + '-canny.png'; - - spawnSync('convert', [prevImg, '-canny', '0x1+10%+30%', prevImgCanny]); - - const nccOutput = spawnSync(compareExe, ['-metric', 'NCC', prevImgCanny, currImgCanny, 'null:']); - - const output = parseFloat(nccOutput.stderr.toString().trim()); - - if (output > 0.5) { - fs.unlink(currImg, err => { - if (err) throw err; - }); - - isSimilarPage = true; - - break; - } - } - - if (!isSimilarPage) { - urlImageDictionary[pageUrl].push(currImg) - return true; - } - - } catch (error) { - 
console.error('error: ', error); - } - } -}; - -const runAxeScan = async page => { - const host = new URL(pageUrl).hostname; - const result = await runAxeScript(page); - await dataset.pushData(result); - urlsCrawled.scanned.push(pageUrl); -} - - -const processPage = async page => { - await page.waitForLoadState('domcontentloaded'); - - if (await checkIfScanRequired(page)) { - if (blacklistedPatterns && isSkippedUrl(page, blacklistedPatterns)) { - return; - } else { - await runAxeScan(page); - } - }; -};`; - - const block2 = ` return urlsCrawled; - })().then(async (urlsCrawled) => { - fs.readdir(intermediateScreenshotsPath, (err, files) => { - if (err) { - console.error(\`Error reading directory: \${err}\`); - return; - } - const filteredFiles = files.filter(file => file.includes('canny')); - - filteredFiles.forEach(file => { - fs.unlink(\`./screenshots/\${file}\`, err => { - if (err) throw err; - }); - }); - }); - - scanDetails.endTime = new Date().getTime(); - scanDetails.urlsCrawled = urlsCrawled; - await createDetailsAndLogs(scanDetails, '${randomToken}'); - await createAndUpdateResultsFolders('${randomToken}'); - createScreenshotsFolder('${randomToken}'); - await generateArtifacts('${randomToken}', '${domain}', 'Customized', '${ - viewportWidth - ? `CustomWidth_${viewportWidth}px` - : customDevice - ? customDevice - : deviceChosen - ? 
deviceChosen - : 'Desktop' - }'); - });`; - - let tmpDir; - const appPrefix = 'purple-hats'; - - if (!fs.existsSync('./custom_flow_scripts')) { - fs.mkdirSync('./custom_flow_scripts'); - } - - const generatedScript = `./custom_flow_scripts/generatedScript-${randomToken}.js`; - - console.log( - ` ℹ️ A new browser will be launched shortly.\n Navigate and record custom steps for ${domain} in the new browser.\n Close the browser when you are done recording your steps.`, - ); - - try { - tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), appPrefix)); - - let browser = 'webkit'; - let userAgentOpts = null; - - // Performance workaround for macOS Big Sur and Windows to force Chromium browser instead of Webkit - if ( - (os.platform() === 'darwin' && os.release().startsWith('20.')) || - os.platform() === 'win32' - ) { - browser = 'chromium'; - - if (deviceChosen === 'Mobile') { - customDevice = 'iPhone 11'; - } - - if (customDevice && !viewportWidth) { - viewportWidth = devices[customDevice].viewport.width; - userAgentOpts = `--user-agent \"${devices[customDevice].userAgent}\"`; - } - } - - let codegenCmd = `npx playwright codegen --target javascript -o ${tmpDir}/intermediateScript.js ${domain}`; - let extraCodegenOpts = `${userAgentOpts} --browser ${browser} --block-service-workers --ignore-https-errors`; - let codegenResult; - - if (viewportWidth || customDevice === 'Specify viewport') { - codegenResult = execSync( - `${codegenCmd} --viewport-size=${viewportWidth},720 ${extraCodegenOpts}`, - ); - } else if (deviceChosen === 'Mobile') { - codegenResult = execSync(`${codegenCmd} --device="iPhone 11" ${extraCodegenOpts}`); - } else if (!customDevice || customDevice === 'Desktop' || deviceChosen === 'Desktop') { - codegenResult = execSync(`${codegenCmd} ${extraCodegenOpts}`); - } else if (customDevice === 'Samsung Galaxy S9+') { - codegenResult = execSync(`${codegenCmd} --device="Galaxy S9+" ${extraCodegenOpts}`); - } else if (customDevice) { - codegenResult = 
execSync(`${codegenCmd} --device="${customDevice}" ${extraCodegenOpts}`); - } else { - console.error( - `Error: Unable to parse device requested for scan. Please check the input parameters.`, - ); - } - - if (codegenResult.toString()) { - console.error(`Error running Codegen: ${codegenResult.toString()}`); - } - - const fileStream = fs.createReadStream(`${tmpDir}/intermediateScript.js`); - - const rl = readline.createInterface({ - input: fileStream, - crlfDelay: Infinity, - }); - - const appendToGeneratedScript = data => { - fs.appendFileSync(generatedScript, `${data}\n`); - }; - - let firstGoToUrl = false; - let lastGoToUrl; - let nextStepNeedsProcessPage = false; - - for await (let line of rl) { - if ( - line.trim() === `const { chromium } = require('playwright');` || - line.trim() === `const { webkit } = require('playwright');` || - line.trim() === `const { chromium, devices } = require('playwright');` || - line.trim() === `const { webkit, devices } = require('playwright');` - ) { - appendToGeneratedScript(block1); - continue; - } - if (line.trim() === `headless: false` && isHeadless) { - appendToGeneratedScript(`headless: true`); - continue; - } - if (line.trim() === `const browser = await webkit.launch({`) { - appendToGeneratedScript(`const browser = await chromium.launch({`); - continue; - } - if (line.trim() === `(async () => {`) { - appendToGeneratedScript(`await (async () => {`); - continue; - } - if (line.trim() === `const page = await context.newPage();`) { - if (deviceChosen === 'Mobile') { - appendToGeneratedScript(line); - appendToGeneratedScript( - ` const pageHeight = page.viewportSize().height - await page.setViewportSize({ - width: 360, - height: pageHeight, - isMobile: true, - });`, - ); - } else if (viewportWidth) { - appendToGeneratedScript(line); - appendToGeneratedScript( - `const pageHeight = page.viewportSize().height - await page.setViewportSize({ - width: ${viewportWidth}, - height: pageHeight, - isMobile: true, - });`, - ); - } else { - 
appendToGeneratedScript(line); - } - continue; - } - - let pageObj = 'page'; - - if (line.trim().startsWith(`await page`)) { - const regexPageObj = /(?<=await )(.*?)(?=\.)/; - pageObj = line.match(regexPageObj)[0]; - } - - if (line.trim().includes(`.goto(`)) { - if (!firstGoToUrl) { - firstGoToUrl = true; - appendToGeneratedScript( - `${line} - await processPage(page); - `, - ); - continue; - } else { - const regexURL = /(?<=goto\(\')(.*?)(?=\'\))/; - const foundURL = line.match(regexURL)[0]; - const withoutParamsURL = foundURL.split('?')[0]; - lastGoToUrl = withoutParamsURL; - continue; - } - } else if (lastGoToUrl) { - appendToGeneratedScript(` - await ${pageObj}.waitForURL('${lastGoToUrl}**',{timeout: 60000}); - await processPage(page); - `); - - lastGoToUrl = null; - } else if (nextStepNeedsProcessPage) { - appendToGeneratedScript(`await processPage(page);`); - nextStepNeedsProcessPage = false; - } - - if (line.trim().includes('getBy') || line.trim().includes('click()')) { - const lastIndex = line.lastIndexOf('.'); - const locator = line.substring(0, lastIndex); - appendToGeneratedScript( - ` (${locator}.count()>1)? [console.log('Please re-click the intended DOM element'), page.setDefaultTimeout(0)]: - ${line} - `, - ); - - nextStepNeedsProcessPage = true; - continue; - } else { - nextStepNeedsProcessPage = false; - } - - if (line.trim() === `await browser.close();`) { - appendToGeneratedScript(line); - appendToGeneratedScript(block2); - break; - } - - appendToGeneratedScript(line); - } - - fileStream.destroy(); - console.log(` Browser closed. Replaying steps and running accessibility scan...\n`); - - await import(generatedScript); - } catch (e) { - console.error(`Error: ${e}`); - throw e; - } finally { - try { - if (tmpDir) { - fs.rmSync(tmpDir, { recursive: true, force: true }); - } - } catch (e) { - console.error( - `An error has occurred while removing the temp folder at ${tmpDir}. Please remove it manually. 
Error: ${e}`, - ); - } - - console.log(`\n You may re-run the recorded steps by executing:\n\tnode ${generatedScript} \n`); - } -}; - -export default playwrightAxeGenerator; +import { execSync } from 'child_process'; +import fs from 'fs'; +import os from 'os'; +import path from 'path'; +import readline from 'readline'; +import safe from 'safe-regex'; +import { devices } from 'playwright'; +import { consoleLogger, silentLogger } from './logs.js'; +import { fileURLToPath } from 'url'; +import { proxy } from './constants/constants.js'; + +// Do NOT remove. These import statements will be used when the custom flow scan is run from the GUI app +import { chromium, webkit } from 'playwright'; +import { createCrawleeSubFolders, runAxeScript } from '#root/crawlers/commonCrawlerFunc.js'; +import { generateArtifacts } from '#root/mergeAxeResults.js'; +import { + createAndUpdateResultsFolders, + createDetailsAndLogs, + createScreenshotsFolder, +} from '#root/utils.js'; +import constants, { + intermediateScreenshotsPath, + getExecutablePath, + removeQuarantineFlag, +} from '#root/constants/constants.js'; +import { isSkippedUrl } from '#root/constants/common.js'; +import { spawnSync } from 'child_process'; +import { getDefaultChromeDataDir, getDefaultEdgeDataDir } from './constants/constants.js'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +const playwrightAxeGenerator = async (domain, data) => { + const blacklistedPatternsFilename = 'exclusions.txt'; + let blacklistedPatterns = null; + + if (fs.existsSync(blacklistedPatternsFilename)) { + blacklistedPatterns = fs.readFileSync(blacklistedPatternsFilename).toString().split('\n'); + + let unsafe = blacklistedPatterns.filter(function (pattern) { + return !safe(pattern); + }); + + if (unsafe.length > 0) { + let unsafeExpressionsError = + "Unsafe expressions detected: '" + + unsafe + + "' Please revise " + + blacklistedPatternsFilename; + 
consoleLogger.error(unsafeExpressionsError); + silentLogger.error(unsafeExpressionsError); + process.exit(1); + } + } + + let { isHeadless, randomToken, deviceChosen, customDevice, viewportWidth } = data; + + // these will be appended to the generated script if the scan is run from CLI/index. + // this is so as the final generated script can be rerun after the scan. + const importStatements = ` + import { chromium, devices, webkit } from 'playwright'; + import { createCrawleeSubFolders, runAxeScript } from '#root/crawlers/commonCrawlerFunc.js'; + import { generateArtifacts } from '#root/mergeAxeResults.js'; + import { createAndUpdateResultsFolders, createDetailsAndLogs, createScreenshotsFolder } from '#root/utils.js'; + import constants, { intermediateScreenshotsPath, getExecutablePath, removeQuarantineFlag } from '#root/constants/constants.js'; + import fs from 'fs'; + import path from 'path'; + import { isSkippedUrl } from '#root/constants/common.js'; + import { spawnSync } from 'child_process'; + import safe from 'safe-regex'; + import { consoleLogger, silentLogger } from '#root/logs.js'; + + `; + const block1 = `const blacklistedPatternsFilename = 'exclusions.txt'; + +process.env.CRAWLEE_STORAGE_DIR = '${randomToken}'; +const compareExe = getExecutablePath('**/ImageMagick*/bin','compare'); + +if (!compareExe) { + let ImagMagickNotFoundError = "Could not find ImageMagick compare. 
Please ensure ImageMagick is installed at current directory."; + consoleLogger.error(ImagMagickNotFoundError); + silentLogger.error(ImagMagickNotFoundError); + process.exit(1); +} + +removeQuarantineFlag('**/ImageMagick*/lib/*.dylib'); +const ImageMagickPath = path.resolve(compareExe, '../../'); +process.env.MAGICK_HOME = ImageMagickPath; +process.env.DYLD_LIBRARY_PATH = ImageMagickPath + '/lib/'; + +const scanDetails = { + startTime: new Date().getTime(), + crawlType: 'Custom Flow', + requestUrl: '${domain}', +}; + +const urlsCrawled = { ...constants.urlsCrawledObj }; +const { dataset } = await createCrawleeSubFolders( + '${randomToken}', +); + +let blacklistedPatterns = null; + +if (fs.existsSync(blacklistedPatternsFilename)) { + blacklistedPatterns = fs.readFileSync(blacklistedPatternsFilename).toString().split('\\n'); + + let unsafe = blacklistedPatterns.filter(function (pattern) { + return !safe(pattern); + }); + + if (unsafe.length > 0) { + let unsafeExpressionsError = + "Unsafe expressions detected: '" + + unsafe + + "' Please revise " + + blacklistedPatternsFilename; + consoleLogger.error(unsafeExpressionsError); + silentLogger.error(unsafeExpressionsError); + process.exit(1); + } +} + +var index = 1; +var urlImageDictionary = {}; +let pageUrl; + +const checkIfScanRequired = async page => { + const imgPath = './screenshots/PHScan-screenshot' + index.toString() + '.png'; + + index += 1; + + const fullPageSize = await page.evaluate(() => { + return { + width: Math.max( + document.body.scrollWidth, + document.documentElement.scrollWidth, + document.body.offsetWidth, + document.documentElement.offsetWidth, + document.body.clientWidth, + document.documentElement.clientWidth, + ), + height: Math.max( + document.body.scrollHeight, + document.documentElement.scrollHeight, + document.body.offsetHeight, + document.documentElement.offsetHeight, + document.body.clientHeight, + document.documentElement.clientHeight, + ), + }; + }); + + const originalSize = 
page.viewportSize(); + await page.setViewportSize(fullPageSize); + const usesInfiniteScroll = async () => { + const prevHeight = await page.evaluate(() => document.body.scrollHeight); + + await page.evaluate(() => { + window.scrollTo(0, document.body.scrollHeight); + }); + + const isLoadMoreContent = async () => { + return new Promise((resolve) => { + setTimeout(async () => { + await page.waitForLoadState('domcontentloaded'); + + const result = await page.evaluate((prevHeight) => { + const currentHeight = document.body.scrollHeight; + return (currentHeight > prevHeight); + }, prevHeight); + + resolve(result); + }, 5000); + }); + } + + const result = await isLoadMoreContent(); + return result; + }; + + if (await usesInfiniteScroll()){ + pageUrl = page.url(); + consoleLogger.info('Screenshot page at: ', pageUrl); + silentLogger.info('Screenshot page at: ', pageUrl); + + await page.screenshot({ + path: imgPath, + clip: { + x: 0, + y: 0, + width: fullPageSize.width, + height: 5400 + }, + fullPage: true, + }); + } else { + pageUrl = page.url(); + await page.screenshot({ path: imgPath, fullPage: true }); + } + await page.setViewportSize(originalSize); + + var isSimilarPage = false; + + if (!urlImageDictionary[pageUrl]) { + urlImageDictionary[pageUrl] = [imgPath]; + return true; + } else { + try { + var currImg = imgPath; + var currImgCanny = currImg.replace(/.[^/.]+$/, '') + '-canny.png'; + spawnSync('convert', [currImg, '-canny', '0x1+10%+30%', currImgCanny]); + + for (const prevImg of urlImageDictionary[pageUrl]) { + var prevImgCanny = prevImg.replace(/.[^/.]+$/, '') + '-canny.png'; + + spawnSync('convert', [prevImg, '-canny', '0x1+10%+30%', prevImgCanny]); + + const nccOutput = spawnSync(compareExe, ['-metric', 'NCC', prevImgCanny, currImgCanny, 'null:']); + + const output = parseFloat(nccOutput.stderr.toString().trim()); + + if (output > 0.5) { + fs.unlink(currImg, err => { + if (err) throw err; + }); + + isSimilarPage = true; + + break; + } + } + + if 
(!isSimilarPage) { + urlImageDictionary[pageUrl].push(currImg) + return true; + } + + } catch (error) { + console.error('error: ', error); + } + } +}; + +const runAxeScan = async page => { + const result = await runAxeScript(page); + await dataset.pushData(result); + urlsCrawled.scanned.push(page.url()); +} + + +const processPage = async page => { + try { + await page.waitForLoadState('networkidle', {'timeout': 10000 }); + } catch (e) { + consoleLogger.info('Unable to detect networkidle'); + silentLogger.info('Unable to detect networkidle'); + } + + consoleLogger.info('Visiting page at: ',page.url()); + silentLogger.info('Visiting page at: ',page.url()); + + if (blacklistedPatterns && isSkippedUrl(page, blacklistedPatterns)) { + return; + } else { + const scanRequired = await checkIfScanRequired(page); + + if (scanRequired) { + await runAxeScan(page); + } + } + + +};`; + + const block2 = ` return urlsCrawled; + })().then(async (urlsCrawled) => { + fs.readdir(intermediateScreenshotsPath, (err, files) => { + if (err) { + console.error(\`Error reading directory: \${err}\`); + return; + } + const filteredFiles = files.filter(file => file.includes('canny')); + + filteredFiles.forEach(file => { + fs.unlink(\`./screenshots/\${file}\`, err => { + if (err) throw err; + }); + }); + }); + + scanDetails.endTime = new Date().getTime(); + scanDetails.urlsCrawled = urlsCrawled; + await createDetailsAndLogs(scanDetails, '${randomToken}'); + await createAndUpdateResultsFolders('${randomToken}'); + createScreenshotsFolder('${randomToken}'); + await generateArtifacts('${randomToken}', '${domain}', 'Customized', '${ + viewportWidth + ? `CustomWidth_${viewportWidth}px` + : customDevice + ? customDevice + : deviceChosen + ? 
deviceChosen + : 'Desktop' + }'); + });`; + + let tmpDir; + const appPrefix = 'purple-hats'; + + if (!fs.existsSync('./custom_flow_scripts')) { + fs.mkdirSync('./custom_flow_scripts'); + } + + const generatedScript = `./custom_flow_scripts/generatedScript-${randomToken}.js`; + + console.log( + ` ℹ️ A new browser will be launched shortly.\n Navigate and record custom steps for ${domain} in the new browser.\n Close the browser when you are done recording your steps.`, + ); + + try { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), appPrefix)); + + let browser = 'webkit'; + let userAgentOpts = null; + let channel = null; + + // Performance workaround for macOS Big Sur and Windows to force Chromium browser instead of Webkit + if ( + (os.platform() === 'darwin' && os.release().startsWith('20.')) || + os.platform() === 'win32' + ) { + browser = 'chromium'; + + if (deviceChosen === 'Mobile') { + customDevice = 'iPhone 11'; + } + + if (customDevice && !viewportWidth) { + viewportWidth = devices[customDevice].viewport.width; + userAgentOpts = `--user-agent \"${devices[customDevice].userAgent}\"`; + } + } + + if (os.platform() === 'win32' && getDefaultChromeDataDir()) { + channel = 'chrome'; + } + + let codegenCmd = `npx playwright codegen --target javascript -o ${tmpDir}/intermediateScript.js ${domain}`; + let extraCodegenOpts = `${userAgentOpts} --browser ${browser} --block-service-workers --ignore-https-errors ${ + channel && `--channel ${channel}` + }`; + + if (viewportWidth || customDevice === 'Specify viewport') { + codegenCmd = `${codegenCmd} --viewport-size=${viewportWidth},720 ${extraCodegenOpts}`; + } else if (deviceChosen === 'Mobile') { + codegenCmd = `${codegenCmd} --device="iPhone 11" ${extraCodegenOpts}`; + } else if (!customDevice || customDevice === 'Desktop' || deviceChosen === 'Desktop') { + codegenCmd = `${codegenCmd} ${extraCodegenOpts}`; + } else if (customDevice === 'Samsung Galaxy S9+') { + codegenCmd = `${codegenCmd} --device="Galaxy S9+" 
${extraCodegenOpts}`; + } else if (customDevice) { + codegenCmd = `${codegenCmd} --device="${customDevice}" ${extraCodegenOpts}`; + } else { + console.error( + `Error: Unable to parse device requested for scan. Please check the input parameters.`, + ); + } + + const codegenResult = execSync(codegenCmd, { cwd: __dirname }); + + if (codegenResult.toString()) { + console.error(`Error running Codegen: ${codegenResult.toString()}`); + } + + const fileStream = fs.createReadStream(`${tmpDir}/intermediateScript.js`); + + const rl = readline.createInterface({ + input: fileStream, + crlfDelay: Infinity, + }); + + const appendToGeneratedScript = data => { + fs.appendFileSync(generatedScript, `${data}\n`); + }; + + let firstGoToUrl = false; + let lastGoToUrl; + let nextStepNeedsProcessPage = false; + + // used when running a scan on a machine with proxy + let awaitingProxyLogin = false; + let secondGotoMicrosoftLoginSeen = false; + + if (!process.env.RUNNING_FROM_PH_GUI) { + appendToGeneratedScript(importStatements); + } + + for await (let line of rl) { + if (/page\d.close\(\)/.test(line.trim())){ + const handleUndefinedPageBlock = `try{ + ${line} + } catch(err){ + console.log(err) + }` + appendToGeneratedScript(handleUndefinedPageBlock) + continue; + } + + if ( + line.trim() === `const { chromium } = require('playwright');` || + line.trim() === `const { webkit } = require('playwright');` || + line.trim() === `const { chromium, devices } = require('playwright');` || + line.trim() === `const { webkit, devices } = require('playwright');` + ) { + appendToGeneratedScript(block1); + continue; + } + if (line.trim() === `headless: false`) { + if (proxy) { + appendToGeneratedScript(`slowMo: 100,`); + if (proxy.type === 'autoConfig') { + appendToGeneratedScript(`args: ['--proxy-pac-url=${proxy.url}'],`); + } else { + appendToGeneratedScript(`args: ['--proxy-server=${proxy.url}'],`); + } + } + if (!proxy && isHeadless) { + appendToGeneratedScript(`headless: true`); + continue; + } + } + 
if (line.trim() === `const browser = await webkit.launch({`) { + appendToGeneratedScript(`const browser = await chromium.launch({`); + continue; + } + if (line.trim() === `(async () => {`) { + appendToGeneratedScript(`await (async () => {`); + continue; + } + if (line.trim() === `const page = await context.newPage();`) { + if (deviceChosen === 'Mobile') { + appendToGeneratedScript(line); + appendToGeneratedScript( + ` const pageHeight = page.viewportSize().height + await page.setViewportSize({ + width: 360, + height: pageHeight, + isMobile: true, + });`, + ); + } else if (viewportWidth) { + appendToGeneratedScript(line); + appendToGeneratedScript( + `const pageHeight = page.viewportSize().height + await page.setViewportSize({ + width: ${viewportWidth}, + height: pageHeight, + isMobile: true, + });`, + ); + } else { + appendToGeneratedScript(line); + } + continue; + } + + let pageObj = 'page'; + + if (line.trim().startsWith(`await page`)) { + const regexPageObj = /(?<=await )(.*?)(?=\.)/; + pageObj = line.match(regexPageObj)[0]; + } + + if (proxy && line.trim().startsWith(`await page.goto('https://login.microsoftonline.com/`)) { + if (!awaitingProxyLogin) { + awaitingProxyLogin = true; + continue; + } else if (!secondGotoMicrosoftLoginSeen) { + secondGotoMicrosoftLoginSeen = true; + continue; + } + } + + if (awaitingProxyLogin) { + if (line.trim().startsWith(`await page.goto('${domain}`)) { + awaitingProxyLogin = false; + } else { + continue; + } + } + + if (line.trim().includes(`.goto(`)) { + if (!firstGoToUrl) { + if (line.trim().startsWith(`await page.goto('https://login.singpass.gov.sg`)) { + continue; + } + firstGoToUrl = true; + const firstGoToAddress = line.split(`('`)[1].split(`')`)[0]; + appendToGeneratedScript( + `${line} + await page.waitForURL('${firstGoToAddress}', {timeout: 60000}); + await processPage(page); + `, + ); + continue; + } else { + const regexURL = /(?<=goto\(\')(.*?)(?=\'\))/; + const foundURL = line.match(regexURL)[0]; + const 
withoutParamsURL = foundURL.split('?')[0]; + lastGoToUrl = withoutParamsURL; + continue; + } + } else if (lastGoToUrl) { + appendToGeneratedScript(` + await ${pageObj}.waitForURL('${lastGoToUrl}**',{timeout: 60000}); + await processPage(page); + `); + + lastGoToUrl = null; + } else if (nextStepNeedsProcessPage) { + appendToGeneratedScript(`await processPage(page);`); + nextStepNeedsProcessPage = false; + } + + if ( + (line.trim().includes('getBy') && !line.trim().includes('getByPlaceholder')) || + line.trim().includes('click()') + ) { + const lastIndex = line.lastIndexOf('.'); + const locator = line.substring(0, lastIndex); + appendToGeneratedScript( + ` (${locator}.count()>1)? [console.log('Please re-click the intended DOM element'), page.setDefaultTimeout(0)]: + ${line} + `, + ); + + nextStepNeedsProcessPage = true; + continue; + } else { + nextStepNeedsProcessPage = false; + } + + if (line.trim() === `await browser.close();`) { + appendToGeneratedScript(line); + appendToGeneratedScript(block2); + break; + } + + appendToGeneratedScript(line); + } + + fileStream.destroy(); + console.log(` Browser closed. Replaying steps and running accessibility scan...\n`); + + if (process.env.RUNNING_FROM_PH_GUI) { + const genScriptString = fs.readFileSync(generatedScript, 'utf-8'); + const genScriptCompleted = new Promise((resolve, reject) => { + eval(`(async () => { + try { + ${genScriptString} + resolve(); + } catch (e) { + reject(e) + } + })();`); + }); + await genScriptCompleted; + } else { + await import(generatedScript); + } + } catch (e) { + console.error(`Error: ${e}`); + throw e; + } finally { + try { + if (tmpDir) { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + } catch (e) { + console.error( + `An error has occurred while removing the temp folder at ${tmpDir}. Please remove it manually. 
/**
 * Synchronously packs the contents of a results folder into a zip archive.
 *
 * Any file already present at `zipName` is deleted first, so the archiver
 * always writes a fresh archive.
 *
 * On Windows the archive is produced by PowerShell (`Get-ChildItem` piped into
 * `Compress-Archive`). On every other platform `/usr/bin/zip` is executed
 * directly, without a shell.
 *
 * @param {string} zipName - Path of the zip file to create.
 * @param {string} resultsPath - Folder whose contents are archived.
 * @returns {void}
 * @throws {Error} Propagates, unchanged, whatever `execSync` / `execFileSync`
 *   throws when the archiver cannot be started or exits with a non-zero status.
 */
export const zipResults = (zipName, resultsPath) => {
  // Remove a zip left over from a previous run before creating the new one.
  if (fs.existsSync(zipName)) {
    fs.unlinkSync(zipName);
  }

  if (os.platform() === 'win32') {
    // NOTE(review): both paths are interpolated into the PowerShell command
    // line; a path containing a double quote would break the quoting.
    execSync(
      `Get-ChildItem -Path "${resultsPath}\\*.*" -Recurse | Compress-Archive -DestinationPath "${zipName}"`,
      { shell: 'powershell.exe' },
    );
    return;
  }

  // Zip the files recursively (-r) and junk the directory paths (-j), i.e. only
  // the content of the results folder is stored, not the folder path itself.
  // Errors are deliberately left to propagate: catching them only to rethrow
  // adds nothing.
  execFileSync('/usr/bin/zip', ['-r', '-j', zipName, resultsPath]);
};