Skip to content

Commit

Permalink
Scan Engine Improvements (#139)
Browse files Browse the repository at this point in the history
* Ability to set `maxConcurrency` to slow down scan rate to mitigate sites with Web Application Firewall (WAF) throttling requests
* Omit scanning pages that return a non-`200` status code
* Omit axe reported false-positive and needs review accessibility findings to simplify triage
* Omit scanning URLs that would render as a page when the URL ends with a known file extension
* Ability to load a URL containing GET request parameters / a form response as the starting page
* Update dependencies

---------

Co-authored-by: Georgetxm
Co-authored-by: magdeline
Co-authored-by: greyguy21
  • Loading branch information
younglim authored Jul 21, 2023
1 parent 83af5d7 commit 69d3c53
Show file tree
Hide file tree
Showing 14 changed files with 127 additions and 64 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,9 @@ Options:
[string] [required]
-j, --customFlowLabel Give Custom Flow Scan a label for easier reference in t
he report [string]
-t, --specifiedMaxConcurrency Maximum number of pages to scan concurrently.
Use for sites with throttling. Defaults to 25.
[number]
Examples:
To scan sitemap of website:', 'node cli.js -c [ 1 | Sitemap ] -d <device> -u
<url_link> -w <viewportWidth>
Expand Down
Binary file added a11y-scan-results.zip
Binary file not shown.
12 changes: 11 additions & 1 deletion cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,16 @@ Usage: node cli.js -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,

return option;
})
.coerce('t', option => {
if (!Number.isInteger(option) || Number(option) <= 0) {
printMessage(
[`Invalid number for max concurrency. Please provide a positive integer.`],
messageOptions,
);
process.exit(1);
}
return option;
})
.coerce('k', nameEmail => {
if (nameEmail.indexOf(':') === -1) {
printMessage(
Expand Down Expand Up @@ -168,7 +178,7 @@ const scanInit = async argvs => {
argvs.scanner = constants.scannerTypes[argvs.scanner];
argvs.headless = argvs.headless === 'yes';
argvs.browserToRun = constants.browserTypes[argvs.browserToRun];

let useChrome = false;
let useEdge = false;
let chromeDataDir = null;
Expand Down
11 changes: 10 additions & 1 deletion combine.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ const combineRun = async (details, deviceToScan) => {
browser,
userDataDirectory,
strategy,
specifiedMaxConcurrency,
} = envDetails;

process.env.CRAWLEE_STORAGE_DIR = randomToken;
Expand Down Expand Up @@ -62,6 +63,7 @@ const combineRun = async (details, deviceToScan) => {
maxRequestsPerCrawl,
browser,
userDataDirectory,
specifiedMaxConcurrency,
);
break;

Expand All @@ -75,6 +77,7 @@ const combineRun = async (details, deviceToScan) => {
browser,
userDataDirectory,
strategy,
specifiedMaxConcurrency,
);
break;

Expand All @@ -88,7 +91,13 @@ const combineRun = async (details, deviceToScan) => {

if (scanDetails.urlsCrawled.scanned.length > 0) {
await createAndUpdateResultsFolders(randomToken);
const basicFormHTMLSnippet = await generateArtifacts(randomToken, url, type, deviceToScan, urlsCrawled.scanned);
const basicFormHTMLSnippet = await generateArtifacts(
randomToken,
url,
type,
deviceToScan,
urlsCrawled.scanned,
);
const [name, email] = nameEmail.split(':');
await submitFormViaPlaywright(
browser,
Expand Down
11 changes: 9 additions & 2 deletions constants/cliFunctions.js
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,22 @@ export const cliOptions = {
alias: 'customFlowLabel',
describe: 'Give Custom Flow Scan a label for easier reference in the report',
type: 'string',
requiresArg: true,
demandOption: false
requiresArg: true,
demandOption: false,
},
k: {
alias: 'nameEmail',
describe: `To personalise your experience, we will be collecting your name, email address and app usage data. Your information fully complies with GovTech’s Privacy Policy. Please provice your name and email address in this format "John Doe:[email protected]".`,
type: 'string',
demandOption: true,
},
t: {
alias: 'specifiedMaxConcurrency',
describe:
'Maximum number of pages to scan concurrently. Use for sites with throttling. Defaults to 25.',
type: 'number',
demandOption: false,
},
};

export const configureReportSetting = isEnabled => {
Expand Down
10 changes: 9 additions & 1 deletion constants/common.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import constants, {
formDataFields,
whitelistedAttributes,
mutedAttributeValues,
blackListedFileExtensions,
} from './constants.js';
import { silentLogger } from '../logs.js';

Expand Down Expand Up @@ -100,6 +101,11 @@ export const sortAlphaAttributes = htmlString => {
return entireHtml;
};

// Returns true when the URL's final path segment ends with a file extension in
// the supplied blacklist (e.g. 'pdf', 'zip'), so the crawler can skip URLs that
// point at downloadable assets rather than scannable HTML pages.
// Query strings and fragments are ignored ('app.js?v=2' matches 'js') and the
// comparison is case-insensitive ('a.PDF' matches 'pdf'). A URL whose last path
// segment contains no dot never matches.
export const isBlacklistedFileExtensions = (url, blacklistedFileExtensions) => {
  // Strip the query string and fragment so '?v=2' / '#top' don't pollute the extension.
  const path = url.split(/[?#]/)[0];
  // Only the last path segment can carry the file extension.
  const lastSegment = path.substring(path.lastIndexOf('/') + 1);
  const dotIndex = lastSegment.lastIndexOf('.');
  if (dotIndex === -1) return false; // no extension present
  const urlExtension = lastSegment.slice(dotIndex + 1).toLowerCase();
  return blacklistedFileExtensions.includes(urlExtension);
};

const document = new JSDOM('').window;

const httpsAgent = new https.Agent({
Expand Down Expand Up @@ -422,6 +428,7 @@ export const prepareData = argv => {
browserToRun,
nameEmail,
customFlowLabel,
specifiedMaxConcurrency,
} = argv;

return {
Expand All @@ -437,7 +444,8 @@ export const prepareData = argv => {
isLocalSitemap,
browser: browserToRun,
nameEmail,
customFlowLabel
customFlowLabel,
specifiedMaxConcurrency,
};
};

Expand Down
20 changes: 18 additions & 2 deletions constants/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,22 @@ export const mutedAttributeValues = [
`aria-labelledby`,
];

// File extensions that should never be scanned for accessibility issues:
// URLs ending in these point at static assets or downloads (scripts, styles,
// media, archives) rather than renderable HTML pages, so the crawler skips them.
export const blackListedFileExtensions = [
  'css',
  'js',
  'txt',
  'mp3',
  'mp4',
  'jpg',
  'jpeg',
  'png',
  'svg',
  'gif',
  'woff',
  'pdf',
  'zip',
];

export const intermediateScreenshotsPath = './screenshots';
export const destinationPath = storagePath => `${storagePath}/screenshots`;

Expand Down Expand Up @@ -237,7 +253,7 @@ export const impactOrder = {
};

export const formDataFields = {
formUrl: `https://docs.google.com/forms/d/e/1FAIpQLSem5C8fyNs5TiU5Vv2Y63-SH7CHN86f-LEPxeN_1u_ldUbgUA/formResponse`,
formUrl: `https://docs.google.com/forms/d/e/1FAIpQLSeUmqoVRSvMrW1DRi1KNMemWyKvDbEWGJp2dve4qb8QB3Zgvw/formResponse`,
websiteUrlField: 'entry.1562345227',
scanTypeField: 'entry.1148680657',
emailField: 'entry.52161304',
Expand Down Expand Up @@ -284,7 +300,7 @@ export default {
allIssueFileName: 'all_issues',
cliZipFileName: 'a11y-scan-results.zip',
maxRequestsPerCrawl,
maxConcurrency: 50,
maxConcurrency: 25,
scannerTypes,
browserTypes,
urlsCrawledObj,
Expand Down
17 changes: 7 additions & 10 deletions crawlers/commonCrawlerFunc.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
/* eslint-disable no-unused-vars */
/* eslint-disable no-param-reassign */
import crawlee from 'crawlee';
import crawlee, { playwrightUtils } from 'crawlee';
import axe from 'axe-core';
import { axeScript, saflyIconSelector } from '../constants/constants.js';

export const filterAxeResults = (results, pageTitle) => {
const { violations, incomplete, passes, url } = results;
const { violations, passes, url } = results;

let totalItems = 0;
const mustFix = { totalItems: 0, rules: {} };
const goodToFix = { totalItems: 0, rules: {} };
const passed = { totalItems: 0, rules: {} };

const process = (item, needsReview = false) => {
const process = (item) => {
const { id: rule, help: description, helpUrl, tags, nodes } = item;

if (rule === 'frame-tested') return;
Expand All @@ -24,11 +24,9 @@ export const filterAxeResults = (results, pageTitle) => {
if (!(rule in category.rules)) {
category.rules[rule] = { description, helpUrl, conformance, totalItems: 0, items: [] };
}
const message = needsReview
? failureSummary.slice(failureSummary.indexOf('\n') + 1).trim()
: failureSummary;
const message = failureSummary;
category.rules[rule].items.push(
needsReview ? { html, message, needsReview } : { html, message },
{ html, message },
);
category.rules[rule].totalItems += 1;
category.totalItems += 1;
Expand All @@ -46,7 +44,6 @@ export const filterAxeResults = (results, pageTitle) => {
};

violations.forEach(item => process(item));
incomplete.forEach(item => process(item, true));

passes.forEach(item => {
const { id: rule, help: description, helpUrl, tags, nodes } = item;
Expand Down Expand Up @@ -91,7 +88,7 @@ export const runAxeScript = async (page, selectors = []) => {
},
});
return axe.run(selectors, {
resultTypes: ['violations', 'passes', 'incomplete'],
resultTypes: ['violations', 'passes'],
});
},
{ selectors, saflyIconSelector },
Expand All @@ -115,4 +112,4 @@ export const preNavigationHooks = [

export const failedRequestHandler = async ({ request }) => {
crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
};
};
33 changes: 26 additions & 7 deletions crawlers/crawlDomain.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import crawlee from 'crawlee';
import crawlee, { playwrightUtils } from 'crawlee';
import {
createCrawleeSubFolders,
preNavigationHooks,
runAxeScript,
failedRequestHandler,
} from './commonCrawlerFunc.js';
import constants, { basicAuthRegex } from '../constants/constants.js';
import { getPlaywrightLaunchOptions } from '../constants/common.js';
import constants, { basicAuthRegex, blackListedFileExtensions } from '../constants/constants.js';
import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions } from '../constants/common.js';

const crawlDomain = async (
url,
Expand All @@ -17,6 +17,7 @@ const crawlDomain = async (
browser,
userDataDirectory,
strategy,
specifiedMaxConcurrency,
) => {
const urlsCrawled = { ...constants.urlsCrawledObj };
const { maxConcurrency } = constants;
Expand Down Expand Up @@ -70,21 +71,39 @@ const crawlDomain = async (
},
requestQueue,
preNavigationHooks,
requestHandler: async ({ page, request, enqueueLinks, enqueueLinksByClickingElements }) => {
requestHandler: async ({
page,
request,
response,
enqueueLinks,
enqueueLinksByClickingElements,
}) => {
const currentUrl = request.url;

if (isBlacklistedFileExtensions(currentUrl, blackListedFileExtensions)) {
urlsCrawled.invalid.push(currentUrl);
return;
}

if (response.status() !== 200) {
urlsCrawled.invalid.push(request.url);
return;
}

if (pagesCrawled === maxRequestsPerCrawl) {
urlsCrawled.invalid.push(request.url);
return;
}
pagesCrawled++;

const currentUrl = request.url;
const location = await page.evaluate('location');

if (isBasicAuth) {
isBasicAuth = false;
} else if (location.host.includes(host)) {
const results = await runAxeScript(page);
await dataset.pushData(results);
urlsCrawled.scanned.push({url: currentUrl, pageTitle: results.pageTitle});
urlsCrawled.scanned.push({ url: currentUrl, pageTitle: results.pageTitle });

await enqueueLinks({
// set selector matches anchor elements with href but not contains # or starting with mailto:
Expand Down Expand Up @@ -116,7 +135,7 @@ const crawlDomain = async (
},
failedRequestHandler,
maxRequestsPerCrawl,
maxConcurrency,
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
});

await crawler.run();
Expand Down
5 changes: 3 additions & 2 deletions crawlers/crawlSitemap.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ const crawlSitemap = async (
maxRequestsPerCrawl,
browser,
userDataDirectory,
specifiedMaxConcurrency,
) => {
const urlsCrawled = { ...constants.urlsCrawledObj };
const { playwrightDeviceDetailsObject } = viewportSettings;
Expand Down Expand Up @@ -65,14 +66,14 @@ const crawlSitemap = async (
if (status === 200 && isWhitelistedContentType(contentType)) {
const results = await runAxeScript(page);
await dataset.pushData(results);
urlsCrawled.scanned.push({url: currentUrl, pageTitle: results.pageTitle});
urlsCrawled.scanned.push({ url: currentUrl, pageTitle: results.pageTitle });
} else {
urlsCrawled.invalid.push(currentUrl);
}
},
failedRequestHandler,
maxRequestsPerCrawl,
maxConcurrency,
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
});

await crawler.run();
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"glob": "^9.1.2",
"inquirer": "^9.1.4",
"jsdom": "^21.0.0",
"playwright": "1.32.1",
"playwright": "1.36.1",
"print-message": "^3.0.1",
"safe-regex": "^2.1.1",
"validator": "^13.7.0",
Expand Down
4 changes: 2 additions & 2 deletions playwrightAxeGenerator.js
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ const processPage = async page => {
try {
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), appPrefix));

let browser = 'webkit';
let browser = 'chromium';
let userAgentOpts = null;
let channel = null;

Expand All @@ -360,7 +360,7 @@ const processPage = async page => {
channel = 'chrome';
}

let codegenCmd = `npx playwright codegen --target javascript -o ${tmpDir}/intermediateScript.js ${data.url}`;
let codegenCmd = `npx playwright codegen --target javascript -o "${tmpDir}/intermediateScript.js" "${data.url}"`;
let extraCodegenOpts = `${userAgentOpts} --browser ${browser} --block-service-workers --ignore-https-errors ${
channel && `--channel ${channel}`
}`;
Expand Down
Loading

0 comments on commit 69d3c53

Please sign in to comment.