Skip to content

Commit

Permalink
Allow TXT sitemaps to be scanned and add support for Windows paths (#371)
Browse files Browse the repository at this point in the history
* Added Windows drive-letter path detection to isFilePath

* Added support for TXT sitemaps

* Handle backslash path separators on Windows

* Support local file scans on Windows without needing to install Playwright browsers via npx

* Bump package version

---------

Co-authored-by: younglim <[email protected]>
  • Loading branch information
angyonghaseyo and younglim authored Jun 21, 2024
1 parent 939ff74 commit 877bbf0
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 26 deletions.
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "@govtechsg/purple-hats",
"main": "dist/npmIndex.js",
"version": "0.10.4",
"version": "0.10.5",
"type": "module",
"imports": {
"#root/*.js": "./dist/*.js"
Expand Down
11 changes: 8 additions & 3 deletions src/constants/common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -988,7 +988,7 @@ export const getLinksFromSitemap = async (
if (isLimitReached()) {
break;
}
if (childSitemapUrlText.endsWith('.xml')) {
if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
await fetchUrls(childSitemapUrlText); // Recursive call for nested sitemaps
} else {
addToUrlList(childSitemapUrlText); // Add regular URLs to the list
Expand Down Expand Up @@ -1020,7 +1020,7 @@ export const getLinksFromSitemap = async (
}

const requestList = Object.values(urls);

return requestList;
};

Expand Down Expand Up @@ -1786,7 +1786,12 @@ function isValidHttpUrl(urlString) {
}

/**
 * Reports whether `url` refers to a local file rather than a remote resource.
 *
 * A value is treated as a file path when it:
 * - starts with the `file://` scheme,
 * - starts with `/` (POSIX-style absolute path),
 * - starts with a drive letter followed by a colon (e.g. `C:`), or
 * - contains a backslash anywhere (Windows path separator).
 *
 * @param url - The URL or path string to classify.
 * @returns `true` when any of the checks above matches, otherwise `false`.
 */
export const isFilePath = (url: string): boolean => {
  // Case-insensitive so both `C:` and `c:` are recognised as drive prefixes.
  const driveLetterPattern = /^[A-Z]:/i;
  // Any backslash marks the value as a Windows-style path (absolute or relative).
  const backslashPattern = /\\/;

  // NOTE: the previous two-check return was removed; leaving it in place made
  // the Windows checks below unreachable.
  return (
    url.startsWith('file://') ||
    url.startsWith('/') ||
    driveLetterPattern.test(url) ||
    backslashPattern.test(url)
  );
};

export function convertLocalFileToPath(url: string): string {
Expand Down
29 changes: 9 additions & 20 deletions src/crawlers/crawlLocalFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import {
failedRequestHandler,
isUrlPdf,
} from './commonCrawlerFunc.js';

import constants, { guiInfoStatusTypes, basicAuthRegex } from '../constants/constants.js';
import {
getLinksFromSitemap,
Expand Down Expand Up @@ -74,9 +73,9 @@ const crawlLocalFile = async (
convertLocalFileToPath(sitemapUrl);

// XML Files
if (!sitemapUrl.match(/\.xml$/i)) {
console.log((!sitemapUrl.match(/\.txt$/i)))
if (!(sitemapUrl.match(/\.xml$/i) || sitemapUrl.match(/\.txt$/i))) {
linksFromSitemap = [new Request({ url: sitemapUrl })];

// Non XML file
} else {
const username = '';
Expand Down Expand Up @@ -145,23 +144,13 @@ const crawlLocalFile = async (
uuidToPdfMapping[pdfFileName] = trimmedUrl;

if (!isUrlPdf(request.url)) {
let browserUsed;
// Playwright only supports chromium,firefox and webkit thus hardcoded to chromium
if (browser === 'chromium') {
browserUsed = await playwright.chromium.launch();
} else if (browser === 'firefox') {
browserUsed = await playwright.firefox.launch();
} else if (browser === 'webkit') {
browserUsed = await playwright.webkit.launch();
} else if (browser === 'chrome') {
browserUsed = await playwright.chromium.launch(); //chrome not supported, default to chromium
} else {
console.log('Browser not supported, please use chrome, chromium, firefox, webkit');
console.log(' ');
return;
}
const context = await browserUsed.newContext();
const page = await context.newPage();

const browserContext = await constants.launcher.launchPersistentContext('', {
headless: process.env.CRAWLEE_HEADLESS === '1',
...getPlaywrightLaunchOptions(browser),
});

const page = await browserContext.newPage();
request.url = convertPathToLocalFile(request.url);
await page.goto(request.url);
const results = await runAxeScript(includeScreenshots, page, randomToken, null);
Expand Down

0 comments on commit 877bbf0

Please sign in to comment.