diff --git a/crawlers/crawlDomain.js b/crawlers/crawlDomain.js
index e1cf9768..dc589691 100644
--- a/crawlers/crawlDomain.js
+++ b/crawlers/crawlDomain.js
@@ -20,8 +20,7 @@ import {
 import { areLinksEqual } from '../utils.js';
 import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
 import fs from 'fs';
-import { guiInfoLog } from '../logs.js';
-import { chromium } from 'playwright';
+import { silentLogger, guiInfoLog } from '../logs.js';
 
 const crawlDomain = async (
   url,
@@ -106,22 +105,78 @@ const crawlDomain = async (
       };
     });
 
-    await enqueueLinksByClickingElements({
-      // set selector matches
-      // NOT
-      // IS role='link' or button onclick
-      // enqueue new page URL
-      // handle onclick
-      selector: ':not(a):is([role="link"], button[onclick])',
-      transformRequestFunction(req) {
-        req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
-        if (isUrlPdf(req.url)) {
-          // playwright headless mode does not support navigation to pdf document
-          req.skipNavigation = true;
+    const handleOnClickEvent = async () => {
+      // Intercepting click events to handle cases where the request was issued before the frame is created
+      // when a new tab is opened
+      await page.context().route('**', async route => {
+        if (route.request().resourceType() === 'document') {
+          try {
+            const isTopFrameNavigationRequest = () => {
+              return route.request().isNavigationRequest()
+                && route.request().frame() === page.mainFrame();
+            }
+
+            if (isTopFrameNavigationRequest()) {
+              await requestQueue.addRequest({ url, skipNavigation: isUrlPdf(url) });
+              await route.abort('aborted');
+            } else {
+              route.continue();
+            }
+          } catch (e) {
+            silentLogger.info(e);
+            route.continue();
+          }
         }
-        return req;
-      },
-    });
+      })
+    }
+    await page.exposeFunction('handleOnClickEvent', handleOnClickEvent)
+
+    await page.evaluate(() => {
+      document.addEventListener('click', (event) => handleOnClickEvent(event));
+    })
+
+    page.on('request', async request => {
+      // Intercepting requests to handle cases where the request was issued before the frame is created
+      await page.context().route(request.url(), async route => {
+        try {
+          const isTopFrameNavigationRequest = () => {
+            return route.request().isNavigationRequest()
+              && route.request().frame() === page.mainFrame();
+          }
+
+          if (route.request().resourceType() === 'document') {
+            if (isTopFrameNavigationRequest()) {
+              await requestQueue.addRequest({ url, skipNavigation: isUrlPdf(url) });
+            }
+          }
+        } catch (e) {
+          silentLogger.info(e);
+        }
+      })
+    })
+
+    // Try-catch is necessary because clicking links is best effort; it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
+    try {
+      await enqueueLinksByClickingElements({
+        // set selector matches
+        // NOT
+        // IS role='link' or button onclick
+        // enqueue new page URL
+        // handle onclick
+        selector: ':not(a):is([role="link"], button[onclick])',
+        transformRequestFunction(req) {
+          req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+          if (isUrlPdf(req.url)) {
+            // playwright headless mode does not support navigation to pdf document
+            req.skipNavigation = true;
+          }
+          return req;
+        },
+        waitForPageIdleSecs: 10000
+      })
+    } catch (e) {
+      silentLogger.info(e);
+    }
   };
 
   const crawler = new crawlee.PlaywrightCrawler({
@@ -233,57 +288,66 @@ const crawlDomain = async (
       pagesCrawled += 1;
 
-      if (isBasicAuth) {
-        isBasicAuth = false;
-      } else {
-        if (isScanHtml) {
-          const results = await runAxeScript(needsReview, includeScreenshots, page, randomToken);
-          guiInfoLog(guiInfoStatusTypes.SCANNED, {
-            numScanned: urlsCrawled.scanned.length,
-            urlScanned: request.url,
-          });
-
-          // For deduplication, if the URL is redirected, we want to store the original URL and the redirected URL (actualUrl)
-          const isRedirected = !areLinksEqual(request.loadedUrl, request.url);
-          if (isRedirected) {
-            const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
-              item => (item.actualUrl || item.url) === request.loadedUrl,
-            );
-
-            if (isLoadedUrlInCrawledUrls) {
-              urlsCrawled.notScannedRedirects.push({
+      try {
+        if (isBasicAuth) {
+          isBasicAuth = false;
+        } else {
+          if (isScanHtml) {
+            const results = await runAxeScript(needsReview, includeScreenshots, page, randomToken);
+            guiInfoLog(guiInfoStatusTypes.SCANNED, {
+              numScanned: urlsCrawled.scanned.length,
+              urlScanned: request.url,
+            });
+
+            // For deduplication, if the URL is redirected, we want to store the original URL and the redirected URL (actualUrl)
+            const isRedirected = !areLinksEqual(request.loadedUrl, request.url);
+            if (isRedirected) {
+              const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
+                item => (item.actualUrl || item.url) === request.loadedUrl,
+              );
+
+              if (isLoadedUrlInCrawledUrls) {
+                urlsCrawled.notScannedRedirects.push({
+                  fromUrl: request.url,
+                  toUrl: request.loadedUrl, // i.e. actualUrl
+                });
+                return;
+              }
+
+              urlsCrawled.scanned.push({
+                url: request.url,
+                pageTitle: results.pageTitle,
+                actualUrl: request.loadedUrl, // i.e. actualUrl
+              });
+
+              urlsCrawled.scannedRedirects.push({
                 fromUrl: request.url,
                 toUrl: request.loadedUrl, // i.e. actualUrl
               });
-              return;
+
+              results.url = request.url;
+              results.actualUrl = request.loadedUrl;
+            } else {
+              urlsCrawled.scanned.push({ url: request.url, pageTitle: results.pageTitle });
             }
-
-            urlsCrawled.scanned.push({
-              url: request.url,
-              pageTitle: results.pageTitle,
-              actualUrl: request.loadedUrl, // i.e. actualUrl
-            });
-
-            urlsCrawled.scannedRedirects.push({
-              fromUrl: request.url,
-              toUrl: request.loadedUrl, // i.e. actualUrl
-            });
-
-            results.url = request.url;
-            results.actualUrl = request.loadedUrl;
+            await dataset.pushData(results);
           } else {
-            urlsCrawled.scanned.push({ url: request.url, pageTitle: results.pageTitle });
+            guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+              numScanned: urlsCrawled.scanned.length,
+              urlScanned: request.url,
+            });
+            urlsCrawled.blacklisted.push(request.url);
           }
-          await dataset.pushData(results);
-        } else {
-          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
-            numScanned: urlsCrawled.scanned.length,
-            urlScanned: request.url,
-          });
-          urlsCrawled.blacklisted.push(request.url);
-        }
-        await enqueueProcess(page, enqueueLinks, enqueueLinksByClickingElements);
+          await enqueueProcess(page, enqueueLinks, enqueueLinksByClickingElements);
+        }
+      } catch (e) {
+        silentLogger.info(e);
+        guiInfoLog(guiInfoStatusTypes.ERROR, {
+          numScanned: urlsCrawled.scanned.length,
+          urlScanned: request.url,
+        });
+        urlsCrawled.error.push({ url: request.url });
       }
     },
     failedRequestHandler: async ({ request }) => {