From 5a2a56555165bbdc358135f18c46a31f35620c8f Mon Sep 17 00:00:00 2001
From: kaiba
Date: Mon, 19 Feb 2024 16:04:56 +0900
Subject: [PATCH] Fix #147 Set `purgeOnStart: true` to process multiple sites
 as a server

---
 CHANGELOG.md |   6 +--
 src/core.ts  | 150 ++++++++++++++++++++++++++-------------------------
 2 files changed, 80 insertions(+), 76 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 59c8e32..388b12c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,14 +1,12 @@
 # [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15)
 
-
 ### Bug Fixes
 
-* linting ([0f4e58b](https://github.com/BuilderIO/gpt-crawler/commit/0f4e58b400eab312e7b595d7a2472bae93055415))
-
+- linting ([0f4e58b](https://github.com/BuilderIO/gpt-crawler/commit/0f4e58b400eab312e7b595d7a2472bae93055415))
 
 ### Features
 
-* add server api readme docs ([717e625](https://github.com/BuilderIO/gpt-crawler/commit/717e625f47257bdbd96437acb7242bcd28c233ba))
+- add server api readme docs ([717e625](https://github.com/BuilderIO/gpt-crawler/commit/717e625f47257bdbd96437acb7242bcd28c233ba))
 
 # [1.3.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.2.1...v1.3.0) (2024-01-06)
 
diff --git a/src/core.ts b/src/core.ts
index 48c56c6..c996f2b 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -1,5 +1,5 @@
 // For more information, see https://crawlee.dev/
-import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
+import { Configuration, PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { Config, configSchema } from "./config.js";
@@ -54,83 +54,89 @@ export async function crawl(config: Config) {
   if (process.env.NO_CRAWL !== "true") {
     // PlaywrightCrawler crawls the web using a headless
     // browser controlled by the Playwright library.
-    crawler = new PlaywrightCrawler({
-      // Use the requestHandler to process each of the crawled pages.
-      async requestHandler({ request, page, enqueueLinks, log, pushData }) {
-        const title = await page.title();
-        pageCounter++;
-        log.info(
-          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
-        );
+    crawler = new PlaywrightCrawler(
+      {
+        // Use the requestHandler to process each of the crawled pages.
+        async requestHandler({ request, page, enqueueLinks, log, pushData }) {
+          const title = await page.title();
+          pageCounter++;
+          log.info(
+            `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
+          );
 
-        // Use custom handling for XPath selector
-        if (config.selector) {
-          if (config.selector.startsWith("/")) {
-            await waitForXPath(
-              page,
-              config.selector,
-              config.waitForSelectorTimeout ?? 1000,
-            );
-          } else {
-            await page.waitForSelector(config.selector, {
-              timeout: config.waitForSelectorTimeout ?? 1000,
-            });
-          }
-        }
+          // Use custom handling for XPath selector
+          if (config.selector) {
+            if (config.selector.startsWith("/")) {
+              await waitForXPath(
+                page,
+                config.selector,
+                config.waitForSelectorTimeout ?? 1000,
+              );
+            } else {
+              await page.waitForSelector(config.selector, {
+                timeout: config.waitForSelectorTimeout ?? 1000,
+              });
+            }
+          }
 
-        const html = await getPageHtml(page, config.selector);
+          const html = await getPageHtml(page, config.selector);
 
-        // Save results as JSON to ./storage/datasets/default
-        await pushData({ title, url: request.loadedUrl, html });
+          // Save results as JSON to ./storage/datasets/default
+          await pushData({ title, url: request.loadedUrl, html });
 
-        if (config.onVisitPage) {
-          await config.onVisitPage({ page, pushData });
-        }
+          if (config.onVisitPage) {
+            await config.onVisitPage({ page, pushData });
+          }
 
-        // Extract links from the current page
-        // and add them to the crawling queue.
-        await enqueueLinks({
-          globs:
-            typeof config.match === "string" ? [config.match] : config.match,
-          exclude:
-            typeof config.exclude === "string"
-              ? [config.exclude]
-              : config.exclude ?? [],
-        });
-      },
-      // Comment this option to scrape the full website.
-      maxRequestsPerCrawl: config.maxPagesToCrawl,
-      // Uncomment this option to see the browser window.
-      // headless: false,
-      preNavigationHooks: [
-        // Abort requests for certain resource types
-        async ({ request, page, log }) => {
-          // If there are no resource exclusions, return
-          const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
-          if (RESOURCE_EXCLUSTIONS.length === 0) {
-            return;
-          }
-          if (config.cookie) {
-            const cookies = (
-              Array.isArray(config.cookie) ? config.cookie : [config.cookie]
-            ).map((cookie) => {
-              return {
-                name: cookie.name,
-                value: cookie.value,
-                url: request.loadedUrl,
-              };
-            });
-            await page.context().addCookies(cookies);
-          }
-          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) =>
-            route.abort("aborted"),
-          );
-          log.info(
-            `Aborting requests for as this is a resource excluded route`,
-          );
-        },
-      ],
-    });
+          // Extract links from the current page
+          // and add them to the crawling queue.
+          await enqueueLinks({
+            globs:
+              typeof config.match === "string" ? [config.match] : config.match,
+            exclude:
+              typeof config.exclude === "string"
+                ? [config.exclude]
+                : config.exclude ?? [],
+          });
+        },
+        // Comment this option to scrape the full website.
+        maxRequestsPerCrawl: config.maxPagesToCrawl,
+        // Uncomment this option to see the browser window.
+        // headless: false,
+        preNavigationHooks: [
+          // Abort requests for certain resource types
+          async ({ request, page, log }) => {
+            // If there are no resource exclusions, return
+            const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
+            if (RESOURCE_EXCLUSTIONS.length === 0) {
+              return;
+            }
+            if (config.cookie) {
+              const cookies = (
+                Array.isArray(config.cookie) ? config.cookie : [config.cookie]
+              ).map((cookie) => {
+                return {
+                  name: cookie.name,
+                  value: cookie.value,
+                  url: request.loadedUrl,
+                };
+              });
+              await page.context().addCookies(cookies);
+            }
+            await page.route(
+              `**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`,
+              (route) => route.abort("aborted"),
+            );
+            log.info(
+              `Aborting requests for as this is a resource excluded route`,
+            );
+          },
+        ],
+      },
+      new Configuration({
+        purgeOnStart: true,
+      }),
+    );
 
   const isUrlASitemap = /sitemap.*\.xml$/.test(config.url);
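
A minimal usage sketch of what this change enables, not part of the patch itself: it assumes the `crawl(config)` export shown in the hunk header, and the two site configs (URLs, match globs, output file names) are hypothetical. Passing a `Configuration` with `purgeOnStart: true` to the `PlaywrightCrawler` constructor makes Crawlee purge its default storages (request queue, dataset) when each crawler run starts, so a long-running server process can crawl one site after another without the second run resuming, or skipping, URLs left over from the first.

    // Hypothetical driver: crawl two different sites from one process.
    import { crawl } from "./core.js";

    const sites = [
      {
        url: "https://www.builder.io/c/docs/developers",
        match: "https://www.builder.io/c/docs/**",
        maxPagesToCrawl: 50,
        outputFileName: "output-builder.json",
      },
      {
        url: "https://crawlee.dev/docs/quick-start",
        match: "https://crawlee.dev/docs/**",
        maxPagesToCrawl: 50,
        outputFileName: "output-crawlee.json",
      },
    ];

    for (const config of sites) {
      // With purgeOnStart: true, each run starts from a purged default
      // request queue instead of the previous site's leftover state.
      await crawl(config);
    }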