From 2c917ca57ad7112137b306e56e637fbe4b09686d Mon Sep 17 00:00:00 2001 From: Ang Yong <89739997+angyonghaseyo@users.noreply.github.com> Date: Wed, 22 May 2024 15:18:39 +0800 Subject: [PATCH] connections and loading issue on specific sites (#335) * Test sites watsons.com.sg and guardian.com.sg * Update playwright to 1.44.0 and bump package --------- Co-authored-by: younglim --- constants/common.js | 3 +++ crawlers/crawlDomain.js | 2 +- package-lock.json | 20 ++++++++++---------- package.json | 4 ++-- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/constants/common.js b/constants/common.js index f8e89f92..94cd7591 100644 --- a/constants/common.js +++ b/constants/common.js @@ -141,6 +141,7 @@ const document = new JSDOM('').window; const httpsAgent = new https.Agent({ // Run in environments with custom certificates rejectUnauthorized: false, + keepAlive: true, }); export const messageOptions = { @@ -684,6 +685,7 @@ const getRobotsTxtViaAxios = async (robotsUrl) => { const instance = axios.create({ httpsAgent: new https.Agent({ rejectUnauthorized: false, + keepAlive: true, }), }); @@ -855,6 +857,7 @@ export const getLinksFromSitemap = async ( const instance = axios.create({ httpsAgent: new https.Agent({ rejectUnauthorized: false, + keepAlive: true, }), }); data = await (await instance.get(url, { timeout: 80000 })).data; diff --git a/crawlers/crawlDomain.js b/crawlers/crawlDomain.js index 7e6055ad..90606d53 100644 --- a/crawlers/crawlDomain.js +++ b/crawlers/crawlDomain.js @@ -315,7 +315,7 @@ const crawlDomain = async ( } // Ensure page navigation completes to capture final URL in a redirect chain - await page.goto(request.url, { waitUntil: 'networkidle' }); + await page.goto(request.url, { waitUntil: 'load' }); let finalUrl = page.url(); // Initialize with the request URL diff --git a/package-lock.json b/package-lock.json index 750a1c48..3ee71da0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@govtechsg/purple-hats", - "version": "0.9.52", + "version": "0.9.53", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@govtechsg/purple-hats", - "version": "0.9.52", + "version": "0.9.53", "license": "MIT", "dependencies": { "@json2csv/node": "^7.0.3", @@ -22,7 +22,7 @@ "lodash": "^4.17.21", "minimatch": "^9.0.3", "pdfjs-dist": "github:veraPDF/pdfjs-dist#v2.14.305-taggedPdf-0.1.11", - "playwright": "1.42.1", + "playwright": "1.44.0", "prettier": "^3.1.0", "print-message": "^3.0.1", "safe-regex": "^2.1.1", @@ -7562,11 +7562,11 @@ } }, "node_modules/playwright": { - "version": "1.42.1", - "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.42.1.tgz", - "integrity": "sha512-PgwB03s2DZBcNRoW+1w9E+VkLBxweib6KTXM0M3tkiT4jVxKSi6PmVJ591J+0u10LUrgxB7dLRbiJqO5s2QPMg==", + "version": "1.44.0", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.44.0.tgz", + "integrity": "sha512-F9b3GUCLQ3Nffrfb6dunPOkE5Mh68tR7zN32L4jCk4FjQamgesGay7/dAAe1WaMEGV04DkdJfcJzjoCKygUaRQ==", "dependencies": { - "playwright-core": "1.42.1" + "playwright-core": "1.44.0" }, "bin": { "playwright": "cli.js" @@ -7579,9 +7579,9 @@ } }, "node_modules/playwright-core": { - "version": "1.42.1", - "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.42.1.tgz", - "integrity": "sha512-mxz6zclokgrke9p1vtdy/COWBH+eOZgYUVVU34C73M+4j4HLlQJHtfcqiqqxpP0o8HhMkflvfbquLX5dg6wlfA==", + "version": "1.44.0", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.44.0.tgz", + "integrity": "sha512-ZTbkNpFfYcGWohvTTl+xewITm7EOuqIqex0c7dNZ+aXsbrLj0qI8XlGKfPpipjm0Wny/4Lt4CJsWJk1stVS5qQ==", "bin": { "playwright-core": "cli.js" }, diff --git a/package.json b/package.json index 8ccf1d9b..e9b7616c 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@govtechsg/purple-hats", "main": "npmIndex.js", - "version": "0.9.52", + "version": "0.9.53", "type": "module", "imports": { "#root/*.js": "./*.js" @@ -20,7 +20,7 @@ "lodash": "^4.17.21", "minimatch": "^9.0.3", "pdfjs-dist": "github:veraPDF/pdfjs-dist#v2.14.305-taggedPdf-0.1.11", - "playwright": "1.42.1", + "playwright": "1.44.0", "prettier": "^3.1.0", "print-message": "^3.0.1", "safe-regex": "^2.1.1",