From 6cb3c5cf57122f6877319e605d90556d885b427c Mon Sep 17 00:00:00 2001 From: Nick Hale <4175918+njhale@users.noreply.github.com> Date: Thu, 23 Jan 2025 19:37:27 -0500 Subject: [PATCH] enhance: enable js for google search tool Enable JavaScript when gathering page content in the `Google Search` tool. This allows more content to be extracted from web pages that require JavaScript to load properly. Signed-off-by: Nick Hale <4175918+njhale@users.noreply.github.com> --- google/search/src/search.ts | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/google/search/src/search.ts b/google/search/src/search.ts index 7ddcb417..42e50809 100644 --- a/google/search/src/search.ts +++ b/google/search/src/search.ts @@ -24,23 +24,13 @@ export async function search ( const encodedQuery = encodeURIComponent(query) const searchUrl = `https://www.google.com/search?q=${encodedQuery}&udm=14` - + const foundURLs = new Set() const results: Array> = [] const page = await context.newPage() - const noJSPages = await Promise.all( - Array.from({ length: maxResults }, async () => { - const page = await context.newPage() - await page.addInitScript(() => { - // Disable JavaScript for the page - Object.defineProperty(navigator, 'javaScriptEnabled', { value: false }) - Object.defineProperty(window, 'Function', { value: () => { } }) - Object.defineProperty(window, 'eval', { value: () => { } }) - }) - - return page - }) + const pages = await Promise.all( + Array.from({ length: maxResults }, () => context.newPage()) ) try { @@ -55,7 +45,7 @@ export async function search ( const url = $(element).attr('href') ?? '' if ((url !== '') && !url.includes('youtube.com/watch?v') && !foundURLs.has(url)) { foundURLs.add(url) - results.push(getMarkdown(noJSPages[results.length], url).then(content => { + results.push(getMarkdown(pages[results.length], url).then(content => { return (content !== '') ? 
{ url, content } : null })) } @@ -68,13 +58,14 @@ export async function search ( } finally { // Fire and forget page close so we can move on void page.close() - void Promise.all(noJSPages.map(async p => { await p.close() })) + void Promise.all(pages.map(async p => { await p.close() })) } } export async function getMarkdown (page: Page, url: string): Promise { try { await page.goto(url, { timeout: 1000 }) + await page.waitForLoadState('networkidle', { timeout: 1000 }) } catch (e) { console.warn('slow page:', url) } @@ -83,7 +74,7 @@ export async function getMarkdown (page: Page, url: string): Promise { while (content === '') { let fails = 0 try { - content = await page.content() + content = await page.evaluate(() => document.documentElement.outerHTML) } catch (e) { fails++ if (fails > 2) { @@ -116,6 +107,7 @@ export async function getMarkdown (page: Page, url: string): Promise { continue; } + $(selector).each(function () { resp += turndownService.turndown($.html(this)) })