Skip to content

Commit

Permalink
enhance: enable js for google search tool
Browse files Browse the repository at this point in the history
Enable JavaScript when gathering page content in the `Google Search`
tool. This allows more content to be extracted from web pages that
require JavaScript to load properly.

Signed-off-by: Nick Hale <[email protected]>
  • Loading branch information
njhale committed Jan 24, 2025
1 parent af5a039 commit c71af5e
Showing 1 changed file with 7 additions and 12 deletions.
19 changes: 7 additions & 12 deletions google/search/src/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,14 @@ export async function search (

const encodedQuery = encodeURIComponent(query)
const searchUrl = `https://www.google.com/search?q=${encodedQuery}&udm=14`

const foundURLs = new Set<string>()
const results: Array<Promise<SearchResult | null>> = []

const page = await context.newPage()
const noJSPages = await Promise.all(
const pages = await Promise.all(
Array.from({ length: maxResults }, async () => {
const page = await context.newPage()
await page.addInitScript(() => {
// Disable JavaScript for the page
Object.defineProperty(navigator, 'javaScriptEnabled', { value: false })
Object.defineProperty(window, 'Function', { value: () => { } })
Object.defineProperty(window, 'eval', { value: () => { } })
})

return page
})
)
Expand All @@ -55,7 +48,7 @@ export async function search (
const url = $(element).attr('href') ?? ''
if ((url !== '') && !url.includes('youtube.com/watch?v') && !foundURLs.has(url)) {
foundURLs.add(url)
results.push(getMarkdown(noJSPages[results.length], url).then(content => {
results.push(getMarkdown(pages[results.length], url).then(content => {
return (content !== '') ? { url, content } : null
}))
}
Expand All @@ -68,13 +61,14 @@ export async function search (
} finally {
// Fire and forget page close so we can move on
void page.close()
void Promise.all(noJSPages.map(async p => { await p.close() }))
void Promise.all(pages.map(async p => { await p.close() }))
}
}

export async function getMarkdown (page: Page, url: string): Promise<string> {
try {
await page.goto(url, { timeout: 1000 })
await page.waitForLoadState('networkidle', { timeout: 1000 })
} catch (e) {
console.warn('slow page:', url)
}
Expand All @@ -83,7 +77,7 @@ export async function getMarkdown (page: Page, url: string): Promise<string> {
while (content === '') {
let fails = 0
try {
content = await page.content()
content = await page.evaluate(() => document.documentElement.outerHTML)
} catch (e) {
fails++
if (fails > 2) {
Expand Down Expand Up @@ -116,6 +110,7 @@ export async function getMarkdown (page: Page, url: string): Promise<string> {
continue;
}


$(selector).each(function () {
resp += turndownService.turndown($.html(this))
})
Expand Down

0 comments on commit c71af5e

Please sign in to comment.