add websearch tool
aiXander committed Jan 4, 2025
1 parent 28f404a commit d7bde28
Showing 4 changed files with 254 additions and 0 deletions.
13 changes: 13 additions & 0 deletions eve/tools/websearch/api.yaml
@@ -0,0 +1,13 @@
name: Web Search
description: Search and extract content from any webpage
cost_estimate: 1
output_type: string
status: prod
visible: true
parameters:
  url:
    type: string
    label: URL
    description: The webpage URL to analyze (must start with http:// or https://)
    required: true
    pattern: ^https?://.*
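
Note that the handler below also reads optional max_links and max_chars from args, but they are not declared in this schema. If they were meant to be user-facing, they could be added in the same style; this is a sketch only, not part of this commit, and it assumes the schema accepts integer types and a default field:

  # Hypothetical additions, not in this commit; assumes integer/default are supported
  max_links:
    type: integer
    label: Max links
    description: Maximum number of links to extract
    required: false
    default: 15
  max_chars:
    type: integer
    label: Max characters
    description: Maximum characters of page text to include in the summary
    required: false
    default: 2000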
180 changes: 180 additions & 0 deletions eve/tools/websearch/handler.py
@@ -0,0 +1,180 @@
import json
from typing import Any, Dict, Optional, Tuple

from playwright.async_api import async_playwright


async def safe_evaluate(page, script: str, default_value: Any) -> Tuple[Any, str]:
    """
    Safely evaluate JavaScript on the page with error handling.
    Returns a (result, error_message) tuple; error_message is empty on success.
    """
    try:
        result = await page.evaluate(script)
        return result, ""
    except Exception as e:
        return default_value, str(e)


async def handler(args: dict, env: Optional[str] = None) -> Dict[str, str]:
    """
    Handler for the websearch tool that scrapes content from a specified URL.

    Args:
        args (dict): Dictionary containing:
            - url (str): Required. The URL to scrape.
            - max_links (int): Optional. Maximum number of links to extract (default: 15).
            - max_chars (int): Optional. Maximum number of characters to include in the content summary (default: 2000).
        env (str): Optional environment parameter.

    Returns:
        Dict[str, str]: Dictionary containing the scraped content and any errors.
    """
    url = args.get('url')
    if not url:
        raise ValueError("URL parameter is required")

    # Get configurable limits from args, with defaults
    max_links = int(args.get('max_links', 15))
    max_chars = int(args.get('max_chars', 2000))

    page_content = {
        "title": "",
        "text": "",
        "links": []
    }

    errors = []

    try:
        async with async_playwright() as p:
            # Launch the browser with sandbox-friendly options
            browser = await p.chromium.launch(
                headless=True,
                args=['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage']
            )

            # Create a new page (timeouts are set per operation, not at page creation)
            context = await browser.new_context(
                viewport={'width': 1280, 'height': 800}
            )
            page = await context.new_page()

            # Set the default timeout for all operations
            page.set_default_timeout(30000)

            # Navigate to the URL, retrying up to three times
            response = None
            for attempt in range(3):
                try:
                    response = await page.goto(
                        url,
                        wait_until='domcontentloaded',
                        timeout=10000
                    )
                    if response and response.ok:
                        break
                except Exception as e:
                    if attempt == 2:
                        return {"output": f"Error loading page after 3 attempts: {str(e)}"}
                    continue
            if response and not response.ok:
                errors.append(f"Page responded with HTTP status {response.status}")

            # Extract the page title (with fallback)
            try:
                page_content["title"] = await page.title()
            except Exception as e:
                page_content["title"] = "Title extraction failed"
                errors.append(f"Title error: {str(e)}")

            # Extract visible text: prefer structural elements, fall back to a broad sweep
            text_extraction_script = """
            () => {
                try {
                    let text = Array.from(document.body.querySelectorAll('p, h1, h2, h3, h4, h5, h6, article, section, main'))
                        .map(element => element.textContent.trim())
                        .filter(text => text.length > 0)
                        .join('\\n\\n');
                    if (text.length < 100) {
                        text = Array.from(document.body.getElementsByTagName('*'))
                            .map(element => element.textContent.trim())
                            .filter(text => text.length > 20)
                            .join('\\n\\n');
                    }
                    return text;
                } catch (error) {
                    return '';
                }
            }
            """

            text_content, text_error = await safe_evaluate(page, text_extraction_script, "")
            if text_error:
                errors.append(f"Text extraction error: {text_error}")
            page_content["text"] = text_content if text_content else "No text content could be extracted"

            # Extract links: prefer document.links, fall back to raw <a> tags
            links_script = """
            () => {
                try {
                    let links = Array.from(document.links)
                        .map(link => ({
                            text: link.textContent.trim(),
                            href: link.href
                        }))
                        .filter(link => link.text && link.href.startsWith('http'));
                    if (links.length === 0) {
                        links = Array.from(document.getElementsByTagName('a'))
                            .map(a => ({
                                text: a.textContent.trim(),
                                href: a.getAttribute('href')
                            }))
                            .filter(link => link.text && link.href && link.href.startsWith('http'));
                    }
                    return links;
                } catch (error) {
                    return [];
                }
            }
            """

            links, links_error = await safe_evaluate(page, links_script, [])
            if links_error:
                errors.append(f"Links extraction error: {links_error}")
            page_content["links"] = links[:max_links] if links else []

            # Close the browser
            await browser.close()

        # Format the output, reporting any extraction issues
        output = f"""
# Page Analysis: {url}
## Title
{page_content['title']}
## Content Summary
{page_content['text'][:max_chars]}{'...' if len(page_content['text']) > max_chars else ''}
## Top Links
"""
        if page_content["links"]:
            for link in page_content["links"]:
                output += f"- [{link['text']}]({link['href']})\n"
        else:
            output += "No links were extracted\n"

        if errors:
            output += "\n## Extraction Issues\n"
            output += "Some content may be incomplete due to the following issues:\n"
            for error in errors:
                output += f"- {error}\n"

        return {
            "output": output
        }

    except Exception as e:
        return {
            "output": f"Critical error processing webpage: {str(e)}\n\nPartial content (if any):\n{json.dumps(page_content, indent=2)}"
        }
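
For quick reference, a minimal sketch of invoking this handler directly; the import path follows the file listing above, and it assumes Playwright's Chromium build is installed:

import asyncio

# import path taken from the diff above; adjust if the package layout differs
from eve.tools.websearch.handler import handler

async def main():
    # max_links and max_chars are optional; the handler defaults to 15 and 2000
    result = await handler({"url": "https://example.com", "max_links": 5})
    print(result["output"])

asyncio.run(main())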
3 changes: 3 additions & 0 deletions eve/tools/websearch/test.json
@@ -0,0 +1,3 @@
{
  "url": "https://news.ycombinator.com"
}
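
Since the handler also accepts max_links and max_chars, a test payload that exercises them might look like this (values are illustrative, not part of this commit):

{
  "url": "https://news.ycombinator.com",
  "max_links": 10,
  "max_chars": 1000
}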
58 changes: 58 additions & 0 deletions eve/tools/websearch/test_locally.py
@@ -0,0 +1,58 @@
import asyncio
import json
import os
from datetime import datetime

from handler import handler


async def test_website(url: str) -> None:
    """Test the handler with a single website and print the results."""
    print(f"\n{'='*80}")
    print(f"Testing URL: {url}")
    print(f"{'='*80}")

    try:
        result = await handler({"url": url})
        print(result["output"])
    except Exception as e:
        print(f"Error testing {url}: {str(e)}")


async def run_tests():
    """Run tests on various types of websites and save the results to disk."""
    results_dir = "websearch_results"
    os.makedirs(results_dir, exist_ok=True)

    test_urls = [
        "https://news.ycombinator.com",
        "https://huggingface.co/papers",
        "https://www.reddit.com/r/StableDiffusion/",
    ]

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results = {}

    for url in test_urls:
        try:
            result = await handler({"url": url})
            results[url] = {
                "status": "success",
                "output": result["output"]
            }
        except Exception as e:
            results[url] = {
                "status": "error",
                "error": str(e)
            }

        # Print the full output for this URL, then pause briefly between requests
        await test_website(url)
        print("\nWaiting 2 seconds before next test...")
        await asyncio.sleep(2)

    output_file = os.path.join(results_dir, f"websearch_test_{timestamp}.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    print(f"\nTest results saved to: {output_file}")


if __name__ == "__main__":
    print("Starting websearch tool tests...")
    asyncio.run(run_tests())
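
Assuming a standard Playwright setup, the local tests can be run with something like:

pip install playwright
playwright install chromium
cd eve/tools/websearch && python test_locally.py  # handler is imported relative to this directory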
