-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmodule_websearch.py
executable file
·190 lines (156 loc) · 5.73 KB
/
module_websearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
"""
module_websearch.py
Web Search Module for TARS-AI Application.
This module provides functionality for performing web searches using Selenium WebDriver.
It supports multiple search engines and allows for extracting specific content, links,
and structured data from search results.
"""
# === Standard Libraries ===
import os
import sys
from contextlib import contextmanager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import atexit
from datetime import datetime
# === Helper Functions ===
# Silence logs to suppress unnecessary outputs
@contextmanager
def silence_log():
"""
Context manager to suppress unnecessary console logs during driver initialization.
"""
old_stdout, old_stderr = sys.stdout, sys.stderr
try:
with open(os.devnull, "w") as devnull:
sys.stdout, sys.stderr = devnull, devnull
yield
finally:
sys.stdout, sys.stderr = old_stdout, old_stderr
def initialize_driver():
"""
Initialize the Selenium WebDriver using Chromium.
Returns:
- WebDriver: Configured Selenium WebDriver instance.
"""
options = ChromeOptions()
options.add_argument("--headless") # Run in headless mode
options.add_argument("--no-sandbox") # Bypass OS security model
options.add_argument("--disable-dev-shm-usage") # Overcome limited resource problems
options.add_argument("--disable-infobars")
options.add_argument("--lang=en-GB")
service = ChromeService(executable_path="/usr/bin/chromedriver") # Path to the chromedriver
return webdriver.Chrome(service=service, options=options)
def quit_driver():
"""
Quit the WebDriver instance when the script ends.
"""
if driver:
driver.quit()
def save_debug():
"""
Save the current page source for debugging purposes.
"""
with open("engine/debug.html", "w", encoding='utf-8') as f:
f.write(driver.page_source)
def wait_for_element(element_id: str, delay: int = 10):
"""Wait for an element with a specific ID to be present."""
try:
WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.ID, element_id)))
except Exception:
print(f"ERROR: Element with ID '{element_id}' not found.")
def extract_text(selector):
"""
Extract text content from elements matching the specified CSS selector.
Parameters:
- selector (str): The CSS selector for the elements.
Returns:
- str: Concatenated text content of matched elements.
"""
return '\n'.join(el.text for el in driver.find_elements(By.CSS_SELECTOR, selector) if el and el.text).strip()
def extract_links(selector):
"""
Extract hyperlinks from elements matching the specified CSS selector.
Parameters:
- selector (str): The CSS selector for the elements.
Returns:
- list: List of extracted hyperlinks.
"""
return [el.get_attribute('href') for el in driver.find_elements(By.CSS_SELECTOR, selector) if el and el.text]
# === Search Functions ===
def search_query(url, query, content_selector, link_selector=None):
"""
Perform a web search and extract content and links.
Parameters:
- url (str): Base search URL.
- query (str): Search query.
- content_selector (str): CSS selector for text content.
- link_selector (str, optional): CSS selector for links.
Returns:
- tuple: Extracted text content and links (if applicable).
"""
driver.get(url + query)
wait_for_element('res') # Wait for page to load
content = extract_text(content_selector)
links = extract_links(link_selector) if link_selector else []
return content, links
def search_google(query):
"""
Perform a Google search and extract featured snippets, knowledge panels, and snippets.
Parameters:
- query (str): The search query.
Returns:
- tuple: Extracted content and links.
"""
print(f"INFO: Searching Google for: {query}")
driver.get("https://google.com/search?hl=en&q=" + query)
wait_for_element('res')
save_debug()
text = ""
# Featured snippets
text += extract_text('.wDYxhc')
print(f"INFO: Featured snippets: {text}")
# Knowledge panels
text += extract_text('.hgKElc')
print(f"INFO: Knowledge panels: {text}")
# Page snippets
text += extract_text('.r025kc.lVm3ye')
print(f"INFO: Page snippets: {text}")
# Additional selectors for compatibility
text += extract_text('.yDYNvb.lyLwlc')
return text
def search_google_news(query):
"""
Perform a search on Google News and extract news snippets.
Parameters:
- query (str): The search query.
Returns:
- tuple: Extracted content and links.
"""
print(f"INFO: Fetching Google News for: {query}")
return search_query(
"https://google.com/search?hl=en&gl=us&tbm=nws&q=",
query,
".dURPMd"
)
def search_duckduckgo(query):
"""
Perform a search on DuckDuckGo and extract results.
Parameters:
- query (str): The search query.
Returns:
- tuple: Extracted content and links.
"""
print(f"INFO: Searching DuckDuckGo for: {query}")
return search_query(
"https://duckduckgo.com/?kp=-2&kl=wt-wt&q=",
query,
'[data-result="snippet"]'
)
# === Initialize and Cleanup ===
driver = initialize_driver()
atexit.register(quit_driver)