final-crawl.py
import asyncio
import sqlite3
from urllib.parse import urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup
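
# A small breadth-first web crawler: starting from a list of seed URLs, it
# downloads each page with aiohttp, extracts the visible text, title, and
# outgoing links with BeautifulSoup, stores the results in a local SQLite
# database (crawled_urls.db), and then follows the newly discovered links
# level by level up to a configurable depth (max_height).
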
async def fetch(session, url):
    """Fetch a URL and return its body as text, or None on any error."""
    try:
        # ssl=False skips certificate verification so self-signed or
        # misconfigured sites do not abort the crawl.
        async with session.get(url, ssl=False) as response:
            response.raise_for_status()
            return await response.text()
    except Exception as e:
        print(f"Error fetching URL {url}: {e}")
        return None

async def extract_domain(url):
    """Return the scheme-and-host base of a URL, e.g. 'https://example.com'."""
    try:
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}"
    except Exception as e:
        print(f"Error parsing URL: {e}")
        return None

async def extract_urls_from_page(session, url):
    """Fetch a page and return the set of URLs linked from it."""
    try:
        content = await fetch(session, url)
        if not content:
            return set()
        soup = BeautifulSoup(content, 'html.parser')
        extracted_urls = set()
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href or href.startswith("#"):
                continue  # skip empty links and same-page fragments
            if href.startswith("/"):
                # Root-relative link: prepend the page's scheme and host.
                domain = await extract_domain(url)
                href = domain + href
            # Normalize: keep scheme, host, and path; drop queries and fragments.
            href = urljoin(href, urlparse(href).path)
            extracted_urls.add(href)
        return extracted_urls
    except Exception as e:
        print(f"Error extracting URLs from {url}: {e}")
        return set()

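# For example, on a page at https://example.com/articles/intro (a hypothetical
# URL), an <a href="/about#team"> link would be normalized to
# https://example.com/about, while a bare fragment link like "#top" is skipped.
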
async def extract_content(session, url):
    """Fetch a page and return its visible text with scripts, styles,
    and surplus whitespace removed."""
    try:
        content = await fetch(session, url)
        if not content:
            return ''
        soup = BeautifulSoup(content, "html.parser")
        # Drop script and style tags so only human-readable text remains.
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        # Collapse blank lines and runs of whitespace into single spaces.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        return " ".join(chunk for chunk in chunks if chunk)
    except Exception as e:
        print(f"Error extracting content from {url}: {e}")
        return ''

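# For example, extract_content would reduce a body whose text renders as
# "  Hello \n\n   world  " to the single line "Hello world".
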
async def get_page_title(session, url):
    """Fetch a page and return its <title> text, or a fallback string."""
    try:
        content = await fetch(session, url)
        if not content:
            return "Title Not Found"
        soup = BeautifulSoup(content, 'html.parser')
        # Guard against pages with no <title> element or an empty one.
        if soup.title and soup.title.string:
            return soup.title.string
        return "Title Not Found"
    except Exception as e:
        print(f"Error fetching URL {url}: {e}")
        return "Title Not Found"

async def crawl_task(session, url, visited_urls, c):
    """Crawl one page: store its text and title, then return its outgoing links.

    Note: extract_content, get_page_title, and extract_urls_from_page each
    fetch the page independently, so every page is downloaded three times.
    """
    visited_urls.add(url)
    content = await extract_content(session, url)
    title = await get_page_title(session, url)
    c.execute(
        "INSERT OR IGNORE INTO url_data (url, content, title) VALUES (?, ?, ?)",
        (url, content, title))
    new_urls = await extract_urls_from_page(session, url)
    return new_urls

async def crawl_urls(urls, max_height):
    """Breadth-first crawl: process every URL at one depth before going deeper."""
    conn = sqlite3.connect('crawled_urls.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS url_data
                 (id INTEGER PRIMARY KEY AUTOINCREMENT,
                  url TEXT UNIQUE, content TEXT, title TEXT)''')
    visited_urls = set()
    height = 1
    async with aiohttp.ClientSession() as session:
        while urls:
            total_urls = len(urls)
            tasks = []
            for _ in range(total_urls):
                current_url = urls.pop(0)
                if current_url not in visited_urls:
                    # Mark as visited at scheduling time so duplicate URLs
                    # within the same level are not crawled twice.
                    visited_urls.add(current_url)
                    tasks.append(asyncio.ensure_future(
                        crawl_task(session, current_url, visited_urls, c)))
            new_urls = await asyncio.gather(*tasks)
            urls.extend([url for sublist in new_urls for url in sublist])
            print(f"Total urls at height {height}: {total_urls}")
            height += 1
            if height == max_height:
                print(f"Crawler Stopped At Height {height}")
                break
    conn.commit()
    conn.close()

if __name__ == "__main__":
    max_height = 3
    seed_urls = ["https://www.geeksforgeeks.org/dsa-tutorial-learn-data-structures-and-algorithms/"]
    asyncio.run(crawl_urls(seed_urls, max_height))
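
# A minimal sketch of inspecting the crawl results afterwards (assumes the
# crawler above has run and created crawled_urls.db in the working directory):
#
#     import sqlite3
#     conn = sqlite3.connect('crawled_urls.db')
#     for url, title in conn.execute("SELECT url, title FROM url_data"):
#         print(title, "->", url)
#     conn.close()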