-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.py
76 lines (56 loc) · 2.13 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import gevent
from gevent import pool
from gevent import monkey
monkey.patch_all()
import requests
from warcio.archiveiterator import ArchiveIterator
import zmq
import ujson
from bs4 import BeautifulSoup
import processor
import operator
# ZeroMQ context from which the publisher socket below is created.
context = zmq.Context()
# PUB socket: print_records() and the script tail send their messages on it.
# NOTE(review): this module-level name shadows the stdlib `socket` module.
socket = context.socket(zmq.PUB)
# Bind on all interfaces, TCP port 5555.
socket.bind("tcp://0.0.0.0:5555")
# Shared gevent pool (constructed with size 100) used to spawn crawl tasks.
POOL = pool.Pool(100)
def word_count(string):
    """Count how often each whitespace-separated word occurs in *string*.

    Args:
        string: Text to tokenize with ``str.split()``; any run of whitespace
            separates words. No case folding or punctuation stripping is
            applied.

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences,
        with keys in order of first appearance. Empty or whitespace-only
        input yields ``{}``.
    """
    # Function-scope import so the module-level import block stays untouched.
    from collections import Counter

    # Counter preserves first-seen key order, so converting back to dict
    # gives the same mapping (and ordering) as a manual counting loop.
    return dict(Counter(string.split()))
def print_records(url):
    """Stream one WARC archive and publish a summary message per HTML page.

    Downloads *url*, iterates its WARC records and, for every ``response``
    record whose HTTP Content-Type is ``text/html``, does the following:

    * parses the body with BeautifulSoup (decoded as UTF-8),
    * extracts text via ``processor.process`` and counts words,
    * keeps the three most frequent words as keywords,
    * collects the ``href`` of every ``<a>`` tag as outlinks,
    * sends a UTF-8 encoded JSON object with the keys ``Node``,
      ``Keywords``, ``Outlinks`` and ``Score`` on the module-level ``socket``.

    Args:
        url: Location of the WARC file. Surrounding whitespace (for example
            a trailing newline from a manifest file) is stripped.

    Returns:
        None. Records that are not HTML responses, or whose body cannot be
        read, decoded or parsed, are skipped silently.
    """
    url = url.strip()
    # The context manager releases the streamed connection when iteration
    # finishes or fails; with stream=True it is otherwise held open.
    with requests.get(url, stream=True) as resp:
        for record in ArchiveIterator(resp.raw):
            # Only HTTP responses carry page content; skip warcinfo,
            # request, metadata and any other record types.
            if record.rec_type != 'response':
                continue
            if not record.http_headers:
                continue

            # Compare the media type only, so that headers such as
            # "text/html; charset=UTF-8" (or differently-cased values)
            # are accepted as well. A missing header is treated as non-HTML.
            content_type = record.http_headers.get_header('Content-Type') or ''
            if content_type.split(';', 1)[0].strip().lower() != 'text/html':
                continue

            try:
                soup = BeautifulSoup(record.content_stream().read().decode("utf-8"))
            except Exception:
                # Best effort: pages that are not valid UTF-8 or fail to
                # parse are dropped rather than aborting the whole archive.
                continue

            # Process record here, maybe spacy
            text = processor.process(soup)
            counts = word_count(text)
            # The sort is stable, so words with equal counts keep their
            # first-seen order. Slice three entries to match the name.
            ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
            top_3_words = [word for word, _ in ranked[:3]]

            node = record.rec_headers.get_header('WARC-Target-URI')
            outlinks = ",".join(link['href'] for link in soup.find_all('a', href=True))
            payload = {
                "Node": node,
                "Keywords": ",".join(top_3_words),
                "Outlinks": outlinks,
                "Score": 1.0,
            }
            socket.send(bytes(ujson.dumps(payload), "utf-8"))
# Each line of the manifest is a WARC path relative to the Common Crawl
# bucket URL used below.
with open("warc copy.txt", "r") as textfile:
    urls = textfile.readlines()
for url in urls:
    # A blank line would otherwise spawn a request for the bare bucket URL.
    if not url.strip():
        continue
    # Pass the current line; using urls[0] here would fetch the first
    # archive once per manifest line and never reach the others.
    POOL.spawn(print_records, "https://commoncrawl.s3.amazonaws.com/" + url)
# Wait for every crawl task to finish before announcing completion.
POOL.join()
# Presumably an end-of-stream marker for subscribers; confirm with consumers.
socket.send(bytes("done", "utf-8"))
# Single-archive example for manual testing:
# print_records('https://archive.org/download/ExampleArcAndWarcFiles/IAH-20080430204825-00000-blackbook.warc.gz')