crawl.py
from googlesearch import search
from bs4 import BeautifulSoup
import requests
import re
import concurrent.futures
import cchardet  # noqa: F401 -- speeds up encoding detection in BeautifulSoup
import lxml  # noqa: F401 -- parser backend passed to BeautifulSoup below

MAX_THREADS = 30
MAX_DOCS = 5  # stop after this many successfully scraped pages


class Documents:
    def __init__(self, query):
        self.query = query
        # search() returns a generator; materialize it so it can be
        # measured with len() and iterated more than once.
        self.links = list(search(self.query))
        self.requests_session = requests.Session()

    def fetch(self, url):
        """Download one page and return its paragraph text, or '' on failure."""
        # Skip protocol-relative or otherwise malformed URLs such as '/path'.
        if url.split('/')[0] == '':
            return ''
        try:
            html = self.requests_session.get(url, timeout=4)
        except requests.RequestException:
            return ''
        tree = BeautifulSoup(html.text, 'lxml')
        doc = []
        for p in tree.find_all('p'):
            p_text = p.get_text()
            # Drop empty paragraphs and short fragments (nav links, captions).
            if p_text == '\n' or len(p_text) < 30:
                continue
            doc.append(p_text)
        doc_str = ' '.join(doc)
        doc_str = re.sub(r'\[.*?\]', '', doc_str)  # strip citation markers like [1]
        doc_str = doc_str.replace('\xa0', ' ')     # non-breaking spaces
        doc_str = re.sub(r' +', ' ', doc_str)      # collapse repeated spaces
        doc_str = re.sub(r'\n\n', '\n', doc_str)   # collapse blank lines
        return doc_str

    def get(self):
        if not self.links:
            return []
        threads = min(MAX_THREADS, len(self.links))
        docs = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            # Submit all fetches up front so they run concurrently; calling
            # .result() inside the submit loop would serialize them.
            futures = [executor.submit(self.fetch, link) for link in self.links]
            for future in concurrent.futures.as_completed(futures):
                doc = future.result()
                if doc != '':
                    docs.append(doc)
                if len(docs) >= MAX_DOCS:
                    # Cancel any fetches that have not started yet.
                    for f in futures:
                        f.cancel()
                    break
        return docs
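
# Example usage (a minimal sketch, not part of the original file): running
# this module directly performs a live Google search, so it assumes network
# access and that the googlesearch-python, beautifulsoup4, requests, lxml,
# and cchardet packages are installed. The query string is illustrative.
if __name__ == '__main__':
    documents = Documents('python web scraping tutorial')
    for i, doc in enumerate(documents.get(), start=1):
        print(f'--- Document {i} ({len(doc)} chars) ---')
        print(doc[:200])  # preview the first 200 characters of each page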