search.py · 129 lines (100 loc) · 4.47 KB
import re
from html import unescape
from googleapiclient.discovery import build  # Google Custom Search API client
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tag.perceptron import PerceptronTagger
from nltk.tokenize import RegexpTokenizer
from unidecode import unidecode
import networking
STOP = set(stopwords.words("english")) - {"most", "least"}  # keep "most"/"least"; they matter in questions
# TODO: add German stopwords as well
tokenizer = RegexpTokenizer(r"\w+")
tagger = PerceptronTagger()
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0",
           "Accept": "*/*",
           "Accept-Language": "en-US,en;q=0.5",
           "Accept-Encoding": "gzip, deflate"}
GOOGLE_URL = "https://www.google.com/search?q={}&ie=utf-8&oe=utf-8&client=firefox-b-1-ab"


def find_keywords(words):
    """
    Tokenizes the given text and returns its words with stopwords removed.
    :param words: String of text to tokenize
    :return: List of lower-cased words without stopwords
    """
    return [w for w in tokenizer.tokenize(words.lower()) if w not in STOP]
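
# Illustrative example (not part of the original file; the exact output assumes
# NLTK's standard English stopword list):
#   find_keywords("What is the most populous city in Germany?")
#   -> ["most", "populous", "city", "germany"]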


def find_nouns(text, num_words, reverse=False):
    """
    Extracts noun phrases (runs of consecutive NN* tags) from the first
    (or last, if reverse is True) num_words tagged tokens of text.
    :param text: Text to tag
    :param num_words: Number of tagged tokens to consider
    :param reverse: Take the last num_words tokens instead of the first
    :return: List of noun phrases
    """
    tokens = word_tokenize(text)
    tags = [tag for tag in tagger.tag(tokens) if tag[1] != "POS"]  # drop possessive endings
    print(tags)  # debug output
    tags = tags[:num_words] if not reverse else tags[-num_words:]
    nouns = []
    consecutive_nouns = []
    for tag in tags:
        tag_type = tag[1]
        word = tag[0]
        if "NN" not in tag_type and len(consecutive_nouns) > 0:
            # A non-noun tag closes the current noun phrase
            nouns.append(" ".join(consecutive_nouns))
            consecutive_nouns = []
        elif "NN" in tag_type:
            consecutive_nouns.append(word)
    if len(consecutive_nouns) > 0:
        nouns.append(" ".join(consecutive_nouns))
    return nouns
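
# Illustrative example (not part of the original file; the exact phrases depend
# on the pretrained tagger's output):
#   find_nouns("The Eiffel Tower is a landmark in Paris", 10)
#   would typically return ["Eiffel Tower", "landmark", "Paris"]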


def find_q_word_location(question_lower):
    """
    Returns the index of the first question word (English or German) in question_lower, or None if not found.
    """
    # German "was" is excluded because it is also a common English word
    for q_word in ["what", "when", "who", "which", "whom", "where", "why", "how", "wann", "wer", "welche", "wem", "woher", "warum", "wie"]:
        q_word_location = question_lower.find(q_word)
        if q_word_location != -1:
            return q_word_location
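
# Illustrative examples (not part of the original file):
#   find_q_word_location("who wrote hamlet")            -> 0
#   find_q_word_location("in which year was it built")  -> 3
#   find_q_word_location("name the capital of peru")    -> None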


def get_google_links(page, num_results):
    """
    Parses a Google results page and returns up to num_results result URLs.
    """
    soup = BeautifulSoup(page, "html.parser")
    results = soup.find_all("h3", {"class": "r"})
    links = []
    for r in results:
        url = r.find("a")
        if url is not None:
            links.append(url["href"])
    links = list(dict.fromkeys(links))  # Remove duplicates while preserving order
    return links[:num_results]
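
# Illustrative example (not part of the original file; a tiny hand-made snippet
# in the old "h3.r" result markup that the selector above targets; real result
# pages are far more complex):
#   get_google_links('<h3 class="r"><a href="https://example.com">Example</a></h3>', 5)
#   -> ["https://example.com"]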


async def search_google(question, num_results, method=1):
    """
    Returns num_results urls from a google search of question.
    :param question: Question to search
    :param num_results: Number of results to return
    :param method: 1 to use Google's Custom Search API, anything else to scrape the results page
    :return: List of length num_results of urls retrieved from the search
    """
    # The Custom Search API is limited to 100 free queries per day
    # AIzaSyDouIO7aLLa4UZP_4yiRcu6AIeH1Ab1-eY
    if method == 1:
        service = build("customsearch", "v1", developerKey="AIzaSyAL9qFI0KHxhb-ozxoh0nZTTAtS7P8XqfA")
        result = service.cse().list(q=question, cx="006676110987290916144:u1aczmb_-he", num=num_results).execute()
        results = result["items"]
        links = []
        for r in results:
            links.append(r["link"])
        return links
    else:
        page = await networking.get_response(GOOGLE_URL.format(question), timeout=5, headers=HEADERS)
        return get_google_links(page, num_results)


async def multiple_search(questions, num_results):
    """
    Searches several questions concurrently (scraping only) and returns one list of links per question.
    """
    queries = list(map(GOOGLE_URL.format, questions))
    pages = await networking.get_responses(queries, timeout=5, headers=HEADERS)
    link_list = [get_google_links(page, num_results) for page in pages]
    return link_list


def clean_html(html):
    """
    Strips scripts, styles, comments, tags and extra whitespace from html and returns plain ASCII text.
    """
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace (including non-breaking-space entities)
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"\n", " ", cleaned)
    cleaned = re.sub(r"\s\s+", " ", cleaned)
    return unidecode(unescape(cleaned.strip()))
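
# Illustrative example (not part of the original file):
#   clean_html("<p>Hello&nbsp;<b>Wörld</b></p><!-- note --><script>var x;</script>")
#   -> "Hello World"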


async def get_clean_texts(urls, timeout=1.5, headers=HEADERS):
    """
    Fetches urls concurrently and returns their lower-cased plain-text contents.
    """
    responses = await networking.get_responses(urls, timeout, headers)
    return [clean_html(r).lower() for r in responses]
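

# Minimal usage sketch, not part of the original module. It assumes that the
# networking module imported above provides the async get_response/get_responses
# helpers with the signatures used in this file, and that the hard-coded Custom
# Search credentials are still valid; live results will vary.
if __name__ == "__main__":
    import asyncio

    async def demo():
        question = "What is the most populous city in Germany?"
        print(find_keywords(question))
        links = await search_google(question, 3, method=2)  # method=2 scrapes instead of using the API
        print(links)
        texts = await get_clean_texts(links)
        print([t[:80] for t in texts])

    asyncio.run(demo())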