diff --git a/GoogleScraper/database.py b/GoogleScraper/database.py index f3122397..bc64c478 100644 --- a/GoogleScraper/database.py +++ b/GoogleScraper/database.py @@ -121,11 +121,12 @@ def set_values_from_parser(self, parser): parsed = urlparse(link['link']) # fill with nones to prevent key errors - [link.update({key: None}) for key in ('snippet', 'title', 'visible_link') if key not in link] + [link.update({key: None}) for key in ('snippet', 'time_stamp','title', 'visible_link') if key not in link] Link( link=link['link'], snippet=link['snippet'], + time_stamp=link['time_stamp'], title=link['title'], visible_link=link['visible_link'], domain=parsed.netloc, @@ -169,6 +170,7 @@ class Link(Base): id = Column(Integer, primary_key=True) title = Column(String) snippet = Column(String) + time_stamp=Column(String) link = Column(String) domain = Column(String) visible_link = Column(String) @@ -288,4 +290,4 @@ def fixtures(config, session): if not search_engine: session.add(SearchEngine(name=se)) - session.commit() + session.commit() \ No newline at end of file diff --git a/GoogleScraper/http_mode.py b/GoogleScraper/http_mode.py index fab1ec3d..1723c5e0 100644 --- a/GoogleScraper/http_mode.py +++ b/GoogleScraper/http_mode.py @@ -44,6 +44,12 @@ def get_GET_params_for_search_engine(query, search_engine, page_number=1, num_re # state by some hard coded needles. search_params['hl'] = 'en' search_params['q'] = query + #+ "&tbs=cdr%3A1%2Ccd_min%3A2015%2Ccd_max%3A2016&tbm=" + ''' + search_params['tbs'] ='cdr:1' + search_params['cd_min'] = '2015' + search_params['cd_max'] = '2016' + ''' # only set when other num results than 10. if num_results_per_page != 10: search_params['num'] = str(num_results_per_page) @@ -75,6 +81,17 @@ def get_GET_params_for_search_engine(query, search_engine, page_number=1, num_re 'source': 'lnms', 'sa': 'X' }) + #TEST + + + ''' + search_params.update({ + 'tbs' : 'cdr:1', + 'cd_min': '3/2/2015', + 'cd_max': '3/2/2016' + }) + ''' + elif search_engine == 'yandex': search_params['text'] = query @@ -157,6 +174,7 @@ def __init__(self, config, *args, time_offset=0.0, **kwargs): self.scrape_method = 'http' # get the base search url based on the search engine. + #+ "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A2015%2Ccd_max%3A2016&tbm= self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method) super().instance_creation_info(self.__class__.__name__) @@ -256,7 +274,16 @@ def search(self, rand=True, timeout=15): success = True self.build_search() - + #"tbs=cdr%3A1%2Ccd_min%3A2016%2Ccd_max%3A2015&ei=kdsMWtn0NumD0gLdvYWQDA&" + ''' + self.headers['User-Agent'] = random_user_agent(only_desktop=True) + super().detection_prevention_sleep() + super().keyword_info() + print self.base_search_url + urlencode(self.search_params) + sys.exit() + ''' + + if rand: self.headers['User-Agent'] = random_user_agent(only_desktop=True) @@ -291,6 +318,7 @@ def search(self, rand=True, timeout=15): success = False super().after_search() + return success diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py index 09fd4b41..7c452131 100644 --- a/GoogleScraper/parsing.py +++ b/GoogleScraper/parsing.py @@ -359,6 +359,7 @@ class GoogleParser(Parser): 'result_container': 'div.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', + 'time_stamp' : 'div.s div.slp::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -367,6 +368,7 @@ class GoogleParser(Parser): 'result_container': 'li.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', + 'time_stamp' : 'div.s div.slp::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -374,6 +376,7 @@ class GoogleParser(Parser): 'container': 'li.card-section', 'link': 'a._Dk::attr(href)', 'snippet': 'span._dwd::text', + 'time_stamp' : 'div.s div.slp::text', 'title': 'a._Dk::text', 'visible_link': 'cite::text' }, @@ -1076,4 +1079,4 @@ def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None, print(parser) with open('/tmp/testhtml.html', 'w') as of: - of.write(raw_html) + of.write(raw_html) \ No newline at end of file diff --git a/GoogleScraper/scraping.py b/GoogleScraper/scraping.py index 8d691f4b..d768eaae 100644 --- a/GoogleScraper/scraping.py +++ b/GoogleScraper/scraping.py @@ -78,7 +78,7 @@ def get_base_search_url_by_search_engine(config, search_engine_name, search_mode """ assert search_mode in SEARCH_MODES, 'search mode "{}" is not available'.format(search_mode) - specific_base_url = config.get('{}_{}_search_url'.format(search_mode, search_engine_name), None) + specific_base_url = config.get('{}_{}_search_url'.format(search_mode, search_engine_name), None) if not specific_base_url: specific_base_url = config.get('{}_search_url'.format(search_engine_name), None) @@ -90,9 +90,8 @@ def get_base_search_url_by_search_engine(config, search_engine_name, search_mode ips = file.read().split('\n') random_ip = random.choice(ips) return random_ip - - return specific_base_url - + specific_base_url += "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A3%2F1%2F2015%2Ccd_max%3A11%2F1%2F2015&tbm=&" + return specific_base_url class SearchEngineScrape(metaclass=abc.ABCMeta): """Abstract base class that represents a search engine scrape. diff --git a/output.json b/output.json new file mode 100644 index 00000000..0637a088 --- /dev/null +++ b/output.json @@ -0,0 +1 @@ +[] \ No newline at end of file