From 9fd4fd44a38c2346d0d79e9070bf09fa6495e618 Mon Sep 17 00:00:00 2001 From: ethanchewy <17chiue@gmail.com> Date: Mon, 23 Oct 2017 12:14:36 -0700 Subject: [PATCH 1/8] Add time stamp functionality. --- GoogleScraper/database.py | 4 +++- GoogleScraper/parsing.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/GoogleScraper/database.py b/GoogleScraper/database.py index f3122397..e3538ebf 100644 --- a/GoogleScraper/database.py +++ b/GoogleScraper/database.py @@ -121,11 +121,12 @@ def set_values_from_parser(self, parser): parsed = urlparse(link['link']) # fill with nones to prevent key errors - [link.update({key: None}) for key in ('snippet', 'title', 'visible_link') if key not in link] + [link.update({key: None}) for key in ('snippet', 'time_stamp','title', 'visible_link') if key not in link] Link( link=link['link'], snippet=link['snippet'], + time_stamp=link['time_stamp'], title=link['title'], visible_link=link['visible_link'], domain=parsed.netloc, @@ -169,6 +170,7 @@ class Link(Base): id = Column(Integer, primary_key=True) title = Column(String) snippet = Column(String) + time_stamp=Column(String) link = Column(String) domain = Column(String) visible_link = Column(String) diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py index 09fd4b41..04db9c9b 100644 --- a/GoogleScraper/parsing.py +++ b/GoogleScraper/parsing.py @@ -359,6 +359,7 @@ class GoogleParser(Parser): 'result_container': 'div.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', + 'time_stamp' : 'div.slp', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -367,6 +368,7 @@ class GoogleParser(Parser): 'result_container': 'li.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', + 'time_stamp' : 'div.slp', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -374,6 +376,7 @@ class GoogleParser(Parser): 'container': 'li.card-section', 'link': 'a._Dk::attr(href)', 'snippet': 'span._dwd::text', + 'time_stamp' : 'div.slp', 'title': 'a._Dk::text', 'visible_link': 'cite::text' }, From 0dee8db326f05e6d1d5e0ccd5e4b1231fecc3d7f Mon Sep 17 00:00:00 2001 From: ethanchewy <17chiue@gmail.com> Date: Mon, 23 Oct 2017 21:45:58 -0700 Subject: [PATCH 2/8] Add time stamp functionality. --- GoogleScraper/parsing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py index 04db9c9b..8962bb8a 100644 --- a/GoogleScraper/parsing.py +++ b/GoogleScraper/parsing.py @@ -359,7 +359,7 @@ class GoogleParser(Parser): 'result_container': 'div.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.slp', + 'time_stamp' : 'div.slp::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -368,7 +368,7 @@ class GoogleParser(Parser): 'result_container': 'li.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.slp', + 'time_stamp' : 'div.slp::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -376,7 +376,7 @@ class GoogleParser(Parser): 'container': 'li.card-section', 'link': 'a._Dk::attr(href)', 'snippet': 'span._dwd::text', - 'time_stamp' : 'div.slp', + 'time_stamp' : 'div.slp::text', 'title': 'a._Dk::text', 'visible_link': 'cite::text' }, From 129883b89071970d76215b487a87b93528445161 Mon Sep 17 00:00:00 2001 From: ethanchewy <17chiue@gmail.com> Date: Tue, 24 Oct 2017 03:32:10 -0700 Subject: [PATCH 3/8] make time stamp functionality broader --- GoogleScraper/parsing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py index 8962bb8a..ebd015f9 100644 --- a/GoogleScraper/parsing.py +++ b/GoogleScraper/parsing.py @@ -359,7 +359,7 @@ class GoogleParser(Parser): 'result_container': 'div.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.slp::text', + 'time_stamp' : 'div.f::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -368,7 +368,7 @@ class GoogleParser(Parser): 'result_container': 'li.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.slp::text', + 'time_stamp' : 'div.f::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -376,7 +376,7 @@ class GoogleParser(Parser): 'container': 'li.card-section', 'link': 'a._Dk::attr(href)', 'snippet': 'span._dwd::text', - 'time_stamp' : 'div.slp::text', + 'time_stamp' : 'div.f::text', 'title': 'a._Dk::text', 'visible_link': 'cite::text' }, From ef2891ed088fa521bfa1f881c9d6714bfcaf561c Mon Sep 17 00:00:00 2001 From: ethanchewy <17chiue@gmail.com> Date: Tue, 24 Oct 2017 04:12:05 -0700 Subject: [PATCH 4/8] buggy code for ref --- GoogleScraper/database.py | 4 +++- GoogleScraper/parsing.py | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/GoogleScraper/database.py b/GoogleScraper/database.py index e3538ebf..038b05c4 100644 --- a/GoogleScraper/database.py +++ b/GoogleScraper/database.py @@ -121,12 +121,13 @@ def set_values_from_parser(self, parser): parsed = urlparse(link['link']) # fill with nones to prevent key errors - [link.update({key: None}) for key in ('snippet', 'time_stamp','title', 'visible_link') if key not in link] + [link.update({key: None}) for key in ('snippet', 'time_stamp', 'recent_time', 'title', 'visible_link') if key not in link] Link( link=link['link'], snippet=link['snippet'], time_stamp=link['time_stamp'], + recent_time=link['recent_time'], title=link['title'], visible_link=link['visible_link'], domain=parsed.netloc, @@ -171,6 +172,7 @@ class Link(Base): title = Column(String) snippet = Column(String) time_stamp=Column(String) + recent_time=Column(String) link = Column(String) domain = Column(String) visible_link = Column(String) diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py index ebd015f9..0f55337c 100644 --- a/GoogleScraper/parsing.py +++ b/GoogleScraper/parsing.py @@ -359,7 +359,8 @@ class GoogleParser(Parser): 'result_container': 'div.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.f::text', + 'time_stamp' : 'div.slp.f::text', + 'recent_time' : 'div.slp.f::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -368,7 +369,8 @@ class GoogleParser(Parser): 'result_container': 'li.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.f::text', + 'time_stamp' : 'div.slp.f::text', + 'recent_time' : 'div.slp.f::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -376,7 +378,8 @@ class GoogleParser(Parser): 'container': 'li.card-section', 'link': 'a._Dk::attr(href)', 'snippet': 'span._dwd::text', - 'time_stamp' : 'div.f::text', + 'time_stamp' : 'div.slp.f::text', + 'recent_time' : 'div.slp.f::text', 'title': 'a._Dk::text', 'visible_link': 'cite::text' }, From cb2064d84fac31ef42c2cefeb726a4f4ac0f04d1 Mon Sep 17 00:00:00 2001 From: ethanchewy <17chiue@gmail.com> Date: Tue, 24 Oct 2017 04:30:37 -0700 Subject: [PATCH 5/8] fix time --- GoogleScraper/database.py | 4 +--- GoogleScraper/parsing.py | 9 +++------ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/GoogleScraper/database.py b/GoogleScraper/database.py index 038b05c4..51925bcc 100644 --- a/GoogleScraper/database.py +++ b/GoogleScraper/database.py @@ -121,13 +121,12 @@ def set_values_from_parser(self, parser): parsed = urlparse(link['link']) # fill with nones to prevent key errors - [link.update({key: None}) for key in ('snippet', 'time_stamp', 'recent_time', 'title', 'visible_link') if key not in link] + [link.update({key: None}) for key in ('snippet', 'time_stamp', 'title', 'visible_link') if key not in link] Link( link=link['link'], snippet=link['snippet'], time_stamp=link['time_stamp'], - recent_time=link['recent_time'], title=link['title'], visible_link=link['visible_link'], domain=parsed.netloc, @@ -172,7 +171,6 @@ class Link(Base): title = Column(String) snippet = Column(String) time_stamp=Column(String) - recent_time=Column(String) link = Column(String) domain = Column(String) visible_link = Column(String) diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py index 0f55337c..924b752d 100644 --- a/GoogleScraper/parsing.py +++ b/GoogleScraper/parsing.py @@ -359,8 +359,7 @@ class GoogleParser(Parser): 'result_container': 'div.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.slp.f::text', - 'recent_time' : 'div.slp.f::text', + 'time_stamp' : 'div.s div.slp:text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -369,8 +368,7 @@ class GoogleParser(Parser): 'result_container': 'li.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.slp.f::text', - 'recent_time' : 'div.slp.f::text', + 'time_stamp' : 'div.s div.slp:text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -378,8 +376,7 @@ class GoogleParser(Parser): 'container': 'li.card-section', 'link': 'a._Dk::attr(href)', 'snippet': 'span._dwd::text', - 'time_stamp' : 'div.slp.f::text', - 'recent_time' : 'div.slp.f::text', + 'time_stamp' : 'div.s div.slp:text', 'title': 'a._Dk::text', 'visible_link': 'cite::text' }, From 998ef3eb4d61f136d217fc7637c4ec1bd7742faa Mon Sep 17 00:00:00 2001 From: ethanchewy <17chiue@gmail.com> Date: Wed, 25 Oct 2017 19:07:09 -0700 Subject: [PATCH 6/8] reverse buggy code --- GoogleScraper/database.py | 4 ++-- GoogleScraper/parsing.py | 8 ++++---- output.json | 1 + 3 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 output.json diff --git a/GoogleScraper/database.py b/GoogleScraper/database.py index 51925bcc..bc64c478 100644 --- a/GoogleScraper/database.py +++ b/GoogleScraper/database.py @@ -121,7 +121,7 @@ def set_values_from_parser(self, parser): parsed = urlparse(link['link']) # fill with nones to prevent key errors - [link.update({key: None}) for key in ('snippet', 'time_stamp', 'title', 'visible_link') if key not in link] + [link.update({key: None}) for key in ('snippet', 'time_stamp','title', 'visible_link') if key not in link] Link( link=link['link'], @@ -290,4 +290,4 @@ def fixtures(config, session): if not search_engine: session.add(SearchEngine(name=se)) - session.commit() + session.commit() \ No newline at end of file diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py index 924b752d..6e2b1608 100644 --- a/GoogleScraper/parsing.py +++ b/GoogleScraper/parsing.py @@ -359,7 +359,7 @@ class GoogleParser(Parser): 'result_container': 'div.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.s div.slp:text', + 'time_stamp' : 'div.slp::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -368,7 +368,7 @@ class GoogleParser(Parser): 'result_container': 'li.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.s div.slp:text', + 'time_stamp' : 'div.slp::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -376,7 +376,7 @@ class GoogleParser(Parser): 'container': 'li.card-section', 'link': 'a._Dk::attr(href)', 'snippet': 'span._dwd::text', - 'time_stamp' : 'div.s div.slp:text', + 'time_stamp' : 'div.slp::text', 'title': 'a._Dk::text', 'visible_link': 'cite::text' }, @@ -1079,4 +1079,4 @@ def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None, print(parser) with open('/tmp/testhtml.html', 'w') as of: - of.write(raw_html) + of.write(raw_html) \ No newline at end of file diff --git a/output.json b/output.json new file mode 100644 index 00000000..0637a088 --- /dev/null +++ b/output.json @@ -0,0 +1 @@ +[] \ No newline at end of file From 9bd8c5d642fe432ae0a4404154db6ab61c03fc8b Mon Sep 17 00:00:00 2001 From: ethanchewy <17chiue@gmail.com> Date: Mon, 20 Nov 2017 21:29:11 -0800 Subject: [PATCH 7/8] Added search by time functionality! --- GoogleScraper/http_mode.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/GoogleScraper/http_mode.py b/GoogleScraper/http_mode.py index fab1ec3d..69480897 100644 --- a/GoogleScraper/http_mode.py +++ b/GoogleScraper/http_mode.py @@ -44,6 +44,12 @@ def get_GET_params_for_search_engine(query, search_engine, page_number=1, num_re # state by some hard coded needles. search_params['hl'] = 'en' search_params['q'] = query + #+ "&tbs=cdr%3A1%2Ccd_min%3A2015%2Ccd_max%3A2016&tbm=" + ''' + search_params['tbs'] ='cdr:1' + search_params['cd_min'] = '2015' + search_params['cd_max'] = '2016' + ''' # only set when other num results than 10. if num_results_per_page != 10: search_params['num'] = str(num_results_per_page) @@ -75,6 +81,17 @@ def get_GET_params_for_search_engine(query, search_engine, page_number=1, num_re 'source': 'lnms', 'sa': 'X' }) + #TEST + + + ''' + search_params.update({ + 'tbs' : 'cdr:1', + 'cd_min': '3/2/2015', + 'cd_max': '3/2/2016' + }) + ''' + elif search_engine == 'yandex': search_params['text'] = query @@ -157,7 +174,8 @@ def __init__(self, config, *args, time_offset=0.0, **kwargs): self.scrape_method = 'http' # get the base search url based on the search engine. - self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method) + #+ "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A2015%2Ccd_max%3A2016&tbm= + self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method) + "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A2015%2Ccd_max%3A2016&tbm=" super().instance_creation_info(self.__class__.__name__) @@ -256,7 +274,16 @@ def search(self, rand=True, timeout=15): success = True self.build_search() - + #"tbs=cdr%3A1%2Ccd_min%3A2016%2Ccd_max%3A2015&ei=kdsMWtn0NumD0gLdvYWQDA&" + ''' + self.headers['User-Agent'] = random_user_agent(only_desktop=True) + super().detection_prevention_sleep() + super().keyword_info() + print self.base_search_url + urlencode(self.search_params) + sys.exit() + ''' + + if rand: self.headers['User-Agent'] = random_user_agent(only_desktop=True) @@ -291,6 +318,7 @@ def search(self, rand=True, timeout=15): success = False super().after_search() + return success From 4356901540ea0f3ad4a639e8b5d084656895098f Mon Sep 17 00:00:00 2001 From: ethanchewy <17chiue@gmail.com> Date: Mon, 20 Nov 2017 22:35:01 -0800 Subject: [PATCH 8/8] Fixed buggy time search where it only worked for http search not for async mode --- GoogleScraper/http_mode.py | 2 +- GoogleScraper/parsing.py | 6 +++--- GoogleScraper/scraping.py | 7 +++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/GoogleScraper/http_mode.py b/GoogleScraper/http_mode.py index 69480897..1723c5e0 100644 --- a/GoogleScraper/http_mode.py +++ b/GoogleScraper/http_mode.py @@ -175,7 +175,7 @@ def __init__(self, config, *args, time_offset=0.0, **kwargs): # get the base search url based on the search engine. #+ "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A2015%2Ccd_max%3A2016&tbm= - self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method) + "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A2015%2Ccd_max%3A2016&tbm=" + self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method) super().instance_creation_info(self.__class__.__name__) diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py index 6e2b1608..7c452131 100644 --- a/GoogleScraper/parsing.py +++ b/GoogleScraper/parsing.py @@ -359,7 +359,7 @@ class GoogleParser(Parser): 'result_container': 'div.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.slp::text', + 'time_stamp' : 'div.s div.slp::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -368,7 +368,7 @@ class GoogleParser(Parser): 'result_container': 'li.g ', 'link': 'h3.r > a:first-child::attr(href)', 'snippet': 'div.s span.st::text', - 'time_stamp' : 'div.slp::text', + 'time_stamp' : 'div.s div.slp::text', 'title': 'h3.r > a:first-child::text', 'visible_link': 'cite::text' }, @@ -376,7 +376,7 @@ class GoogleParser(Parser): 'container': 'li.card-section', 'link': 'a._Dk::attr(href)', 'snippet': 'span._dwd::text', - 'time_stamp' : 'div.slp::text', + 'time_stamp' : 'div.s div.slp::text', 'title': 'a._Dk::text', 'visible_link': 'cite::text' }, diff --git a/GoogleScraper/scraping.py b/GoogleScraper/scraping.py index 8d691f4b..d768eaae 100644 --- a/GoogleScraper/scraping.py +++ b/GoogleScraper/scraping.py @@ -78,7 +78,7 @@ def get_base_search_url_by_search_engine(config, search_engine_name, search_mode """ assert search_mode in SEARCH_MODES, 'search mode "{}" is not available'.format(search_mode) - specific_base_url = config.get('{}_{}_search_url'.format(search_mode, search_engine_name), None) + specific_base_url = config.get('{}_{}_search_url'.format(search_mode, search_engine_name), None) if not specific_base_url: specific_base_url = config.get('{}_search_url'.format(search_engine_name), None) @@ -90,9 +90,8 @@ def get_base_search_url_by_search_engine(config, search_engine_name, search_mode ips = file.read().split('\n') random_ip = random.choice(ips) return random_ip - - return specific_base_url - + specific_base_url += "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A3%2F1%2F2015%2Ccd_max%3A11%2F1%2F2015&tbm=&" + return specific_base_url class SearchEngineScrape(metaclass=abc.ABCMeta): """Abstract base class that represents a search engine scrape.