Adding time stamp functionality #195

Open · wants to merge 8 commits into master
6 changes: 4 additions & 2 deletions GoogleScraper/database.py
@@ -121,11 +121,12 @@ def set_values_from_parser(self, parser):
parsed = urlparse(link['link'])

# fill with Nones to prevent KeyErrors
[link.update({key: None}) for key in ('snippet', 'title', 'visible_link') if key not in link]
[link.update({key: None}) for key in ('snippet', 'time_stamp', 'title', 'visible_link') if key not in link]

Link(
link=link['link'],
snippet=link['snippet'],
time_stamp=link['time_stamp'],
title=link['title'],
visible_link=link['visible_link'],
domain=parsed.netloc,
@@ -169,6 +170,7 @@ class Link(Base):
id = Column(Integer, primary_key=True)
title = Column(String)
snippet = Column(String)
time_stamp = Column(String)
link = Column(String)
domain = Column(String)
visible_link = Column(String)
@@ -288,4 +290,4 @@ def fixtures(config, session):
if not search_engine:
session.add(SearchEngine(name=se))

session.commit()
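
The new time_stamp column simply stores whatever date text the parser scraped. A minimal sketch of reading it back, assuming `session` is a SQLAlchemy session bound to GoogleScraper's database (set up elsewhere in database.py):

from GoogleScraper.database import Link

def links_with_timestamp(session):
    # Only rows where Google reported a date for the result.
    return session.query(Link).filter(Link.time_stamp.isnot(None)).all()

for link in links_with_timestamp(session):
    print(link.time_stamp, link.link)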
30 changes: 29 additions & 1 deletion GoogleScraper/http_mode.py
@@ -44,6 +44,12 @@ def get_GET_params_for_search_engine(query, search_engine, page_number=1, num_re
# state by some hard coded needles.
search_params['hl'] = 'en'
search_params['q'] = query
#+ "&tbs=cdr%3A1%2Ccd_min%3A2015%2Ccd_max%3A2016&tbm="
'''
search_params['tbs'] ='cdr:1'
search_params['cd_min'] = '2015'
search_params['cd_max'] = '2016'
'''
# only set when other num results than 10.
if num_results_per_page != 10:
search_params['num'] = str(num_results_per_page)
@@ -75,6 +81,17 @@ def get_GET_params_for_search_engine(query, search_engine, page_number=1, num_re
'source': 'lnms',
'sa': 'X'
})
# Experiment (disabled): the same date-range restriction with explicit
# day/month/year bounds; as above, these would need to be folded into a
# single 'tbs' value to take effect:
# search_params.update({
#     'tbs': 'cdr:1',
#     'cd_min': '3/2/2015',
#     'cd_max': '3/2/2016'
# })

elif search_engine == 'yandex':
search_params['text'] = query
@@ -157,6 +174,7 @@ def __init__(self, config, *args, time_offset=0.0, **kwargs):
self.scrape_method = 'http'

# get the base search url based on the search engine.
#+ "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A2015%2Ccd_max%3A2016&tbm=
self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method)

super().instance_creation_info(self.__class__.__name__)
@@ -256,7 +274,16 @@ def search(self, rand=True, timeout=15):
success = True

self.build_search()

# Debug experiment (disabled): dump the final request URL and exit.
# Sample query string: "tbs=cdr%3A1%2Ccd_min%3A2016%2Ccd_max%3A2015&ei=kdsMWtn0NumD0gLdvYWQDA&"
# (note the reversed bounds: cd_min 2016, cd_max 2015).
# self.headers['User-Agent'] = random_user_agent(only_desktop=True)
# super().detection_prevention_sleep()
# super().keyword_info()
# print(self.base_search_url + urlencode(self.search_params))
# sys.exit()

if rand:
self.headers['User-Agent'] = random_user_agent(only_desktop=True)

@@ -291,6 +318,7 @@ def search(self, rand=True, timeout=15):
success = False

super().after_search()


return success

5 changes: 4 additions & 1 deletion GoogleScraper/parsing.py
@@ -359,6 +359,7 @@ class GoogleParser(Parser):
'result_container': 'div.g ',
'link': 'h3.r > a:first-child::attr(href)',
'snippet': 'div.s span.st::text',
'time_stamp': 'div.s div.slp::text',
'title': 'h3.r > a:first-child::text',
'visible_link': 'cite::text'
},
@@ -367,13 +368,15 @@
'result_container': 'li.g ',
'link': 'h3.r > a:first-child::attr(href)',
'snippet': 'div.s span.st::text',
'time_stamp': 'div.s div.slp::text',
'title': 'h3.r > a:first-child::text',
'visible_link': 'cite::text'
},
'de_ip_news_items': {
'container': 'li.card-section',
'link': 'a._Dk::attr(href)',
'snippet': 'span._dwd::text',
'time_stamp': 'div.s div.slp::text',
'title': 'a._Dk::text',
'visible_link': 'cite::text'
},
@@ -1076,4 +1079,4 @@ def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None,
print(parser)

with open('/tmp/testhtml.html', 'w') as of:
of.write(raw_html)
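
The time_stamp selector returns the date line exactly as Google renders it. Assuming it comes back looking like "Mar 2, 2015 - " (an assumption about Google's markup, not something this diff verifies), a normalization step before storing could look like:

from datetime import datetime

def normalize_time_stamp(raw):
    # Coerce strings such as "Mar 2, 2015 - " into ISO format, keeping
    # None for undated results and unknown formats verbatim.
    if not raw:
        return None
    raw = raw.strip().rstrip('-').strip()
    try:
        return datetime.strptime(raw, '%b %d, %Y').date().isoformat()
    except ValueError:
        return raw

print(normalize_time_stamp('Mar 2, 2015 - '))  # 2015-03-02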
7 changes: 3 additions & 4 deletions GoogleScraper/scraping.py
@@ -78,7 +78,7 @@ def get_base_search_url_by_search_engine(config, search_engine_name, search_mode
"""
assert search_mode in SEARCH_MODES, 'search mode "{}" is not available'.format(search_mode)

specific_base_url = config.get('{}_{}_search_url'.format(search_mode, search_engine_name), None)

if not specific_base_url:
specific_base_url = config.get('{}_search_url'.format(search_engine_name), None)
@@ -90,9 +90,8 @@
ips = file.read().split('\n')
random_ip = random.choice(ips)
return random_ip

# Hard-codes a Google custom date range (cdr:1, 3/1/2015 to 11/1/2015)
# onto every base search URL; the bounds are not configurable yet.
specific_base_url += "&source=lnt&tbs=cdr%3A1%2Ccd_min%3A3%2F1%2F2015%2Ccd_max%3A11%2F1%2F2015&tbm=&"
return specific_base_url
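
Rather than baking the range into every base URL, the bounds could come from config, which get_base_search_url_by_search_engine already receives. A sketch assuming two new, currently nonexistent config keys, google_cd_min and google_cd_max:

from urllib.parse import quote_plus

# Hypothetical keys; GoogleScraper does not define these today.
cd_min = config.get('google_cd_min', None)
cd_max = config.get('google_cd_max', None)
if cd_min and cd_max:
    tbs = quote_plus('cdr:1,cd_min:{},cd_max:{}'.format(cd_min, cd_max))
    specific_base_url += '&source=lnt&tbs={}&tbm='.format(tbs)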

class SearchEngineScrape(metaclass=abc.ABCMeta):
"""Abstract base class that represents a search engine scrape.
1 change: 1 addition & 0 deletions output.json
@@ -0,0 +1 @@
[]