
Commit

refactored & simplified we1schomp.py and scrape.py
Sean Gilleran committed Jul 24, 2018
1 parent 3d8802b commit d2b2bdd
Showing 2 changed files with 102 additions and 130 deletions.
112 changes: 7 additions & 105 deletions we1schomp.py
@@ -2,108 +2,8 @@
"""
"""

from uuid import uuid1
from argparse import ArgumentParser
from we1schomp import config, scrape, parse, save
from we1schomp.browser import Browser


def scrape_urls_from_google(settings='settings.ini'):
"""
"""

settings = config.load_from_file(settings)

browser = Browser('Chrome')

for site in settings.sections():

browser.wait_for_keypress = settings[site].getboolean(
'wait_for_keypress')
browser.sleep_min = settings[site].getfloat('sleep_min')
browser.sleep_max = settings[site].getfloat('sleep_max')

for term in settings[site]['terms'].split(','):

save.urls_to_file(
site=site, term=term,
data=scrape.urls_from_google(
term=term, site=site, browser=browser,
schema_version=settings[site]['schema_version'],
search_url=settings[site]['search_url'],
url_stops=settings[site]['url_stops'].split(',')),
filename=settings[site]['output_filename'],
output_path=settings[site]['output_path'])

browser.close()


def scrape_content_from_sites(settings='settings.ini'):
"""
"""

settings = config.load_from_file(settings)

browser = Browser('Chrome')

for site in settings.sections():

if settings[site].getboolean('urls_only'):
print(f'Skipping {site}.')
continue

browser.wait_for_keypress = settings[site].getboolean(
'wait_for_keypress')
browser.sleep_min = settings[site].getfloat('sleep_min')
browser.sleep_max = settings[site].getfloat('sleep_max')

for term in settings[site]['terms'].split(','):

articles = parse.urls_from_file(
filename=settings[site]['output_filename'].format(
site=site.replace('.', '-').replace('/', '-'),
term=term.replace(' ', '-'),
timestamp='').split('.')[0].strip('_'),
path=settings[site]['output_path'])

count = 0

for article in articles:

content = scrape.content_from_url(
url=article['url'], browser=browser,
content_tag=settings[site]['content_tag'],
content_length_min=settings[site].getint('content_length_min')) # noqa

name = settings[site]['output_filename'].format(
site=site.replace('.', '-').replace('/', '-'),
term=term.replace(' ', '-'),
timestamp=''
).replace('urls', f'{count:003d}').split('.')[0].strip('_')

save.article_to_file(
data={
'doc_id': str(uuid1()),
'term': term,
'site': site,
'url': article['url'],
'attachment_id': '',
'pub': settings[site]['name'],
'pub_date': article['date'],
'length': f'{len(content.split(" "))} words',
'title': article['title'],
'content': content,
'name': name,
'namespace': 'we1sv2.0',
'metapath': f'Corpus,{name},Rawdata'},
filename=settings[site]['output_filename'].replace(
'urls', f'{count:003d}'),
output_path=settings[site]['output_path'])

count += 1

browser.close()
print('Done!')
from we1schomp import config, scrape


if __name__ == '__main__':
@@ -121,17 +21,19 @@ def scrape_content_from_sites(settings='settings.ini'):
help='Only get articles, not Google results.')
args = parser.parse_args()

settings = config.load_from_file(args.settings_file)

if args.urls_only and not args.articles_only:
scrape_urls_from_google(args.settings_file)
scrape.urls_from_google(settings)
exit()

if args.articles_only and not args.urls_only:
scrape_content_from_sites(args.settings_file)
scrape.content_from_sites(settings)
exit()

scrape_urls_from_google(args.settings_file)
scrape.urls_from_google(settings)

print('\n')

scrape_content_from_sites(args.settings_file)
scrape.content_from_sites(settings)
exit()
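
Note: after this refactor, we1schomp.py only loads a single settings object via config.load_from_file() and hands it to the scrape module. The sketch below shows, as an assumption only, the kind of configparser-style settings file the new code appears to expect, with one section per site. The option names are the keys read in this diff; every value is invented for illustration and is not taken from the repository's actual settings.ini.

    # Illustrative sketch only: option names come from the keys read in the diff
    # (terms, search_url, url_stops, output_filename, output_path, content_tag,
    # content_length_min, sleep_min, sleep_max, wait_for_keypress, urls_only,
    # name); all values below are placeholders.
    from configparser import ConfigParser

    settings = ConfigParser()
    settings['example.com'] = {                  # one section per site
        'name': 'Example Publication',           # becomes the 'pub' field
        'terms': 'humanities,liberal arts',      # comma-separated search terms
        'search_url': 'https://www.google.com/search?q=site:{site}+{term}',
        'url_stops': '/tag/,/category/',         # comma-separated URL filters
        'output_filename': '{site}_{term}_urls_{timestamp}.json',
        'output_path': 'output',
        'content_tag': 'p',
        'content_length_min': '250',
        'sleep_min': '1.0',
        'sleep_max': '5.0',
        'wait_for_keypress': 'no',
        'urls_only': 'no',
    }

    with open('settings.ini', 'w', encoding='utf-8') as config_file:
        settings.write(config_file)

Reading such a file back through config.load_from_file() would then support the settings.sections() and settings[site] accesses seen throughout the new scrape.py.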
120 changes: 95 additions & 25 deletions we1schomp/scrape.py
@@ -1,49 +1,119 @@
"""
"""

from we1schomp import parse
from uuid import uuid1
from we1schomp import parse, save
from we1schomp.browser import Browser


def urls_from_google(term, site, schema_version, search_url,
url_stops=[], browser=None):
def urls_from_google(settings, browser=None):
"""
"""

if not browser:
browser = Browser()
browser = Browser('Chrome')

print(f'\nSearching Google for "{term}" at {site}.')
browser.go(search_url.format(term=term, site=site))
for site in settings.sections():

data = []
browser.wait_for_keypress = settings[site].getboolean(
'wait_for_keypress')
browser.sleep_min = settings[site].getfloat('sleep_min')
browser.sleep_max = settings[site].getfloat('sleep_max')

while True:
for term in settings[site]['terms'].split(','):

browser.check_for_google_captcha()
print(f'\nSearching Google for "{term}" at {site}.')
browser.go(settings[site]['search_url'].format(term=term, site=site)) # noqa

for url in parse.urls_from_google(browser.source, url_stops):
data.append(url)
data = []

if not browser.next_google_result():
break
while True:

return data
browser.check_for_google_captcha()

for url in parse.urls_from_google(
browser.source, settings[site]['url_stops'].split(',')):
data.append(url)

def content_from_url(url, content_tag='p', content_length_min=250,
browser=None):
"""
"""
if not browser.next_google_result():
break

if not browser:
browser = Browser()
save.urls_to_file(
site=site, term=term, data=data,
filename=settings[site]['output_filename'],
output_path=settings[site]['output_path'])

browser.go(url)
browser.close()
print('Done!')

content = parse.content_from_html(
browser.source, content_tag, content_length_min)

browser.sleep()
def content_from_sites(settings, browser=None):
"""
"""

return content
if not browser:
browser = Browser('Chrome')

browser = Browser('Chrome')

for site in settings.sections():

if settings[site].getboolean('urls_only'):
print(f'Skipping {site}.')
continue

browser.wait_for_keypress = settings[site].getboolean(
'wait_for_keypress')
browser.sleep_min = settings[site].getfloat('sleep_min')
browser.sleep_max = settings[site].getfloat('sleep_max')

for term in settings[site]['terms'].split(','):

articles = parse.urls_from_file(
filename=settings[site]['output_filename'].format(
site=site.replace('.', '-').replace('/', '-'),
term=term.replace(' ', '-'),
timestamp='').split('.')[0].strip('_'),
path=settings[site]['output_path'])

count = 0

for article in articles:

browser.sleep()
browser.go(article['url'])

content = parse.content_from_html(
browser.source,
content_tag=settings[site]['content_tag'],
content_length_min=settings[site].getint('content_length_min')) # noqa

name = settings[site]['output_filename'].format(
site=site.replace('.', '-').replace('/', '-'),
term=term.replace(' ', '-'),
timestamp=''
).replace('urls', f'{count:003d}').split('.')[0].strip('_')

save.article_to_file(
data={
'doc_id': str(uuid1()),
'term': term,
'site': site,
'url': article['url'],
'attachment_id': '',
'pub': settings[site]['name'],
'pub_date': article['date'],
'length': f'{len(content.split(" "))} words',
'title': article['title'],
'content': content,
'name': name,
'namespace': 'we1sv2.0',
'metapath': f'Corpus,{name},Rawdata'},
filename=settings[site]['output_filename'].replace(
'urls', f'{count:003d}'),
output_path=settings[site]['output_path'])

count += 1

browser.close()
print('Done!')
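
Note: for each article, the new content_from_sites() assembles a record like the one sketched below before handing it to save.article_to_file(). The field names are copied from the diff; every value here is a made-up placeholder, and how the record is written to disk is left to save.article_to_file().

    # Illustrative placeholder values only; field names match the dict built in
    # content_from_sites().
    example_article = {
        'doc_id': 'c0a8012e-8f55-11e8-b4f0-aabbccddeeff',  # str(uuid1())
        'term': 'humanities',
        'site': 'example.com',
        'url': 'https://example.com/story',
        'attachment_id': '',
        'pub': 'Example Publication',              # settings[site]['name']
        'pub_date': '2018-07-24',                  # article['date'] from the URLs file
        'length': '512 words',                     # f'{len(content.split(" "))} words'
        'title': 'Example headline',
        'content': 'Scraped article text ...',
        'name': 'example-com_humanities_000',
        'namespace': 'we1sv2.0',
        'metapath': 'Corpus,example-com_humanities_000,Rawdata',
    }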
