-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
49 lines (38 loc) · 2.08 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""
The program takes two input values from the user: the number of pages (an integer) and the type of articles. Then continues to process the Nature website data.
The main goal is to save the articles with the correct article bodies once the program has been executed.
"""
import requests
import os
from bs4 import BeautifulSoup
def getScrapper(url, timeout=10):
    """Fetch *url* and return a BeautifulSoup over the response body.

    Args:
        url: Fully-qualified URL to download.
        timeout: Seconds before the request is abandoned (new, defaults to 10
            so a stalled connection cannot hang the scraper forever).

    Returns:
        BeautifulSoup parsed with the built-in 'html.parser'.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status —
            previously error pages were silently parsed as if they were
            article listings.
    """
    # Ask for English content so the scraped text is consistent.
    response = requests.get(
        url,
        headers={'Accept-Language': 'en-US,en;q=0.5'},
        timeout=timeout,
    )
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')
print('Enter the number of pages you wish to scrape:')
number_of_pages = int(input())
print('Enter the article type you would like results for:')
article_type = input()  # input() already returns str; str() wrapper removed

# Base listing URL kept static for legal purposes; the page number is
# appended per iteration below. (The original hard-coded '&page=3' and then
# concatenated another 'page=N', producing an invalid query string.)
base_url = 'https://www.nature.com/nature/articles?sort=PubDate&year=2020'

for num in range(number_of_pages):  # one listing page per iteration
    # Directory per website page. Previously os.chdir() was never undone,
    # so Page_2 ended up nested inside Page_1, Page_3 inside Page_2, etc.
    # Writing via explicit paths avoids chdir entirely.
    page_dir = 'Page_' + str(num + 1)
    os.makedirs(page_dir, exist_ok=True)  # no crash on re-run

    # Listing pages are 1-based, matching the directory numbering
    # (the original requested page=0 for Page_1).
    soup = getScrapper(base_url + '&page=' + str(num + 1))
    articles = soup.find_all('article')  # everything needed lives in <article>

    # Filter by article type, then scrape each matching article's body.
    for article in articles:
        if article.find_next('span', class_="c-meta__type").text == article_type:
            href = article.find('a').get('href')
            # Collapse whitespace in the title so it is a safe filename.
            title = '_'.join(article.find('a').text.split())
            article_soup = getScrapper('https://www.nature.com' + href)
            paragraph_text = article_soup.find_all(
                'div', {'class': "c-article-body u-clearfix"})
            # 'with' guarantees the file is closed even if a write fails;
            # text mode + explicit UTF-8 replaces the manual bytes() dance.
            out_path = os.path.join(page_dir, title + '.txt')
            with open(out_path, 'w', encoding='utf-8') as out_file:
                for paragraph in paragraph_text:
                    out_file.write(paragraph.text)