-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathguardian_top_article.py
93 lines (57 loc) · 2.01 KB
/
guardian_top_article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
"""
Fetch the text of the main headline article from the Guardian website.
Print it out and print out the URLs of any links in the text.
"""
import bs4
import requests
# %% Helper functions
def get_and_soupify(url, timeout=10):
    """Gets a webpage and turns it into a BeautifulSoup.

    The status line of the response is printed on success, so the caller
    can see which pages were fetched.

    Arguments:
        url: string URL
        timeout: seconds to wait for the server before giving up
            (passed straight to requests.get)

    Returns: bs4.BeautifulSoup

    Raises RuntimeError: if status of request is not '200 (OK)'
    (Exceptions raised by requests itself are not caught here.)
    """
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server, which would hang the whole script.
    response = requests.get(url, timeout=timeout)
    msg = '{} ({}): {}'.format(response.status_code, response.reason, url)
    if response.status_code != 200:
        raise RuntimeError(msg)
    print(msg)
    return bs4.BeautifulSoup(response.text, 'lxml')
def print_output(content, title):
    """Print some output with a neat header line.

    The output is a blank line, a header of the form '#### <title> ####',
    the content, and a footer of '#' characters as wide as the header.

    Arguments:
        content: string content to print
        title: string title to display in header
    """
    # format() rather than '+' so that a non-string title does not raise.
    header = '#### {} ####'.format(title)
    footer = '#' * len(header)
    # The empty first item, joined with newlines, gives the leading blank line.
    print('', header, content, footer, sep='\n')
# %% Main page
main_url = 'https://www.theguardian.com/international'
page = get_and_soupify(main_url)
# Target:
#   Within the <section> with id 'headlines' ...
#   the first <a> tag ...
#   with class 'fc-item__link' ...
#   and data-link-name 'article'.
# find() returns None when nothing matches, so each step is checked and
# reported clearly instead of failing later with an opaque error.
headlines = page.find('section', attrs={'id': 'headlines'})
if headlines is None:
    raise RuntimeError('No headlines section found at ' + main_url)
top_headline = headlines.find(
    'a', attrs={'class': 'fc-item__link', 'data-link-name': 'article'})
if top_headline is None:
    raise RuntimeError('No headline article link found at ' + main_url)
article_url = top_headline['href']
# %% Top article page
article_page = get_and_soupify(article_url)
# Target:
#   Within the first <div> tag with class 'content__article-body' ...
#   all the <p> tags.
# find() returns None when the page has no such <div>; fail with a clear
# message rather than an AttributeError on the next line.
main_section = article_page.find('div', attrs={'class': 'content__article-body'})
if main_section is None:
    raise RuntimeError('No article body found at ' + article_url)
paragraphs = main_section.find_all('p')
# %% Article text
# Extract the plain text of every paragraph, then print one per line.
paragraph_texts = [paragraph.get_text() for paragraph in paragraphs]
body_text = '\n'.join(paragraph_texts)
print_output(body_text, 'Body text')
# %% Linked URLs
# Collect the target of every link found inside the article paragraphs.
links = []
for p in paragraphs:
    # href=True matches only <a> tags that have an href attribute, so an
    # anchor without one cannot raise KeyError on the lookup.
    links.extend(a['href'] for a in p.find_all('a', href=True))
print_output('\n'.join(links), 'Linked URLs')