-
Notifications
You must be signed in to change notification settings - Fork 59
/
Copy path09_simple_bs4_tutorial.py
251 lines (200 loc) · 5.84 KB
/
09_simple_bs4_tutorial.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
###############################################
# Simple BS4 tutorial
# -------------------
#
# Learn the basics of parsing with bs4
# by parsing sightseeingpass:
# https://www.sightseeingpass.com/en/new-york
#
# By Doug Purcell
# http://www.purcellconsult.com
#
###############################################
from site_downloader import download
# Download the page to parse. `download` is the project's own helper from
# site_downloader; the rest of this script uses its result as a
# BeautifulSoup document (prettify, find_all, select, ...).
site = download('https://www.sightseeingpass.com/en/new-york')

# Indented copy of the markup; uncomment the print below to inspect it.
prettify = site.prettify()
# print(prettify)

# Dotted access (site.title, site.body, ...) gives the first tag of that name.
# Each value is followed by a blank line (end='\n\n') to separate sections.
print('---- title tag in page ----')
print(site.title, end='\n\n')

# .string is the text held inside the <title> element.
print('---- title tag string -----')
print(site.title.string, end='\n\n')

# .parent climbs one level up the tree; .name is that element's tag name.
print('---- title parent name ----', end='\n\n')
print(site.title.parent.name, end='\n\n')

print('----- body tag -----')
print(site.body, end='\n\n')

print('----- meta tag -----')
print(site.meta, end='\n\n')
# find_all(name) collects every tag with the given name. Each section below
# prints an opening banner, the matches, and a closing banner; sections that
# end with a blank line use end='\n\n'.

# all <script> tags
script_tags = site.find_all('script')
print('----- script tags -----', script_tags, '--- script tag ends ---',
      sep='\n')

# all <ul> tags
ul_tags = site.find_all('ul')
print('----- ul tags -----', ul_tags, '----- ul tags end -----',
      sep='\n', end='\n\n')

# all <li> tags
li_tags = site.find_all('li')
print('----- li tags -----', li_tags, '----- li tags ends -----',
      sep='\n', end='\n\n')

# all <a> tags
a_tags = site.find_all('a')
print('----- a tags ----- begin', a_tags, '----- a tags ----- end',
      sep='\n')

# all <p> tags
p_tag = site.find_all('p')
print('----- p tag ------', p_tag, '-----p tag ends ------',
      sep='\n', end='\n\n')

# all <img> tags
img_tag = site.find_all('img')
print('----- img tag ------', img_tag, '----- img tag ends ------',
      sep='\n', end='\n\n')

# all <button> tags
button_tag = site.find_all('button')
print('----- button tag ------', button_tag, '----- button tag ends ------',
      sep='\n', end='\n\n')

# all <div> tags
div_tag = site.find_all('div')
print('----- div tag ------', div_tag, '----- div tag ends ------',
      sep='\n', end='\n\n')

# all <span> tags
span_tag = site.find_all('span')
print('----- span tag -----', span_tag, '----- span tag end -----',
      sep='\n', end='\n\n')

# Notice the pattern: fetching every tag of one kind is always
# site.find_all('<tag name>').
# Counting tags in bs4
# ---------------------------
# Count the number of <br />, <i>, <u>, <b> and <div> tags.
# find_all returns a list of matches, so len() gives the count directly;
# there is no need to loop over the results and increment a counter by hand.
br_count = len(site.find_all('br'))
i_tag_count = len(site.find_all('i'))
u_tag_count = len(site.find_all('u'))
b_tag_count = len(site.find_all('b'))
div_count = len(site.find_all('div'))

print('---- Div Tag Count ----')
print('There\'s {} br tags'.format(br_count))
print('There\'s {} i tags'.format(i_tag_count))
print('There\'s {} u tags'.format(u_tag_count))
print('There\'s {} b tags'.format(b_tag_count))
print('There\'s {} div tags'.format(div_count))
# Refining our bs4 scrapes
# ------------------------
# Most scraping jobs aren't this simple: a broad query often returns data
# we don't want, so we need ways to extract only certain parts.
#
# find
# ----
# Signature: find(name, attrs, recursive, string, **kwargs)
# Details: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find
# find stops at the first match, whereas find_all scans the entire document.
# That makes find a natural fit for tags that occur once (body, html, head,
# title, etc.). find returns the match itself; find_all returns a list.
print('----- The header -----')
header = site.find('header')
header_line = 'The header is {}'.format(header)
print(header_line)
print('----- The header end -----', end='\n\n')

# find_all results can be sliced like any list: here, matches 1 through 3.
print('----- div slice -----')
div_tags = site.find_all('div')
second_to_fourth = div_tags[1:4]
print(second_to_fourth)
print('----- div slice end -----', end='\n\n')
# select
# ------
# select() filters the document with modern CSS selectors.
# Learn about the selector API: https://facelessuser.github.io/soupsieve
print('----- regular match with css selectors -----')
print(site.select('title'))
print()
print('----- find tags beneath other tags -----')
# Descendant selector: every <span> nested anywhere inside a <p>.
top_10_nyc = site.select('p span')
print(top_10_nyc)
# Refine the match to the element whose class is menu-hover-bar--red.
# select_one() returns the first match or None, so a page that no longer
# carries this class is reported instead of raising IndexError the way
# select(...)[0] would.
top_10_nyc = site.select_one('.menu-hover-bar--red')
if top_10_nyc is None:
    print('No element with class menu-hover-bar--red was found')
else:
    print(top_10_nyc)
    # Four ways to read what sits between the tags:
    # 1. get_text() returns all of the text inside the element.
    print(top_10_nyc.get_text())
    # 2. The first entry of .contents (the element's list of children).
    #    An element without children has nothing to index, so guard it.
    if top_10_nyc.contents:
        print(top_10_nyc.contents[0])
    # 3. .next_element, the node parsed immediately after this tag
    #    (the documented bs4 name for the older .next alias).
    print(top_10_nyc.next_element)
    # 4. .string, the tag's single string child, or None when the tag
    #    does not contain exactly one.
    print(top_10_nyc.string)
print()
print('----- find tags directly beneath other tags -----')
# Child selector: only <span> elements that are direct children of a <p>.
site_lists = site.select('p > span')
# Iterate the matches directly; no index is needed to reach each element.
for span in site_lists:
    # An empty <span></span> has no children, so skip it rather than
    # raising IndexError on contents[0].
    if span.contents:
        print(span.contents[0])
print()
print('----- Address -----')
# Paragraphs inside the footer's contact-us container.
contact_us = site.select('.desk-footer__contact-us p')
# select() returns an empty list when nothing matches (for example after a
# site redesign), so check before indexing the first paragraph.
if contact_us:
    print(contact_us[0].get_text())
else:
    print('No contact details found in the footer')
print('----- Get the a tags in the footer -----')
# Every <a> nested under the element with id="linksMoreInformation".
footer_links = site.select('#linksMoreInformation a')
for link in footer_links:
    print(link)
print()
print('----- Get the href tags in the footer -----')
for link in footer_links:
    # An anchor is not guaranteed to have an href; tag.get() returns None
    # for a missing attribute where tag['href'] would raise KeyError.
    href = link.get('href')
    if href is not None:
        print(href)
print()
print('----- Get the social media links -----')
# Anchors inside list items of the element with class "social-bar".
for social_link in site.select('.social-bar li a'):
    # Skip anchors without an href; tag.get() returns None for a missing
    # attribute where tag['href'] would raise KeyError.
    href = social_link.get('href')
    if href is not None:
        print(href)
print()
print('----- get all of the links on the site -----')
# {'href': True} keeps only the anchors that actually have an href
# attribute, so the lookup inside the loop always succeeds.
anchors_with_href = site.find_all('a', attrs={'href': True})
for anchor in anchors_with_href:
    target = anchor['href']
    print(target)
# Understanding parents and siblings
# ----------------------------------
# Parent lookups are for working your way up the tree: start at some
# element, then climb towards the root.
print()
print('----- parents -----', end='\n\n')
tag = site.find('head')
html_ancestors = tag.find_parents('html')
print(html_ancestors, end='\n\n')

# Siblings are elements that sit side by side at the same level.
print('----- siblings -----')
div_class = site.find('div')
next_to_div = div_class.find_next_sibling()
print(next_to_div)