forked from melodyfelix/cs411_project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
51 lines (42 loc) · 1.19 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import requests
from bs4 import BeautifulSoup
import sqlite3
def get_db(path: str = 'instance/app.sqlite') -> sqlite3.Connection:
    """Open a SQLite connection whose rows support access by column name.

    Args:
        path: Database file to open. Defaults to the application's database
            at 'instance/app.sqlite', resolved against the current working
            directory. Any value accepted by ``sqlite3.connect`` works, e.g.
            ':memory:' for a throwaway in-memory database.

    Returns:
        An open connection with ``row_factory`` set to ``sqlite3.Row``.
        The caller is responsible for closing it.
    """
    db = sqlite3.connect(path)
    # sqlite3.Row lets callers read columns by name as well as by index.
    db.row_factory = sqlite3.Row
    return db
def _extract_title(raw):
    """Return the song title from the text of a scraped table cell.

    Keeps any text before the first double quote plus the text between the
    first and second double quotes; everything after the closing quote
    (the album name, per the original script's comment) is dropped. Text
    containing no double quote is returned unchanged.
    """
    before, _, rest = raw.partition('"')
    quoted, _, _ = rest.partition('"')
    return before + quoted


db = get_db()
try:
    # setup web scraper
    url = 'https://en.wikipedia.org/wiki/Rolling_Stone_100_Best_Songs_of_the_Decade'
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    # Fail fast on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    # find the first table on the scraped page and all of its rows
    table = soup.find('table')
    if table is None:
        raise RuntimeError('no <table> found at ' + url)
    rows = table.find_all('tr')

    # add all songs to db; the first row is skipped (presumably the header)
    for row in rows[1:]:
        cols = [elem.text.strip() for elem in row.find_all('td')]
        anchor = row.select_one('a')
        # Skip rows that do not have the expected shape (at least three data
        # cells and a link) instead of crashing halfway through the import.
        if len(cols) < 3 or anchor is None or not anchor.get('href'):
            continue

        # strip quotes and album name from song_name
        song_name = _extract_title(cols[1])
        genre = cols[2]
        # The first anchor in the row is used; its href is joined to the
        # site root to form the stored URL.
        link = 'https://en.wikipedia.org' + anchor['href']
        db.execute(
            'INSERT INTO Song (SongName, Genre, Song_Url) VALUES (?,?,?)',
            (song_name, genre, link)
        )

    # Commit once after all rows are inserted so a failure part-way through
    # leaves the table unchanged.
    db.commit()
finally:
    # Always release the connection, even if scraping or inserting failed.
    db.close()