-
Notifications
You must be signed in to change notification settings - Fork 0
/
movie_inf_wiki.py
61 lines (46 loc) · 1.7 KB
/
movie_inf_wiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import re
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# URL of the page to scrape
url = 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture'

# Seconds to wait on a single HTTP request before giving up; without a
# timeout, requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Bracketed markers (e.g. "[1]") left in the running-time text by get_text().
# Compiled once here instead of once per film inside the loop.
FOOTNOTE_PATTERN = re.compile(r'\[.*?\]')

film_url = {}     # link title attribute -> absolute URL of the linked article
film_length = {}  # same key -> running-time text taken from the infobox


def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed with ``html.parser``.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (raised by ``raise_for_status``).
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def _find_decade_table(soup, decade_id):
    """Return the first table that follows the element with id ``decade_id``.

    Raises:
        LookupError: if no element carries that id, or no table follows it.
    """
    # Match the id on any tag rather than only on <span>, so the lookup does
    # not depend on which element of the heading carries the id.
    # find_next() walks forward in document order, so starting from the
    # anchor itself (instead of its parent) reaches the same table.
    anchor = soup.find(id=decade_id)
    if anchor is None:
        raise LookupError(f'no element with id={decade_id!r} found')
    table = anchor.find_next('table')
    if table is None:
        raise LookupError(f'no table found after id={decade_id!r}')
    return table


def _collect_film_links(table, base_url):
    """Map italicised links in ``table`` to absolute article URLs.

    For every <i> inside a <td>, the first <a> is taken; its ``title``
    attribute becomes the key and its ``href`` (resolved against
    ``base_url``) the value. Links missing either attribute are ignored.
    """
    links = {}
    for row in table.find_all('tr'):
        for cell in row.find_all('td'):
            for italic in cell.find_all('i'):
                a_tag = italic.find('a')
                if a_tag is None:
                    continue
                href = a_tag.get('href')
                title = a_tag.get('title')
                if not href or not title:
                    continue
                # urljoin resolves a root-relative href without producing
                # the double slash that plain string concatenation gave.
                links[title] = urljoin(base_url, href)
    return links


def _extract_running_time(soup):
    """Return the infobox "Running time" text, or None if it is absent.

    Bracketed markers matched by FOOTNOTE_PATTERN are removed from the text.
    """
    info_box = soup.find('table', {'class': 'infobox vevent'})
    if info_box is None:
        return None
    header = info_box.find('th', string='Running time')
    if header is None:
        return None
    value_cell = header.find_next_sibling('td')
    if value_cell is None:
        return None
    return FOOTNOTE_PATTERN.sub('', value_cell.get_text(strip=True))


# Step 1: gather the article links listed in the table after the 1950s anchor.
film_url.update(
    _collect_film_links(_find_decade_table(_fetch_soup(url), '1950s'), url)
)

# Step 2: visit each article and record its running time. A film whose page
# cannot be fetched or has no running time is reported and skipped, so one
# bad page does not abort the whole run.
for title, link in film_url.items():
    try:
        length = _extract_running_time(_fetch_soup(link))
    except requests.RequestException as err:
        print(f'skipping {title!r}: {err}', file=sys.stderr)
        continue
    if length is None:
        print(f'skipping {title!r}: no running time found', file=sys.stderr)
        continue
    film_length[title] = length

print(film_length)
# response=requests.get(film_url['All About Eve'])
# response.encoding = 'utf-8'
# soup = BeautifulSoup(response.text, 'html.parser')
# info_box = soup.find('table', {'class': 'infobox vevent'})
# runtime_row = info_box.find('th', string='Running time').find_next_sibling('td')
# length=runtime_row.get_text(strip=True)
# pattern = re.compile(r'\[.*?\]')
# if '[' in length:
# length = pattern.sub('', length)
# film_length['x']=length
# print(film_length)