-
Notifications
You must be signed in to change notification settings - Fork 1
/
Task2.py
91 lines (81 loc) · 3.11 KB
/
Task2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
## In this task, all the movies are grouped according to their year of release.
import requests, pprint
from bs4 import BeautifulSoup
# Download the IMDb "top rated Indian movies" page once, at import time, and
# parse it; scrape_top_list() reads the resulting module-level `parse` tree.
url = "https://www.imdb.com/india/top-rated-indian-movies/?ref_=nv_mv_250_in"
# Without a timeout, requests waits forever if the server never answers.
page = requests.get(url, timeout=30)
# Stop here with an HTTP error instead of parsing an error page and failing
# later with an unrelated-looking exception.
page.raise_for_status()
text = page.text
parse = BeautifulSoup(text, "html.parser")
# h1 = parse.find('h1').text
# print (h1)
# title = parse.find('title').text
# print (title)
def _parse_movie_row(row):
    """Build one movie record from a single ``<tr>`` of the chart table.

    Returns a dict with the keys ``position`` (int), ``name``, ``year``,
    ``rating`` and ``url`` (all strings, kept exactly as scraped).

    Raises:
        ValueError: if the title cell text has no "." or the text before the
            first "." is not an integer.
    """
    title_cell = row.find('td', class_="titleColumn")
    cell_text = title_cell.get_text().strip()

    # The chart position is whatever precedes the first "." in the cell text.
    rank_text, dot, _ = cell_text.partition(".")
    if not dot:
        # Fail on the offending row instead of silently mis-numbering the
        # rows that follow it.
        raise ValueError(
            "no position prefix in title cell: {!r}".format(cell_text)
        )

    anchor = title_cell.find('a')
    rating_cell = row.find('td', class_="ratingColumn imdbRating")
    return {
        'position': int(rank_text),
        'name': anchor.get_text(),
        'year': title_cell.find('span').text,
        'rating': rating_cell.find('strong').text,
        'url': "http://www.imdb.com" + anchor['href'],
    }


def _group_by_year(movies):
    """Group movie records by their ``year`` value, keeping chart order."""
    grouped = {}
    for movie in movies:
        entry = {
            'name': movie['name'],
            'year': movie['year'],
            'position': movie['position'],
            'rating': movie['rating'],
            'url': movie['url'],
        }
        # setdefault creates the year's list on first sight, so every movie
        # is visited once rather than rescanning the list for each row.
        grouped.setdefault(movie['year'], []).append(entry)
    return grouped


def scrape_top_list():
    """Scrape the chart in the module-level ``parse`` tree, grouped by year.

    Reads every ``<tr>`` of the ``tbody.lister-list`` table inside
    ``div.lister``, turns each row into a movie record, groups the records by
    their year text and pretty-prints the grouping.

    Returns:
        dict: maps each year string (as scraped from the row's ``<span>``) to
        the list of movie dicts for that year, in chart order. Each movie dict
        has the keys ``name``, ``year``, ``position``, ``rating`` and ``url``.

    Raises:
        ValueError: if a row's title cell has no parsable position prefix.

    NOTE(review): the lookups are not guarded, so a page without the expected
    elements presumably fails with AttributeError on a ``None`` result --
    confirm against the current page layout.
    """
    lister = parse.find('div', class_="lister")
    tbody = lister.find('tbody', class_="lister-list")

    # One pass per row keeps position, name, year, rating and url together,
    # so a malformed row can never shift values onto a different movie.
    movies = [_parse_movie_row(row) for row in tbody.find_all('tr')]

    movies_by_year = _group_by_year(movies)
    pprint.pprint(movies_by_year)
    return movies_by_year
# Script entry point: run the scrape, which pretty-prints the movies grouped by year.
scrape_top_list()