-
Notifications
You must be signed in to change notification settings - Fork 0
/
imdb.py
240 lines (194 loc) · 7.23 KB
/
imdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
from bs4 import BeautifulSoup
from requests import get

# Fetch the first results page for 2017 releases, ordered by number of votes,
# and report how many characters of HTML came back.
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
# Send the same Accept-Language header as the multi-page requests later in
# this file, so both crawls ask the server for the same (English) content.
response = get(url, headers={"Accept-Language": "en-US, en;q=0.5"})
print(len(response.text))

# Parse the page and collect every <div> whose class attribute is exactly
# 'lister-item mode-advanced'; each one is treated as a single movie entry.
html_soup = BeautifulSoup(response.text, 'html.parser')
movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')

# Exploration scratchpad, kept for reference only. The triple-quoted string
# below is a bare expression statement, so nothing inside it is executed.
# It walks through pulling each field (name, year, rating, Metascore, votes,
# gross, description, runtime, genre) out of movie_containers[0].
# NOTE(review): the snippet under the "Year of release" heading actually reads
# the <strong> text into first_imdb, and movie_det is the whole find_all()
# result that .find() is then called on -- it would likely need an index (as
# the live extraction loop uses) before it could be re-enabled.
"""
print(movie_containers[0].find('div',class_='lister-item-content'))
first_movie=movie_containers[0].find('div',class_='lister-item-content')
first_movie.div
first_movie.a
first_movie.h3
first_movie.h3.a
##Name
first_name = first_movie.h3.a.text
first_name
##Year
first_year = first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold').text
first_year
## Year of release
first_imdb = float(first_movie.strong.text)
first_imdb
## Meta Score
first_mscore = first_movie.find('span', class_ = 'metascore favorable')
first_mscore = int(first_mscore.text)
print(first_mscore)
## NO of votes
first_votes = int(first_movie.find('span', attrs = {'name':'nv'})['data-value'])
first_votes
##Gross
gross =first_movie.find_all('span', attrs = {'name':'nv'})[1]['data-value']
gross
## Movie description
movie_desc=first_movie.find_all('p', class_ = 'text-muted')[1].text
movie_desc
##Movie details
movie_det=first_movie.find_all('p', class_ = 'text-muted')
movie_det
## Movie duration
movie_dur=movie_det.find('span',class_='runtime').text
movie_dur
## Movie Genre
movie_genre=movie_det.find('span',class_='genre').text
movie_genre
for container in movie_containers[:3]:
if container.find('div', class_ = 'ratings-metascore') is not None:
gross_inc =container.find_all('span', attrs = {'name':'nv'})[1]['data-value']
print(gross_inc)
"""
##################################################################################
# Single-page extraction
##################################################################################
# One accumulator list per output column. They are appended to in lockstep,
# so position i in every list describes the same movie.
names, years = [], []
imdb_ratings, metascores, votes = [], [], []
movie_description, movie_duration, movie_genre = [], [], []
# Gross income is deliberately not collected: many movies have no record.

for movie in movie_containers:
    # Entries without a Metascore block are skipped entirely.
    if movie.find('div', class_='ratings-metascore') is None:
        continue

    # Title and year both live under the entry's <h3> heading.
    heading = movie.h3
    names.append(heading.a.text)
    years.append(heading.find('span', class_='lister-item-year').text)

    # Numeric fields: IMDB rating, Metascore, and the vote count taken from
    # the 'data-value' attribute of the first span named 'nv'.
    imdb_ratings.append(float(movie.strong.text))
    metascores.append(int(movie.find('span', class_='metascore').text))
    votes.append(int(movie.find('span', attrs={'name': 'nv'})['data-value']))

    # The 'text-muted' paragraphs: index 1 is read as the description,
    # index 0 holds the runtime and genre spans.
    muted_paragraphs = movie.find_all('p', class_='text-muted')
    movie_description.append(muted_paragraphs[1].text)
    details = muted_paragraphs[0]
    movie_duration.append(details.find('span', class_='runtime').text)
    movie_genre.append(details.find('span', class_='genre').text)
import pandas as pd

# Combine the per-column lists from the single-page scrape into one table.
# Column order follows the insertion order of the dict below.
one_df = pd.DataFrame({
    'movie': names,
    'year': years,
    'imdb': imdb_ratings,
    'metascore': metascores,
    'votes': votes,
    # 'gross' is not included: no gross values are collected.
    # NOTE(review): "decription" is misspelled, but the key is the CSV column
    # header, so it is left unchanged here.
    'movie decription': movie_description,
    'movie duration': movie_duration,
    'movie genre': movie_genre,
})

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
one_df.info()
one_df.to_csv('50_movie_details.csv')
########################################################################################
# Crawl settings and a dry run of the request-rate monitor
########################################################################################
from random import randint
from time import sleep, time

from IPython.core.display import clear_output

# Header sent with every crawl request, plus the page numbers ('1'..'4') and
# release years ('2000'..'2017') to visit, kept as strings because they are
# concatenated straight into the URL.
headers = {"Accept-Language": "en-US, en;q=0.5"}
pages = list(map(str, range(1, 5)))
years_url = list(map(str, range(2000, 2018)))

# Dry run: simulate five requests separated by a random 1-3 second pause and
# print the running request frequency after each one.
start_time = time()
requests = 0
for _ in range(5):
    # A request would go here
    requests = requests + 1
    sleep(randint(1, 3))
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests,
                                                         requests / elapsed_time))
from warnings import warn

# Start from empty accumulators: the lists still hold the rows from the
# single-page scrape above, and that same page (2017, page 1) is fetched
# again below, so keeping them would duplicate those rows in the final table.
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
movie_description = []
movie_duration = []
movie_genre = []

# Restart the monitor so the printed frequency covers only real requests,
# not the simulated ones from the dry run above.
start_time = time()
requests = 0

# Crawl every (year, page) combination.
for year_url in years_url:
    # For every page in the interval 1-4
    for page in pages:
        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +
                       '&sort=num_votes,desc&page=' + page, headers=headers)

        # Pause the loop for a random 8-15 seconds between requests
        sleep(randint(8, 15))

        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait=True)

        # Warn about non-200 responses and skip them rather than parsing an
        # error page as if it were a results page.
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))
            continue

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the movie containers from this page
        mv_containers = page_html.find_all('div', class_='lister-item mode-advanced')

        # Iterate the containers of the page just fetched. The previous code
        # looped over `movie_containers`, i.e. the first page scraped at the
        # top of the file, so every request re-appended the same movies.
        for container in mv_containers:
            # Only movies that have a Metascore are recorded
            if container.find('div', class_='ratings-metascore') is None:
                continue

            # The name
            name = container.h3.a.text
            names.append(name)

            # The year
            year = container.h3.find('span', class_='lister-item-year').text
            years.append(year)

            # The IMDB rating
            imdb = float(container.strong.text)
            imdb_ratings.append(imdb)

            # The Metascore
            m_score = container.find('span', class_='metascore').text
            metascores.append(int(m_score))

            # The number of votes
            vote = container.find('span', attrs={'name': 'nv'})['data-value']
            votes.append(int(vote))

            # Gross income is not collected: many movies have no record.

            # Movie description (second 'text-muted' paragraph)
            movie_desc = container.find_all('p', class_='text-muted')[1].text
            movie_description.append(movie_desc)

            # First 'text-muted' paragraph carries runtime and genre
            movie_det = container.find_all('p', class_='text-muted')[0]

            # Movie duration
            movie_dur = movie_det.find('span', class_='runtime').text
            movie_duration.append(movie_dur)

            # Movie genre
            movie_gen = movie_det.find('span', class_='genre').text
            movie_genre.append(movie_gen)
import pandas as pd

# Combine the per-column lists filled by the multi-page crawl into one table.
# Column order follows the insertion order of the dict below.
final_df = pd.DataFrame({
    'movie': names,
    'year': years,
    'imdb': imdb_ratings,
    'metascore': metascores,
    'votes': votes,
    # 'gross' is not included: no gross values are collected.
    # NOTE(review): "decription" is misspelled, but the key is the CSV column
    # header, so it is left unchanged here.
    'movie decription': movie_description,
    'movie duration': movie_duration,
    'movie genre': movie_genre,
})

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
final_df.info()
final_df.to_csv('final_movie_details.csv')