-
Notifications
You must be signed in to change notification settings - Fork 0
/
code.py
182 lines (120 loc) · 5.75 KB
/
code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# --------------
from csv import reader
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a list and, optionally, the list's dimensions.

    Every element of ``dataset[start:end]`` is printed on its own line,
    each followed by an extra blank-line print as a visual separator.

    Keyword arguments:
    dataset -- list whose elements we want to see
    start -- index of the first element to print (included)
    end -- index at which printing stops (excluded)
    rows_and_columns -- optional flag. If true, also print the number of
        rows (``len(dataset)``) and the number of columns (length of the
        first row, or 0 when the dataset is empty).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # rather than raising IndexError on dataset[0].
        print('Number of columns:', len(dataset[0]) if dataset else 0)
def duplicate_and_unique_movies(dataset, index_):
    """Report how many rows repeat a value already seen in one column.

    The dataset is a nested list. Rows are scanned in order; the first
    row carrying a given value at position ``index_`` counts as unique,
    and every later row with the same value counts as a duplicate.
    The function prints the number of duplicates and up to 15 of the
    duplicated values. It returns nothing.

    Keyword arguments:
    dataset -- two dimensional list which we want to explore
    index_ -- column index whose value is checked for duplication in
        each row (the values must be hashable, e.g. strings)
    """
    # A set gives constant-time membership checks, so the scan stays
    # linear in the number of rows.
    seen = set()
    duplicate = []
    for movie in dataset:
        name = movie[index_]
        if name in seen:
            duplicate.append(name)
        else:
            seen.add(name)

    print('Number of duplicate Movies:', len(duplicate))
    print('\n')
    print('Examples of duplicate Movies:', duplicate[:15])
def movies_lang(dataset, index_, lang_):
    """Extract the movies of a particular language.

    Of all the rows in ``dataset``, keep those whose value at position
    ``index_`` equals ``lang_``. After filtering, explore_data() is
    called to print the first three matching rows and the dimensions of
    the filtered list.

    Keyword arguments:
    dataset -- list containing the details of the movies
    index_ -- column index holding the language to compare
    lang_ -- desired language for which we want to filter the movies

    Returns:
    movies_ -- list with the details of the movies in the selected language
    """
    # Filter the rows passed in as the argument, not any module-level
    # list, so the function works on whatever dataset the caller supplies.
    movies_ = [movie for movie in dataset if movie[index_] == lang_]

    # NOTE: the heading text is fixed and is printed unchanged whatever
    # value of lang_ was requested.
    print("Examples of Movies in English Language:")
    explore_data(movies_, 0, 3, True)
    return movies_
def rate_bucket(dataset, rate_low, rate_high, rating_index=-4):
    """Extract the movies within the specified rating range.

    Keep every row whose rating, converted with float(), lies between
    ``rate_low`` and ``rate_high`` (both bounds included). After
    filtering, explore_data() is called to print the first three
    matching rows and the dimensions of the filtered list.

    Keyword arguments:
    dataset -- list containing the details of the movies
    rate_low -- lower bound of the rating (included)
    rate_high -- upper bound of the rating (included)
    rating_index -- position of the rating within each row; defaults to
        -4, i.e. the fourth element from the end

    Returns:
    rated_movies -- list of the details of the movies with required ratings

    Raises:
    ValueError -- if a rating value cannot be converted by float()
    """
    rated_movies = [
        movie
        for movie in dataset
        if rate_low <= float(movie[rating_index]) <= rate_high
    ]

    print("Examples of Movies in required rating bucket:")
    explore_data(rated_movies, 0, 3, True)
    return rated_movies
# Read the data file and store its rows as the list 'movies'.
# NOTE(review): 'path' is not defined in this file; presumably it is
# supplied by the execution environment before this script runs - confirm.
# The 'with' block closes the file once all rows are loaded, and
# newline="" lets the csv module handle line endings inside quoted fields.
with open(path, encoding="utf8", newline="") as opened_file:
    read_file = reader(opened_file)
    movies = list(read_file)

# The first row is the header. Extract and store it in 'movies_header'.
movies_header = movies[0]
print("Movies Header:\n", movies_header)

# Remove the header from the dataset and store the result back in 'movies'.
movies = movies[1:]

# Delete wrong data.
# Explore row #4553: apart from the id, description, status and title,
# no other information is available for it, hence the row is dropped.
print("Entry at index 4553:")
explore_data(movies, 4553, 4554)
del movies[4553]

# Using explore_data() with appropriate parameters, view the details of
# the first 5 movies.
print("First 5 Entries:")
explore_data(movies, 0, 5, True)

# The dataset might have more than one entry for a movie. Call
# duplicate_and_unique_movies() with the index of the name to check this.
duplicate_and_unique_movies(movies, 13)

# Some movies have multiple entries.
# Create a dictionary 'reviews_max' with the name of the movie as key and
# the maximum number of reviews seen for that name as value.
reviews_max = {}
for movie in movies:
    name = movie[13]
    n_reviews = float(movie[12])
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

# Create a list 'movies_clean' that filters out the duplicate movies: for
# each name, keep only the first row whose number of reviews equals the
# maximum stored in 'reviews_max'.
movies_clean = []
# Names already copied into 'movies_clean'; a set keeps the membership
# check constant-time, and it guards against ties where two rows for the
# same name share the maximum number of reviews.
already_added = set()
for movie in movies:
    name = movie[13]
    n_reviews = float(movie[12])
    if reviews_max[name] == n_reviews and name not in already_added:
        movies_clean.append(movie)
        already_added.add(name)

# Calling movies_lang(), extract all the English movies and store them in
# 'movies_en'.
movies_en = movies_lang(movies_clean, 3, 'en')

# Call rate_bucket() to see the movies with a rating of 8 or higher.
high_rated_movies = rate_bucket(movies_en, 8, 10)