-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGroupRecommendation.py
171 lines (144 loc) · 8.47 KB
/
GroupRecommendation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# User-based Collaborative Filtering Group Recommendations
# Faiz Muhammad
import numpy as np
import pandas as pd
from scipy import stats
'''
This function gets the top k most similar users, together with their similarity scores, using Pearson's correlation coefficient
'''
def get_similarUsers(u_index, ui_matrix, topk_users=30):
    """Return the users most similar to one user, by Pearson correlation.

    Every other row of ``ui_matrix`` is compared with row ``u_index`` using
    only the columns in which both rows hold a rating (non-NaN). Rows that
    share three or fewer such columns with the target row are skipped.

    Args:
        u_index: Positional row index of the target user in ``ui_matrix``.
        ui_matrix: DataFrame with one row per user and one column per item,
            NaN marking a missing rating.
        topk_users: Maximum number of neighbours to return.

    Returns:
        A list of ``(row_index, similarity)`` tuples, most similar first,
        holding at most ``topk_users`` entries. It is empty when
        ``topk_users`` is zero or negative.
    """
    if topk_users <= 0:
        # A slice of the form [-0:] would return every candidate, not none.
        return []

    ratings = ui_matrix.to_numpy(dtype=float)  # convert once, not once per row
    target = ratings[u_index]
    target_rated = ~np.isnan(target)

    sim = []
    for other_index, other in enumerate(ratings):
        if other_index == u_index:
            continue
        shared = target_rated & ~np.isnan(other)  # items rated by both users
        if np.count_nonzero(shared) <= 3:
            continue  # too few co-rated items for the correlation to be used
        r, _ = stats.mstats.pearsonr(target[shared], other[shared])
        if np.isnan(r):
            r = -2  # sentinel below the valid range [-1, 1], so it sorts last
        sim.append((other_index, r))

    # Ascending sort followed by a reversal keeps the original ordering of ties.
    sim.sort(key=lambda pair: pair[1])
    return list(reversed(sim[-topk_users:]))
'''
This function returns the movie indices along with their score for a particular user identified by u_index
'''
def pred(u_index, ui_matrix, sim_matrix, topk_recommendations=20):
    """Predict ratings for the items a user has not rated yet.

    For every item without a rating in row ``u_index``, the mean-centred
    ratings of the neighbours who rated that item are weighted by their
    similarity, summed, and divided by the sum of all similarities in
    ``sim_matrix``. If that offset is positive, the prediction is the user's
    own mean rating plus the offset; otherwise the item gets the fixed
    score ``1``.

    Args:
        u_index: Positional row index of the target user in ``ui_matrix``.
        ui_matrix: DataFrame with one row per user and one column per item,
            NaN marking a missing rating.
        sim_matrix: Iterable of ``(row_index, similarity)`` tuples describing
            the neighbours, as produced by ``get_similarUsers``.
        topk_recommendations: Maximum number of recommendations to return.

    Returns:
        A tuple ``(recommendations, user_rating)``. ``recommendations`` is a
        list of ``(item_index, score)`` tuples for previously unrated items,
        highest score first, holding at most ``topk_recommendations`` entries
        (empty when that limit is zero or negative). ``user_rating`` is a
        list with one entry per column of ``ui_matrix``: the original rating
        where one exists, the predicted score otherwise.
    """
    ratings = ui_matrix.to_numpy(dtype=float)  # one conversion instead of per-cell .iloc
    target = ratings[u_index]
    target_mean = np.nanmean(target)

    # NOTE(review): the denominator sums the similarity of every neighbour,
    # including those that did not rate the item and any -2 placeholder values.
    # Kept as-is so that predictions do not change.
    sim_sum = np.sum([similarity for _, similarity in sim_matrix])

    # Each neighbour's mean rating is the same for every item: compute it once.
    neighbour_means = {index: np.nanmean(ratings[index]) for index, _ in sim_matrix}

    item_pred = []
    user_rating = []  # original or predicted rating of every item, in column order
    for item, own_rating in enumerate(target):
        if not np.isnan(own_rating):
            user_rating.append(own_rating)  # already rated: keep the real rating
            continue

        weighted_offsets = [
            similarity * (ratings[index, item] - neighbour_means[index])
            for index, similarity in sim_matrix
            if not np.isnan(ratings[index, item])
        ]
        if sim_sum == 0:
            # No usable similarity weight: fall back instead of dividing by zero.
            offset = 0
        else:
            offset = np.sum(weighted_offsets) / sim_sum

        if offset > 0:
            score = target_mean + offset
        else:
            score = 1  # lowest possible rating
        item_pred.append((item, score))
        user_rating.append(score)

    if topk_recommendations <= 0:
        # A slice of the form [-0:] would return every item, not none.
        return [], user_rating

    # Ascending sort followed by a reversal keeps the original ordering of ties.
    item_pred.sort(key=lambda pair: pair[1])
    return list(reversed(item_pred[-topk_recommendations:])), user_rating
'''
Some movies have not been rated by any user. This function removes those movies from the movies dataframe
'''
def prune_movies_df(movies, ratings):
    """Remove from ``movies`` every movie that has no rating in ``ratings``.

    The rows are dropped from ``movies`` in place; the remaining rows keep
    their original index labels.

    Args:
        movies: DataFrame with a ``movieId`` column.
        ratings: DataFrame with a ``movieId`` column, one row per rating.

    Returns:
        The same ``movies`` object that was passed in, after pruning.
    """
    rated_movie_ids = ratings['movieId'].unique()
    # Index labels of the movies that appear in `movies` but in no rating row.
    unrated_labels = movies.index[~movies['movieId'].isin(rated_movie_ids)]
    movies.drop(unrated_labels, inplace=True)
    # Return the argument itself rather than a module-level name.
    return movies
'''
This function displays the item's information by using the index value of the item in list.
'''
def display_topk(movies, agg_list):
    """Print the title of every movie referenced in ``agg_list``.

    Args:
        movies: DataFrame with a ``title`` column.
        agg_list: Iterable of ``(item_index, score)`` tuples, where
            ``item_index`` is the position of the movie's row in ``movies``.
            Only the index is used; the score is ignored.
    """
    titles = movies['title']
    for index, _ in agg_list:
        # Positional lookup: the row labels of `movies` may have gaps (rows
        # are dropped by prune_movies_df), so a label lookup with `[index]`
        # would print the wrong title or raise KeyError.
        print(titles.iloc[index])
# main
# Folder that holds the MovieLens CSV files; make changes according to your path.
root = 'ml-latest-small'

# Load movies.csv and ratings.csv from that folder.
movies_df, ratings_df = (
    pd.read_csv(root + csv_suffix, sep=',')
    for csv_suffix in ('/movies.csv', '/ratings.csv')
)

# Drop the movies that nobody rated.
movies_df = prune_movies_df(movies_df, ratings_df)

# Reshape the ratings into a user x movie table (rows: userId, columns: movieId).
# Missing ratings stay NaN, and this NaN-bearing table is used as the user-item matrix.
user_item_matrix = ratings_df_reshaped = ratings_df.pivot(
    index='userId', columns='movieId', values='rating'
)
# Let's make a group of three users: 1, 473 and 511.
# These values are not chosen randomly: these users are very similar to each
# other. Say a group of friends who have the same taste in movies.
group_predictions = []  # one list of top-20 recommendations per group member
group_ratings = []  # original + predicted ratings of every item, one list per group member
group = [1, 473, 511]
for member in group:
    # Look the userId up in the row index of the matrix instead of assuming
    # that userIds run from 1 upwards without gaps.
    member_index = user_item_matrix.index.get_loc(member)
    sim = get_similarUsers(u_index=member_index, ui_matrix=user_item_matrix, topk_users=30)
    predictions, u_rating = pred(u_index=member_index, ui_matrix=user_item_matrix, sim_matrix=sim,
                                 topk_recommendations=20)
    group_predictions.append(predictions)
    group_ratings.append(u_rating)

# Print each member's recommendations; every heading after the first is
# preceded by a blank line.
for position, member_predictions in enumerate(group_predictions, start=1):
    heading = "Top 20 Predictions for User " + str(position)
    print(heading if position == 1 else "\n" + heading)
    display_topk(movies_df, member_predictions)
# Average Aggregation and Least Misery Aggregation:
# for each item in the recommendation lists of the group members, the group
# rating is calculated with the following formulas
#   Average Aggregation:      r*(g,i) = AVGu∈g {r*(u,i)}
#   Least Misery Aggregation: r*(g,i) = MINu∈g {r*(u,i)}
# where r*(u,i) is either the predicted rating of u for i, or r(u,i) if this
# rating is present in the data set.
r_mean_gi = []  # (item index, average score over the group)
r_min_gi = []  # (item index, minimum score over the group)
index_visted = set()  # items that have already been aggregated
for member_predictions in group_predictions:
    for index, _ in member_predictions:
        if index in index_visted:
            continue  # the item appears in several lists: aggregate it only once
        index_visted.add(index)
        # Score of this item for every member of the group, whatever its size.
        member_scores = [member_ratings[index] for member_ratings in group_ratings]
        r_mean_gi.append((index, np.mean(member_scores)))
        r_min_gi.append((index, min(member_scores)))

# Sort ascending, then take the last 20 and reverse them so the highest score comes first.
r_mean_gi.sort(key=lambda w: w[1])
r_min_gi.sort(key=lambda w: w[1])
top20_average_agg_list = list(reversed(r_mean_gi[-20:]))
top20_leastmisery_agg_list = list(reversed(r_min_gi[-20:]))

print("\nTop 20 Movies for the group using Average Aggregation Method:")
display_topk(movies_df, top20_average_agg_list)
print("\nTop 20 Movies for the group using Least Misery Aggregation Method:")
display_topk(movies_df, top20_leastmisery_agg_list)