Recommender system using collaborative filtering
- A content-based approach requires a good amount of information about items' own features, rather than relying on users' interactions and feedback.
- Collaborative filtering, on the other hand, needs nothing except users' historical preferences on a set of items. Because it is based on historical data, the core assumption is that users who have agreed in the past tend to also agree in the future.
- It depends mainly on the user's history (core idea: people prefer the things they are already used to).
- Explicit rating: e.g., 5 stars for Titanic explicitly shows how much a user likes this kind of movie.
- Implicit rating: suggests a user's preference indirectly, such as page views, clicks, purchase records, whether or not a music track was listened to, and so on (see the toy illustration after this list).
- user-based CF
- item-based CF
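A toy illustration of the two feedback types (hypothetical data, for intuition only):

```python
import numpy as np

# Explicit feedback: 1-5 star ratings, 0 = not rated.
explicit = np.array([[5, 0, 3],
                     [0, 4, 0]])

# Implicit feedback: often reduced to a binary signal such as
# clicked / played / purchased vs. no interaction.
implicit = (explicit > 0).astype(int)
```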
Initialisation
- Build an n × m matrix that stores each user's ratings of each item.
- Parameter breakdown:
    - n × m matrix of ratings
    - u_i : represents user i
    - p_j : represents item j
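As a minimal sketch (assuming a pandas DataFrame `ratings_df` with columns `user_id`, `item_id`, `rating`; the names are illustrative, not necessarily the project's), the matrix can be built with a pivot:

```python
import pandas as pd

ratings_df = pd.DataFrame({
    'user_id': [1, 1, 2, 3],
    'item_id': [1, 3, 2, 1],
    'rating':  [5, 3, 4, 2],
})
# Rows = users (u_i), columns = items (p_j), 0 = not rated.
R = ratings_df.pivot_table(index='user_id', columns='item_id',
                           values='rating', fill_value=0)
```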
- Calculation and prediction
- User-based CF
  The main process is to calculate the similarity between the target user and all other users, select the top X most similar users, and take the weighted average of those X users' ratings, using the similarities as weights.
  Different people have different baselines when giving ratings: some generally give full scores while others are pretty strict. So, to avoid this bias, we subtract each user's average rating over all items when calculating the weighted average, and add the target user's average back, as below:
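In formula form (the standard mean-centered prediction, reconstructed from the description above; $\bar{r}_u$ is user $u$'s average rating and $\mathrm{sim}(u, v)$ the user-user similarity):

$$\hat{r}_{u,p} = \bar{r}_u + \frac{\sum_{v \in \mathrm{top}X(u)} \mathrm{sim}(u, v)\,(r_{v,p} - \bar{r}_v)}{\sum_{v \in \mathrm{top}X(u)} \left|\mathrm{sim}(u, v)\right|}$$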
There are two common ways to calculate similarity:
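The two usual choices are cosine similarity and Pearson correlation (this project uses cosine, as shown below; Pearson is the standard mean-centered alternative):

$$\mathrm{sim}_{\cos}(u, v) = \frac{\sum_p r_{u,p}\, r_{v,p}}{\sqrt{\sum_p r_{u,p}^2}\,\sqrt{\sum_p r_{v,p}^2}} \qquad \mathrm{sim}_{\mathrm{pearson}}(u, v) = \frac{\sum_p (r_{u,p} - \bar{r}_u)(r_{v,p} - \bar{r}_v)}{\sqrt{\sum_p (r_{u,p} - \bar{r}_u)^2}\,\sqrt{\sum_p (r_{v,p} - \bar{r}_v)^2}}$$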
- Recommend top-K (predicts the top-12 recommended movies): `python3 Recommender.py`
- Predict a movie score: `python3 Predict_Score.py`
- Evaluate: `python3 evaluate.py`
  This outputs 3 figures: K-RMSE and K-Precision, Weight-RMSE and Weight-Precision for the user-based model, and Weight-RMSE and Weight-Precision for the item-based model.
```python
import numpy as np

# Build the user-item rating matrices from the train/test splits
# (n_users and n_items come from the dataset).
train_data_matrix = np.zeros((n_users, n_items))
test_data_matrix = np.zeros((n_users, n_items))
for row in X_train.itertuples():
    train_data_matrix[row[1] - 1, row[2] - 1] = row[3]
for row in X_test.itertuples():
    test_data_matrix[row[1] - 1, row[2] - 1] = row[3]
```
In our project we use cosine similarity as the distance function. Note that `pairwise_distances` returns the cosine distance (1 − cosine similarity), so we convert it:
```python
from sklearn.metrics.pairwise import pairwise_distances

# pairwise_distances returns cosine distance, i.e. 1 - cosine similarity.
user_similarity = 1 - pairwise_distances(train_data_matrix, metric="cosine")
item_similarity = 1 - pairwise_distances(train_data_matrix.T, metric="cosine")
```
To predict the top-K similar movies:
```python
def KNN(items, ratings, item_similarity, keywords, k):
    '''
    :param items: MovieLens items table (pandas DataFrame)
    :param ratings: ratings table (pandas DataFrame)
    :param item_similarity: item-item similarity matrix
    :param keywords: movie name (or part of it)
    :param k: top-K
    :return: list of [movieName, movieSimilarity, meanRating]
    '''
    moveList = []
    # Find the id of the first movie whose title matches the keywords.
    movie_id = list(items[items['title'].str.contains(keywords)].item_id)[0]
    movie_similarity = item_similarity[movie_id - 1]
    # Sort by descending similarity and skip the movie itself.
    movie_similarity_index = np.argsort(-movie_similarity)[1:k + 1]
    for i in movie_similarity_index:
        list_mv = list(set(items[items['item_id'] == i + 1].title))
        list_mv.append(movie_similarity[i])
        list_mv.append(ratings[ratings['item_id'] == i + 1].rating.mean())
        moveList.append(list_mv)
    return moveList
```
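For example (assuming the title 'Toy Story' appears in the MovieLens items table; the title is illustrative):

```python
# Top-12 movies most similar to the matched title.
for title, sim, mean_rating in KNN(items, ratings, item_similarity, 'Toy Story', 12):
    print(f'{title}: similarity={sim:.3f}, mean rating={mean_rating:.2f}')
```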
To predict the scores using the user-based and item-based approaches:
```python
def predict(rating, similarity, base='user'):
    if base == 'user':
        # Mean-center each user's ratings to remove rating-scale bias,
        # then add the user's mean back after the weighted average.
        mean_user_rating = rating.mean(axis=1)
        rating_diff = rating - mean_user_rating[:, np.newaxis]
        pred = mean_user_rating[:, np.newaxis] + \
            similarity.dot(rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif base == 'item':
        pred = rating.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

def predict_rate(rating, similarity):
    # Same as the item-based branch of predict().
    pred = rating.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred
```
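A typical call, using the similarity matrices computed earlier:

```python
# Full predicted rating matrices for both approaches.
user_pred = predict(train_data_matrix, user_similarity, base='user')
item_pred = predict(train_data_matrix, item_similarity, base='item')
```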
A user-based method to predict a user's movie scores:
```python
def user_base_predict(testUser, topKUser):
    # Cosine distance between the test user and each of the top-K users.
    dist = pairwise_distances(testUser, topKUser, metric="cosine")
    # Turn distance into a similarity-like weight in (0, 1]:
    # identical users get weight 1, distant users get smaller weights.
    sim = 1 / (dist + 1)
    # Weighted average of the top-K users' ratings.
    pred = sim.dot(topKUser) / sim.sum()
    return pred
```
```python
# Get the cosine distance of testUser to every other user.
def get_similarity(testUser, allUser):
    return pairwise_distances(testUser, allUser, metric="cosine")
```
```python
# Get the matrix of the top-K most similar users.
def get_topK(matrix, similarity, k):
    # similarity has shape (1, n_other_users); take its only row.
    similarity = similarity[0]
    # Smallest cosine distances = most similar users.
    topK_index = similarity.argsort()[:k]
    topK_data_matrix = np.asarray([matrix[m] for m in topK_index])
    return topK_data_matrix
```
```python
# Predict every user's scores.
def predict_all(train_data_matrix, topK):
    predict = []
    for i in range(len(train_data_matrix)):
        testUser = [train_data_matrix[i]]
        # All users except the test user itself.
        if i == 0:
            allUser = train_data_matrix[i + 1:]
        elif i == (len(train_data_matrix) - 1):
            allUser = train_data_matrix[:i]
        else:
            allUp = train_data_matrix[:i]
            allDown = train_data_matrix[i + 1:]
            allUser = np.concatenate((allUp, allDown))
        s = get_similarity(testUser, allUser)
        # Index into allUser, since the similarities were computed against it.
        topKUser = get_topK(allUser, s, topK)
        prediction = user_base_predict(testUser, topKUser)
        predict.append(prediction)
    return np.asarray(predict)
```
```python
def predict_userMovieScore(predictall, userID):
    return predictall[userID - 1]

# RUN: predict the 1st user's scores using the top-10 most similar users.
y_predict = predict_all(train_data_matrix, 10)
predict_userMovieScore(y_predict, 1)
```
To calculate the RMSE over the rated (non-zero) entries:
```python
from math import sqrt
from sklearn.metrics import mean_squared_error

def rmse(prediction, ground_truth):
    # Only compare entries that actually have a ground-truth rating.
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))
```
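For example, scoring the user-based and item-based predictions from above against the held-out test matrix:

```python
print('User-based RMSE:', rmse(user_pred, test_data_matrix))
print('Item-based RMSE:', rmse(item_pred, test_data_matrix))
```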
To calculate precision, recall, coverage, and popularity:
```python
import math
from operator import itemgetter

def evaluate(train, prediction, item_popular, name):
    hit = 0
    rec_count = 0
    test_count = 0
    popular_sum = 0
    hit_pred = set()
    for u_index in range(n_users):
        # Only recommend items the user has not rated in the training set.
        items = np.where(train[u_index, :] == 0)[0]
        # Top-20 recommendations, sorted by predicted score.
        pre_items = sorted(
            dict(zip(items, prediction[u_index, items])).items(),
            key=itemgetter(1),
            reverse=True)[:20]
        test_items = np.where(test_data_matrix[u_index, :] > 0)[0]
        for item, _ in pre_items:
            if item in test_items:
                hit += 1
                hit_pred.add(item)
            if item in item_popular:
                popular_sum += math.log(1 + item_popular[item])
        rec_count += len(pre_items)
        test_count += len(test_items)
    precision = hit / (1.0 * rec_count)
    recall = hit / (1.0 * test_count)
    coverage = len(hit_pred) / (1.0 * len(item_popular))
    popularity = popular_sum / (1.0 * rec_count)
    return precision, recall, coverage, popularity
```
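`evaluate` relies on an `item_popular` dict that is built elsewhere in the project; a minimal sketch of one plausible construction (item index → number of training users who rated it; an assumption, not necessarily the project's exact definition):

```python
# Popularity = how many users rated each item in the training matrix.
item_popular = {
    i: int((train_data_matrix[:, i] > 0).sum())
    for i in range(n_items)
    if (train_data_matrix[:, i] > 0).any()
}
```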
Use SVD and calculate its RMSE and precision:
```python
from scipy.sparse.linalg import svds

def svd(train_data_matrix, n_users, n_items, test_data_matrix):
    svd_x = []
    svd_y = []
    precision_y = []
    for i in range(1, 50, 2):
        svd_x.append(i)
        # s holds the singular values; u is (n_users, k) and v is (k, n_items).
        u, s, v = svds(train_data_matrix, k=i)
        s_matrix = np.diag(s)
        # Reconstruct the rating matrix from the rank-k factors.
        pred_svd = np.dot(np.dot(u, s_matrix), v)
        svd_y.append(rmse(pred_svd, test_data_matrix))
        svd_precision, recall, coverage, popularity = evaluate(
            train_data_matrix, pred_svd, item_popular, 'svd')
        precision_y.append(svd_precision)
    return svd_x, svd_y, precision_y
```
Show the effect of changing K on RMSE and precision:
```python
import matplotlib.pyplot as plt

def plot_rmse(train_data_matrix, n_users, n_items, test_data_matrix):
    svd_x, svd_y, precision_y = svd(train_data_matrix, n_users, n_items, test_data_matrix)
    svd_rmse = plt.figure()
    # Two side-by-side panels: K vs RMSE and K vs precision.
    a = svd_rmse.add_subplot(121)
    b = svd_rmse.add_subplot(122)
    a.set(xlim=[0, 50], ylim=[2.6, 3.5], title='k-rmse',
          xlabel='value of k', ylabel='RMSE')
    a.plot(svd_x, svd_y, color='red')
    b.set(xlim=[0, 50], ylim=[0.1, 0.3], title='k-precision',
          xlabel='value of k', ylabel='Precision')
    b.plot(svd_x, precision_y, color='red')
    plt.show()
```
Show the effect of changing the weight on RMSE and precision:
```python
def plot_item(weight, item_y, precision_item):
    item_rmse = plt.figure()
    # Two side-by-side panels: weight vs RMSE and weight vs precision.
    e = item_rmse.add_subplot(121)
    f = item_rmse.add_subplot(122)
    e.set(title='weight-rmse', xlabel='value of weight', ylabel='RMSE')
    e.plot(weight, item_y)
    f.set(title='weight-precision', xlabel='value of weight', ylabel='Precision')
    f.plot(weight, precision_item)
    plt.show()
```