-
Notifications
You must be signed in to change notification settings - Fork 5
/
utils.py
53 lines (40 loc) · 2.18 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import math
import numpy as np
# Seed NumPy's global RNG at import time so that np.random-based helpers in
# this module (e.g. the item permutation in split_data) are reproducible.
np.random.seed(42)
def create_maps(dataset):
    """Build external<->internal id lookup tables for users and items.

    The first column of ``dataset`` is treated as the user id and the second
    column as the item id. Internal ids are consecutive integers starting at
    0, assigned in order of first appearance in the respective column.

    Args:
        dataset: DataFrame whose first two columns hold user and item ids.

    Returns:
        A 4-tuple ``(ext2int_user_map, int2ext_user_map, ext2int_item_map,
        int2ext_item_map)`` of dictionaries.
    """
    # Compute the distinct values once per column; both directions of each
    # mapping are derived from the same ordering.
    users = dataset.iloc[:, 0].unique()
    items = dataset.iloc[:, 1].unique()

    int2ext_user_map = dict(enumerate(users))
    ext2int_user_map = {ext: idx for idx, ext in int2ext_user_map.items()}
    int2ext_item_map = dict(enumerate(items))
    ext2int_item_map = {ext: idx for idx, ext in int2ext_item_map.items()}
    return ext2int_user_map, int2ext_user_map, ext2int_item_map, int2ext_item_map
def splitting(dataset, ratio=0.2):
    """Split each user's interactions chronologically into train and test.

    For a user with ``n`` rows, the ``floor(n * (1 - ratio))`` rows with the
    smallest ``timestamp`` go to the training set and the remaining (most
    recent) rows go to the test set. Equal timestamps are ordered by their
    position in ``dataset``.

    The input DataFrame is not modified.

    Args:
        dataset: DataFrame with at least ``userId`` and ``timestamp`` columns.
        ratio: Fraction of each user's interactions reserved for testing.

    Returns:
        ``(train, test)`` DataFrames with the same columns as ``dataset`` and
        a fresh 0..n-1 index.
    """
    print("Performing splitting...")
    per_user = dataset.groupby('userId')
    # Number of (oldest) interactions each user contributes to training.
    user_threshold = np.floor(per_user.size() * (1 - ratio))
    # 1-based chronological position of every row within its user.
    rank_first = per_user['timestamp'].rank(method='first', ascending=True)
    # Vectorized lookup of each row's user threshold; the mask is kept local
    # so no helper columns are written to the caller's DataFrame.
    is_test = rank_first > dataset['userId'].map(user_threshold)
    test = dataset[is_test].reset_index(drop=True)
    train = dataset[~is_test].reset_index(drop=True)
    print("Done!")
    return train, test
def splitting_leave_one_out(dataset):
    """Hold out each user's most recent interaction as the test set.

    Rows are ranked per user by descending ``timestamp``; the row ranked
    first goes to the test set and all other rows go to the training set.
    When several rows share the latest timestamp, the one appearing first in
    ``dataset`` is held out.

    The input DataFrame is not modified.

    Args:
        dataset: DataFrame with at least ``userId`` and ``timestamp`` columns.

    Returns:
        ``(train, test)`` DataFrames with the same columns as ``dataset`` and
        a fresh 0..n-1 index.
    """
    print("Performing splitting...")
    # NOTE: no ``axis`` argument here -- a per-group Series has only axis 0,
    # and passing axis=1 is rejected/deprecated by pandas.
    rank_first = dataset.groupby('userId')['timestamp'].rank(
        method='first', ascending=False)
    is_test = rank_first <= 1
    test = dataset[is_test].reset_index(drop=True)
    train = dataset[~is_test].reset_index(drop=True)
    return train, test
def dataframe_to_dict(data):
    """Convert a ratings DataFrame into a nested ``{user: {item: rating}}`` dict.

    Users appear in the result in order of first appearance in ``data``. If a
    user has several rows for the same ``movieId``, the last row wins.

    Args:
        data: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.

    Returns:
        Dictionary mapping each user id to a dictionary of that user's
        ``movieId -> rating`` pairs.
    """
    # A single groupby pass replaces re-filtering the whole frame per user;
    # sort=False keeps users in first-appearance order.
    return {
        user: dict(zip(rows['movieId'], rows['rating']))
        for user, rows in data.groupby('userId', sort=False)
    }
def split_data(data, n_organizations):
    """Partition ``data`` into per-organization frames by item.

    The distinct values of the second column are shuffled with NumPy's global
    RNG and divided into ``n_organizations`` nearly equal chunks; each
    returned frame holds the rows whose second-column value falls in the
    corresponding chunk.

    Args:
        data: DataFrame whose second column holds the item ids.
        n_organizations: Number of partitions to produce.

    Returns:
        List of ``n_organizations`` DataFrames.
    """
    items = data.iloc[:, 1]
    shuffled_items = np.random.permutation(items.unique())
    item_chunks = np.array_split(shuffled_items, n_organizations)
    return [data[items.isin(chunk)] for chunk in item_chunks]