From 0ca1b28d6041ce965a3969f9d529578d7c07be83 Mon Sep 17 00:00:00 2001
From: "Pavel N. Krivitsky"
Date: Thu, 9 May 2024 23:16:55 +1000
Subject: [PATCH] A number of optimisations

* Use of global variables has been minimised, and most of the "main" code
  has been moved to the main() function.
* The code will save the embedding and load it if already present instead
  of refitting every time.
* Removed redundant loading of the data.
* User and Item IDs are now converted to strings for consistency.
* Use some Pandas capabilities to speed up processing.
* Statements, functions, and other components that have no effect or are
  unused have been removed.
---
MCRS_GAT.py | 203 ++++++++++++++++++++--------------------------------
1 file changed, 76 insertions(+), 127 deletions(-)
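
Note for reviewers: the cache-or-fit flow added to main() follows the
pattern sketched below (a minimal, hypothetical illustration rather than
the patch code itself; fit_embeddings() stands in for the GAT pipeline):

    import os
    import torch

    def load_or_fit(cache_path, fit_embeddings):
        # Reuse a previously saved tensor if the cache file exists...
        if os.path.isfile(cache_path):
            return torch.load(cache_path)
        # ...otherwise fit from scratch and save for the next run.
        embeddings = fit_embeddings()
        torch.save(embeddings, cache_path)
        return embeddings
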
diff --git a/MCRS_GAT.py b/MCRS_GAT.py
index 4c34ddc..76386cf 100644
--- a/MCRS_GAT.py
+++ b/MCRS_GAT.py
@@ -8,39 +8,23 @@
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data
import pandas as pd
-from sklearn.model_selection import train_test_split
-from torch_geometric.nn import GATConv
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.svm import SVR
-def read_data(file_path, criteria):
+def read_data(file_path):
data = pd.read_excel(file_path)
- user_id = data['User_ID']
- item_id = data['Items_ID']
+ data['User_ID'] = user_id = data['User_ID'].astype(str)
+ data['Items_ID'] = item_id = data['Items_ID'].astype(str)
user_id_map = {uid: i for i, uid in enumerate(user_id.unique())}
num_users = len(user_id_map)
item_id_map = {mid: i + num_users for i, mid in enumerate(item_id.unique())}
- num_items = len(item_id_map)
-
- num_criteria = len(criteria)
- base_ground_truth_ratings = np.zeros((num_users, num_items, num_criteria), dtype=np.int32)
- for i, row in data.iterrows():
- uid = row['User_ID']
- mid = row['Items_ID']
- criterion_ratings = [row[criterion] for criterion in criteria]
- if uid in user_id_map and mid in item_id_map:
- user_idx = user_id_map[uid]
- item_idx = item_id_map[mid] - num_users
- base_ground_truth_ratings[user_idx, item_idx] = criterion_ratings
+ return data, user_id_map, item_id_map
- return user_id_map, item_id_map, base_ground_truth_ratings
-
-def L_BGNN(file_path, criteria, user_id_map, item_id_map):
- graph_data = pd.read_excel(file_path)
+def L_BGNN(data, criteria, user_id_map, item_id_map):
matrices = [] # Initialize a list to store the normalized matrices for each criterion
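+ # Users occupy node indices 0..len(user_id_map)-1; item indices follow
+ # them (item_id_map is offset by num_users in read_data()).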
n_nodes = len(user_id_map) + len(item_id_map)
@@ -48,10 +32,10 @@ def L_BGNN(file_path, criteria, user_id_map, item_id_map):
# TODO: Check if this should be a sparse matrix.
adj_matrix = np.zeros((n_nodes, n_nodes), dtype=np.int32)
- for i in range(len(graph_data)):
- uid = user_id_map[graph_data['User_ID'][i]]
- mid = item_id_map[graph_data['Items_ID'][i]]
- rating = graph_data[criterion][i]
+ for i in range(len(data)):
+ uid = user_id_map[data['User_ID'][i]]
+ mid = item_id_map[data['Items_ID'][i]]
+ rating = data[criterion][i]
adj_matrix[uid][mid] = rating
adj_matrix[mid][uid] = rating
@@ -124,7 +108,7 @@ def forward(self, x, edge_index):
return x
def fusion_embeddings_vectors(self, embeddings_list): # Add self parameter
- max_size = max([embedding.size(0) for embedding in embeddings_list])
+ max_size = max(embedding.size(0) for embedding in embeddings_list)
# Pad embeddings to the maximum size
padded_embeddings = [F.pad(embedding, (0, 0, 0, max_size - embedding.size(0))) for embedding in embeddings_list]
@@ -238,40 +222,20 @@ def train_GAT(self, optimizer, data, embeddings_list, alpha=0.5, beta=0.5, gamma
# -------------Recommendation Section -------------------------
-def create_ground_truth_ratings(file_path, criteria):
- # data = pd.read_excel(file_path)
-
- # Create a mapping from user/item IDs to unique integer indices
- user_id_map = {uid: i for i, uid in enumerate(data['User_ID'].unique())}
- item_id_map = {mid: i for i, mid in enumerate(data['Items_ID'].unique())}
-
- num_users = len(user_id_map)
- num_items = len(item_id_map)
- ground_truth_ratings_matrix = np.zeros((num_users, num_items, 1), dtype=np.float32)
-
- for _, row in data.iterrows():
- uid = row['User_ID']
- mid = row['Items_ID']
- overall_rating = row['Overall_Rating']
-
- if uid in user_id_map and mid in item_id_map:
- user_idx = user_id_map[uid]
- item_idx = item_id_map[mid]
- ground_truth_ratings_matrix[user_idx, item_idx, 0] = overall_rating
-
- return data, ground_truth_ratings_matrix, user_id_map, item_id_map
-
def Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_func=None, top_k=1):
recommendations_f_items = {}
+ num_users = len(user_id_map)
- # Convert fused embeddings to numpy array
- fused_embeddings_np = fused_embeddings.cpu().detach().numpy()
+ # Convert fused embeddings to numpy array, focusing on users
+ fused_embeddings_np = fused_embeddings.cpu().detach().numpy()[:num_users]
# Compute similarities between embeddings
similarities = cosine_similarity(fused_embeddings_np)
- # Iterate over all users
- for user_idx, user_id in enumerate(user_id_map.keys()):
+ # Iterate over all users in order of index
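+ # Group once up front; get_group() avoids re-filtering the whole
+ # DataFrame for every user.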
+ grouped = data.groupby('User_ID')
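+ # Sorting the (user_id, index) pairs by index makes uids[i] the pair for
+ # embedding row i, so similarity-matrix positions map back to user IDs.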
+ uids = sorted(user_id_map.items(), key=lambda x: x[1])
+ for user_id, user_idx in uids:
# Determine threshold value using threshold_func
if threshold_func is not None:
threshold_A = threshold_func(fused_embeddings[user_idx]).item()
@@ -291,27 +255,23 @@ def Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_fu
recommended_items = []
# Retrieve the current user's rating from the data
- user_data = data[data['User_ID'] == user_id]
+ user_data = grouped.get_group(user_id)
if len(user_data) > 0: # Check if there are ratings for this user
current_user_rating = user_data['Overall_Rating'].values[0]
# Get recommended items for the user
for user_idx_2 in similar_users_sorted_idx:
- if user_idx_2 >= len(user_id_map.keys()):
- continue # Skip if index is out of range
- user_id_2 = list(user_id_map.keys())[user_idx_2]
- for _, row in data[data['User_ID'] == user_id_2].iterrows():
+ user_id_2 = uids[user_idx_2][0]
+ for _, row in grouped.get_group(user_id_2).iterrows():
item_id = row['Items_ID']
overall_rating = row['Overall_Rating']
- # Check if overall rating is similar to the current user's rating
- if abs(overall_rating - current_user_rating) <= threshold_A:
+ # Check if overall rating is similar to the
+ # current user's rating and filter out items
+ # already rated by the current user
+ if item_id not in user_data['Items_ID'].values and abs(overall_rating - current_user_rating) <= threshold_A:
recommended_items.append({'item_id': item_id, 'Overall_Rating': overall_rating})
- # Filter out items already rated by the current user
- recommended_items = [item for item in recommended_items if
- item['item_id'] not in user_data['Items_ID'].values]
-
# Sort recommended items by overall rating
recommended_items = sorted(recommended_items, key=lambda x: x['Overall_Rating'], reverse=True)[:top_k]
@@ -325,13 +285,7 @@ def Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_fu
return recommendations_f_items
-def split_and_save_data(file_path, criteria, test_size=0.2, random_state=42):
- # Call the read_data function to obtain the data structures
- user_id_map, item_id_map, base_ground_truth_ratings = read_data(file_path, criteria)
-
- # Read the data from the file
- data = pd.read_excel(file_path)
-
+def split_and_save_data(data, output_path=None, test_size=0.2, random_state=42):
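+ # When output_path is given, the split is also written out there as
+ # train_data.xlsx / test_data.xlsx.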
# Convert User_ID and Items_ID columns to string type
data['User_ID'] = data['User_ID'].astype(str)
data['Items_ID'] = data['Items_ID'].astype(str)
@@ -342,9 +296,10 @@ def split_and_save_data(file_path, criteria, test_size=0.2, random_state=42):
# Split the data into train and test sets
train_data, test_data = train_test_split(data_subset, test_size=test_size, random_state=random_state)
- # Define file paths for train and test data
- train_file_path = os.path.join(os.path.dirname(file_path), 'train_data.xlsx')
- test_file_path = os.path.join(os.path.dirname(file_path), 'test_data.xlsx')
+ if output_path:
+ # Define file paths for train and test data
+ train_file_path = os.path.join(output_path, 'train_data.xlsx')
+ test_file_path = os.path.join(output_path, 'test_data.xlsx')
# Save the train and test sets into separate files
train_data.to_excel(train_file_path, index=False)
@@ -352,9 +307,9 @@ def split_and_save_data(file_path, criteria, test_size=0.2, random_state=42):
return train_data, test_data
-def evaluate_RS_Model(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, random_state=42):
+def evaluate_RS_Model(fused_embeddings, user_id_map, item_id_map, data, output_path, test_size=0.2, random_state=42):
# Split and save the data into train and test sets
- train_data, test_data = split_and_save_data(file_path, criteria, test_size=test_size, random_state=random_state)
+ train_data, test_data = split_and_save_data(data, output_path, test_size=test_size, random_state=random_state)
# Prepare training data
train_X = fused_embeddings.cpu().detach().numpy()[train_data['User_ID'].astype('category').cat.codes]
@@ -405,7 +360,6 @@ def threshold_function(embedding):
# Prepare test data
test_user_ids = test_data['User_ID'].values.astype(str)
- test_item_ids = test_data['Items_ID'].values.astype(str)
# Ensure all user IDs in test data are present in user_id_map
for user_id in test_user_ids:
@@ -438,7 +392,7 @@ def threshold_function(embedding):
return test_mae, test_rmse
-def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, num_runs=30):
+def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, item_id_map, data, output_path, test_size=0.2, num_runs=30):
# Lists to store MAE and RMSE values from each run
mae_values = []
rmse_values = []
@@ -446,7 +400,7 @@ def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, cr
# Perform specified number of runs of the function and collect MAE and RMSE values
for i in range(num_runs):
print("Run", i+1)
- mae, rmse = evaluate_RS_Model(fused_embeddings, user_id_map, file_path, criteria, test_size=test_size, random_state=i)
+ mae, rmse = evaluate_RS_Model(fused_embeddings, user_id_map, item_id_map, data, output_path, test_size=test_size, random_state=i)
mae_values.append(mae)
rmse_values.append(rmse)
@@ -471,6 +425,47 @@ def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, cr
# ---------------------Main Function ---------------------------
+def main(file_path, criteria, save_embeddings=False):
+ # Read data for the selected dataset
+ print("Reading data...")
+ data, user_id_map, item_id_map = read_data(file_path)
+ print("Reading data finished.")
+
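+ # A truthy non-string value selects a default cache path next to the
+ # input file.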
+ if save_embeddings and not isinstance(save_embeddings, str):
+ save_embeddings = file_path + '.embed.pt'
+
+ if save_embeddings and os.path.isfile(save_embeddings):
+ embeddings_loaded = True
+ print("Loading embeddings...")
+ fused_embeddings = torch.load(save_embeddings)
+ print("Loading embeddings finished...")
+ else:
+ embeddings_loaded = False
+ print("Constricting sociomatrices...")
+ matrices = L_BGNN(data, criteria, user_id_map, item_id_map)
+ print("Constricting sociomatrices finished.")
+
+ #---Attention Embedding------
+ print("Constricting model...")
+ model = GAT(in_channels=16, out_channels=256)
+ print("Constricting model finished.")
+
+ print("Generating embeddings...")
+ fused_embeddings = model.Multi_Embd(matrices, num_epochs=100, learning_rate=0.01)
+ print("Generating embeddings finished.")
+
+ if save_embeddings and not embeddings_loaded:
+ print("Saving embeddings...")
+ torch.save(fused_embeddings, save_embeddings)
+ print("Saving embeddings finished...")
+
+ # Call the function with the defined threshold function
+ print("Evaluating...")
+ Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_func=None, top_k=1)
+ evaluate_RS_Model(fused_embeddings, user_id_map, item_id_map, data, os.path.dirname(file_path), test_size=0.2, random_state=42)
+ evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, item_id_map, data, os.path.dirname(file_path), test_size=0.2, num_runs=30)
+
+
if __name__ == "__main__":
# Define your file paths for different datasets in Katana Server
@@ -495,52 +490,6 @@ def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, cr
}
# Define the dataset to run
- dataset_to_run = 'BeerAdvocate'
-
- # Read data for the selected dataset
-
- file_path = file_paths[dataset_to_run]
- criteria = criteria_mapping[dataset_to_run]
- user_id_map, item_id_map, base_ground_truth_ratings = read_data(file_path, criteria)
- num_users = len(user_id_map)
- num_items = len(item_id_map)
- num_criteria = len(criteria)
- data = pd.read_excel(file_path)
-
- # Read data from the Excel file and create ID mappings
- matrices = L_BGNN(file_path, criteria, user_id_map, item_id_map)
-
- #---Attention Embedding------
- model = GAT(in_channels=16, out_channels=256)
- fused_embeddings = model.Multi_Embd(matrices, num_epochs=100, learning_rate=0.01)
-
- # Recommendation section
- num_samples = fused_embeddings.shape[0]
-
- # Create an instance of the MultiCriteriaRecommender class
- output_dim = fused_embeddings.shape[1] # Set output_dim to the number of criteria
+ DATASET_TO_RUN = 'BeerAdvocate'
- # Convert fused_embeddings to a torch tensor
- fused_embeddings_tensor = fused_embeddings.clone().detach().to(torch.float32)
-
- # Reshape fused_embeddings_tensor to match the expected shape
- num_samples, num_features = fused_embeddings_tensor.shape
-
- # Calculate the total number of features per criterion
- num_features_per_criterion = num_features // num_criteria
-
- # Create a DataFrame with user and item identifiers as MultiIndex
- df_users_items = pd.DataFrame(index=pd.MultiIndex.from_tuples([(user_id, item_id) for user_id in user_id_map.keys() for item_id in item_id_map.keys()]))
-
- # Call the create_real_ratings function
- data, ground_truth_ratings_matrix, user_id_map, item_id_map = create_ground_truth_ratings(file_path, criteria)
-
- # Define the threshold function
- def threshold_function(embedding):
- return torch.tensor(0.1)
-
- # Call the function with the defined threshold function
- recommendations = Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_func=None, top_k=1)
- train_data, test_data = split_and_save_data(file_path, criteria)
- test_mae, test_rmse=evaluate_RS_Model(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, random_state=42)
- mae_std, rmse_std=evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, num_runs=30)
+ main(file_paths[DATASET_TO_RUN], criteria_mapping[DATASET_TO_RUN], True)