From 0ca1b28d6041ce965a3969f9d529578d7c07be83 Mon Sep 17 00:00:00 2001
From: "Pavel N. Krivitsky"
Date: Thu, 9 May 2024 23:16:55 +1000
Subject: [PATCH] A number of optimisations:

* Use of global variables has been minimised, and most of the "main" code has been moved to the main() function.
* The code now saves the embeddings and loads them if already present, instead of refitting every time.
* Removed redundant loading of the data.
* User and Item IDs are now converted to strings for consistency.
* Try to use some Pandas capabilities to speed up processing.
* Statements, functions, and other components that have no effect or are unused have been removed.
---
 MCRS_GAT.py | 203 ++++++++++++++++++++-------------------------------
 1 file changed, 76 insertions(+), 127 deletions(-)

diff --git a/MCRS_GAT.py b/MCRS_GAT.py
index 4c34ddc..76386cf 100644
--- a/MCRS_GAT.py
+++ b/MCRS_GAT.py
@@ -8,39 +8,23 @@
 from sklearn.model_selection import train_test_split
 from torch_geometric.data import Data
 import pandas as pd
-from sklearn.model_selection import train_test_split
-from torch_geometric.nn import GATConv
 from sklearn.metrics import mean_absolute_error, mean_squared_error
 from sklearn.svm import SVR

-def read_data(file_path, criteria):
+def read_data(file_path):
     data = pd.read_excel(file_path)
-    user_id = data['User_ID']
-    item_id = data['Items_ID']
+    data['User_ID'] = user_id = data['User_ID'].astype(str)
+    data['Items_ID'] = item_id = data['Items_ID'].astype(str)
     user_id_map = {uid: i for i, uid in enumerate(user_id.unique())}
     num_users = len(user_id_map)
     item_id_map = {mid: i + num_users for i, mid in enumerate(item_id.unique())}
-    num_items = len(item_id_map)
-
-    num_criteria = len(criteria)
-    base_ground_truth_ratings = np.zeros((num_users, num_items, num_criteria), dtype=np.int32)
-    for i, row in data.iterrows():
-        uid = row['User_ID']
-        mid = row['Items_ID']
-        criterion_ratings = [row[criterion] for criterion in criteria]
-        if uid in user_id_map and mid in item_id_map:
-            user_idx = user_id_map[uid]
-            item_idx = item_id_map[mid] - num_users
-            base_ground_truth_ratings[user_idx, item_idx] = criterion_ratings
+    return data, user_id_map, item_id_map

-    return user_id_map, item_id_map, base_ground_truth_ratings
-
-def L_BGNN(file_path, criteria, user_id_map, item_id_map):
-    graph_data = pd.read_excel(file_path)
+def L_BGNN(data, criteria, user_id_map, item_id_map):
     matrices = []  # Initialize a list to store the normalized matrices for each criterion
     n_nodes = len(user_id_map) + len(item_id_map)

@@ -48,10 +32,10 @@ def L_BGNN(file_path, criteria, user_id_map, item_id_map):
         # TODO: Check if this should be a sparse matrix.
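         # A sparse variant (sketch only, not applied by this patch; assumes
         # SciPy were added as a dependency):
         #
         #     from scipy.sparse import lil_matrix
         #     adj_matrix = lil_matrix((n_nodes, n_nodes), dtype=np.int32)
         #
         # lil_matrix supports the element-wise writes used below (written as
         # adj_matrix[uid, mid] = rating) and can be converted with .tocsr()
         # once filled.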
         adj_matrix = np.zeros((n_nodes, n_nodes), dtype=np.int32)
-        for i in range(len(graph_data)):
-            uid = user_id_map[graph_data['User_ID'][i]]
-            mid = item_id_map[graph_data['Items_ID'][i]]
-            rating = graph_data[criterion][i]
+        for i in range(len(data)):
+            uid = user_id_map[data['User_ID'][i]]
+            mid = item_id_map[data['Items_ID'][i]]
+            rating = data[criterion][i]
             adj_matrix[uid][mid] = rating
             adj_matrix[mid][uid] = rating

@@ -124,7 +108,7 @@ def forward(self, x, edge_index):
         return x

     def fusion_embeddings_vectors(self, embeddings_list):  # Add self parameter
-        max_size = max([embedding.size(0) for embedding in embeddings_list])
+        max_size = max(embedding.size(0) for embedding in embeddings_list)

         # Pad embeddings to the maximum size
         padded_embeddings = [F.pad(embedding, (0, 0, 0, max_size - embedding.size(0))) for embedding in embeddings_list]

@@ -238,40 +222,20 @@ def train_GAT(self, optimizer, data, embeddings_list, alpha=0.5, beta=0.5, gamma

 # -------------Recommendation Section -------------------------

-def create_ground_truth_ratings(file_path, criteria):
-    # data = pd.read_excel(file_path)
-
-    # Create a mapping from user/item IDs to unique integer indices
-    user_id_map = {uid: i for i, uid in enumerate(data['User_ID'].unique())}
-    item_id_map = {mid: i for i, mid in enumerate(data['Items_ID'].unique())}
-
-    num_users = len(user_id_map)
-    num_items = len(item_id_map)
-    ground_truth_ratings_matrix = np.zeros((num_users, num_items, 1), dtype=np.float32)
-
-    for _, row in data.iterrows():
-        uid = row['User_ID']
-        mid = row['Items_ID']
-        overall_rating = row['Overall_Rating']
-
-        if uid in user_id_map and mid in item_id_map:
-            user_idx = user_id_map[uid]
-            item_idx = item_id_map[mid]
-            ground_truth_ratings_matrix[user_idx, item_idx, 0] = overall_rating
-
-    return data, ground_truth_ratings_matrix, user_id_map, item_id_map
-
 def Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_func=None, top_k=1):
     recommendations_f_items = {}
+    num_users = len(user_id_map)

-    # Convert fused embeddings to numpy array
-    fused_embeddings_np = fused_embeddings.cpu().detach().numpy()
+    # Convert fused embeddings to numpy array, focusing on users
+    fused_embeddings_np = fused_embeddings.cpu().detach().numpy()[:num_users]

     # Compute similarities between embeddings
     similarities = cosine_similarity(fused_embeddings_np)

-    # Iterate over all users
-    for user_idx, user_id in enumerate(user_id_map.keys()):
+    # Iterate over all users in order of index
+    grouped = data.groupby('User_ID')
+    uids = sorted(user_id_map.items(), key=lambda x: x[1])
+    for user_id, user_idx in uids:
         # Determine threshold value using threshold_func
         if threshold_func is not None:
             threshold_A = threshold_func(fused_embeddings[user_idx]).item()

@@ -291,27 +255,23 @@ def Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_func=None, top_k=1):
             recommended_items = []

             # Retrieve the current user's rating from the data
-            user_data = data[data['User_ID'] == user_id]
+            user_data = grouped.get_group(user_id)
             if len(user_data) > 0:  # Check if there are ratings for this user
                 current_user_rating = user_data['Overall_Rating'].values[0]

                 # Get recommended items for the user
                 for user_idx_2 in similar_users_sorted_idx:
-                    if user_idx_2 >= len(user_id_map.keys()):
-                        continue  # Skip if index is out of range
-                    user_id_2 = list(user_id_map.keys())[user_idx_2]
-                    for _, row in data[data['User_ID'] == user_id_2].iterrows():
+                    user_id_2 = uids[user_idx_2][0]
+                    for _, row in grouped.get_group(user_id_2).iterrows():
                         item_id = row['Items_ID']
                         overall_rating = row['Overall_Rating']

-                    # Check if overall rating is similar to the current user's rating
-                    if abs(overall_rating - current_user_rating) <= threshold_A:
+                        # Check if the overall rating is similar to the
+                        # current user's rating and filter out items
+                        # already rated by the current user
+                        if item_id not in user_data['Items_ID'].values and abs(overall_rating - current_user_rating) <= threshold_A:
                             recommended_items.append({'item_id': item_id, 'Overall_Rating': overall_rating})

-            # Filter out items already rated by the current user
-            recommended_items = [item for item in recommended_items if
-                                 item['item_id'] not in user_data['Items_ID'].values]
-
             # Sort recommended items by overall rating
             recommended_items = sorted(recommended_items, key=lambda x: x['Overall_Rating'], reverse=True)[:top_k]

@@ -325,13 +285,7 @@ def Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_func=None, top_k=1):
     return recommendations_f_items

-def split_and_save_data(file_path, criteria, test_size=0.2, random_state=42):
-    # Call the read_data function to obtain the data structures
-    user_id_map, item_id_map, base_ground_truth_ratings = read_data(file_path, criteria)
-
-    # Read the data from the file
-    data = pd.read_excel(file_path)
-
+def split_and_save_data(data, output_path=None, test_size=0.2, random_state=42):
     # Convert User_ID and Items_ID columns to string type
     data['User_ID'] = data['User_ID'].astype(str)
     data['Items_ID'] = data['Items_ID'].astype(str)

@@ -342,9 +296,10 @@ def split_and_save_data(file_path, criteria, test_size=0.2, random_state=42):
     # Split the data into train and test sets
     train_data, test_data = train_test_split(data_subset, test_size=test_size, random_state=random_state)

-    # Define file paths for train and test data
-    train_file_path = os.path.join(os.path.dirname(file_path), 'train_data.xlsx')
-    test_file_path = os.path.join(os.path.dirname(file_path), 'test_data.xlsx')
-
-    # Save the train and test sets into separate files
-    train_data.to_excel(train_file_path, index=False)
-    test_data.to_excel(test_file_path, index=False)
+    if output_path:
+        # Define file paths for train and test data
+        train_file_path = os.path.join(output_path, 'train_data.xlsx')
+        test_file_path = os.path.join(output_path, 'test_data.xlsx')
+
+        # Save the train and test sets into separate files
+        train_data.to_excel(train_file_path, index=False)
+        test_data.to_excel(test_file_path, index=False)

@@ -352,9 +307,9 @@ def split_and_save_data(file_path, criteria, test_size=0.2, random_state=42):
     return train_data, test_data

-def evaluate_RS_Model(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, random_state=42):
+def evaluate_RS_Model(fused_embeddings, user_id_map, item_id_map, data, output_path, test_size=0.2, random_state=42):
     # Split and save the data into train and test sets
-    train_data, test_data = split_and_save_data(file_path, criteria, test_size=test_size, random_state=random_state)
+    train_data, test_data = split_and_save_data(data, output_path, test_size=test_size, random_state=random_state)

     # Prepare training data
     train_X = fused_embeddings.cpu().detach().numpy()[train_data['User_ID'].astype('category').cat.codes]

@@ -405,7 +360,6 @@ def threshold_function(embedding):
     # Prepare test data
     test_user_ids = test_data['User_ID'].values.astype(str)
-    test_item_ids = test_data['Items_ID'].values.astype(str)

     # Ensure all user IDs in test data are present in user_id_map
     for user_id in test_user_ids:

@@ -438,7 +392,7 @@ def threshold_function(embedding):
     return test_mae, test_rmse

-def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, num_runs=30):
+def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, item_id_map, data, output_path,
+                                    test_size=0.2, num_runs=30):
     # Lists to store MAE and RMSE values from each run
     mae_values = []
     rmse_values = []

@@ -446,7 +400,7 @@ def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, num_runs=30):
     # Perform specified number of runs of the function and collect MAE and RMSE values
     for i in range(num_runs):
         print("Run", i+1)
-        mae, rmse = evaluate_RS_Model(fused_embeddings, user_id_map, file_path, criteria, test_size=test_size, random_state=i)
+        mae, rmse = evaluate_RS_Model(fused_embeddings, user_id_map, item_id_map, data, output_path, test_size=test_size, random_state=i)
         mae_values.append(mae)
         rmse_values.append(rmse)

@@ -471,6 +425,47 @@ def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, num_runs=30):

 # ---------------------Main Function ---------------------------

+def main(file_path, criteria, save_embeddings=False):
+    # Read data for the selected dataset
+    print("Reading data...")
+    data, user_id_map, item_id_map = read_data(file_path)
+    print("Reading data finished.")
+
+    if save_embeddings and not isinstance(save_embeddings, str):
+        save_embeddings = file_path + '.embed.pt'
+
+    if save_embeddings and os.path.isfile(save_embeddings):
+        embeddings_loaded = True
+        print("Loading embeddings...")
+        fused_embeddings = torch.load(save_embeddings)
+        print("Loading embeddings finished.")
+    else:
+        embeddings_loaded = False
+        print("Constructing sociomatrices...")
+        matrices = L_BGNN(data, criteria, user_id_map, item_id_map)
+        print("Constructing sociomatrices finished.")
+
+        #---Attention Embedding------
+        print("Constructing model...")
+        model = GAT(in_channels=16, out_channels=256)
+        print("Constructing model finished.")
+
+        print("Generating embeddings...")
+        fused_embeddings = model.Multi_Embd(matrices, num_epochs=100, learning_rate=0.01)
+        print("Generating embeddings finished.")
+
+    if save_embeddings and not embeddings_loaded:
+        print("Saving embeddings...")
+        torch.save(fused_embeddings, save_embeddings)
+        print("Saving embeddings finished.")
+
+    # Generate recommendations and evaluate the model
+    print("Evaluating...")
+    Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_func=None, top_k=1)
+    evaluate_RS_Model(fused_embeddings, user_id_map, item_id_map, data, os.path.dirname(file_path), test_size=0.2, random_state=42)
+    evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, item_id_map, data, os.path.dirname(file_path), test_size=0.2, num_runs=30)
+
+
 if __name__ == "__main__":

     # Define your file paths for different datasets in Katana Server

@@ -495,52 +490,6 @@ def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, num_runs=30):
     }

     # Define the dataset to run
-    dataset_to_run = 'BeerAdvocate'
-
-    # Read data for the selected dataset
-
-    file_path = file_paths[dataset_to_run]
-    criteria = criteria_mapping[dataset_to_run]
-    user_id_map, item_id_map, base_ground_truth_ratings = read_data(file_path, criteria)
-    num_users = len(user_id_map)
-    num_items = len(item_id_map)
-    num_criteria = len(criteria)
-    data = pd.read_excel(file_path)
-
-    # Read data from the Excel file and create ID mappings
-    matrices = L_BGNN(file_path, criteria, user_id_map, item_id_map)
-
-    #---Attention Embedding------
-    model = GAT(in_channels=16, out_channels=256)
-    fused_embeddings = model.Multi_Embd(matrices, num_epochs=100, learning_rate=0.01)
-
-    # Recommendation section
-    num_samples = fused_embeddings.shape[0]
-
-    # Create an instance of the MultiCriteriaRecommender class
-    output_dim = fused_embeddings.shape[1]  # Set output_dim to the number of criteria
+    DATASET_TO_RUN = 'BeerAdvocate'

-    # Convert fused_embeddings to a torch tensor
-    fused_embeddings_tensor = fused_embeddings.clone().detach().to(torch.float32)
-
-    # Reshape fused_embeddings_tensor to match the expected shape
-    num_samples, num_features = fused_embeddings_tensor.shape
-
-    # Calculate the total number of features per criterion
-    num_features_per_criterion = num_features // num_criteria
-
-    # Create a DataFrame with user and item identifiers as MultiIndex
-    df_users_items = pd.DataFrame(index=pd.MultiIndex.from_tuples([(user_id, item_id) for user_id in user_id_map.keys() for item_id in item_id_map.keys()]))
-
-    # Call the create_real_ratings function
-    data, ground_truth_ratings_matrix, user_id_map, item_id_map = create_ground_truth_ratings(file_path, criteria)
-
-    # Define the threshold function
-    def threshold_function(embedding):
-        return torch.tensor(0.1)
-
-    # Call the function with the defined threshold function
-    recommendations = Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_func=None, top_k=1)
-    train_data, test_data = split_and_save_data(file_path, criteria)
-    test_mae, test_rmse = evaluate_RS_Model(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, random_state=42)
-    mae_std, rmse_std = evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, num_runs=30)
+    main(file_paths[DATASET_TO_RUN], criteria_mapping[DATASET_TO_RUN], save_embeddings=True)
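
For reference, the embedding cache introduced in main() boils down to the
following pattern (a minimal sketch; the helper name `cached` and the
`fit_fn` argument are illustrative, while os.path.isfile, torch.load, and
torch.save are the calls the patch actually uses):

    import os
    import torch

    def cached(cache_path, fit_fn):
        # Reuse a previously saved result if the cache file exists;
        # otherwise compute it and save it for the next run.
        if os.path.isfile(cache_path):
            return torch.load(cache_path)
        result = fit_fn()
        torch.save(result, cache_path)
        return result

    # Example, mirroring main()'s default cache path:
    # fused_embeddings = cached(file_path + '.embed.pt',
    #                           lambda: model.Multi_Embd(matrices, num_epochs=100, learning_rate=0.01))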