A number of optimisations:
* Use of global variables has been minimised, and most of the "main" code has been moved to the main() function.
* The code now saves the embeddings and loads them if already present instead of refitting every time (see the sketch after this list).
* Removed redundant loading of the data.
* User and Item IDs are now converted to strings for consistency.
* Try to use some Pandas capabilities (e.g., groupby) to speed up processing.
* Statements, functions, and other components that have no effect or are unused have been removed.
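
For reference, the save/load behaviour described above boils down to a small load-or-fit pattern. A minimal sketch, not the committed code itself; compute_embeddings is a hypothetical stand-in for the model.Multi_Embd fitting step:

import os
import torch

def load_or_fit_embeddings(cache_path, compute_embeddings):
    # Reuse the previously saved tensor if the cache file exists...
    if os.path.isfile(cache_path):
        return torch.load(cache_path)
    # ...otherwise fit from scratch and cache the result for next time.
    embeddings = compute_embeddings()
    torch.save(embeddings, cache_path)
    return embeddings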
krivit committed May 9, 2024
1 parent 70bab98 commit 0ca1b28
Showing 1 changed file with 76 additions and 127 deletions.
203 changes: 76 additions & 127 deletions MCRS_GAT.py
@@ -8,50 +8,34 @@
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data
import pandas as pd
from sklearn.model_selection import train_test_split
from torch_geometric.nn import GATConv
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.svm import SVR


def read_data(file_path, criteria):
def read_data(file_path):

data = pd.read_excel(file_path)
user_id = data['User_ID']
item_id = data['Items_ID']
data['User_ID'] = user_id = data['User_ID'].astype(str)
data['Items_ID'] = item_id = data['Items_ID'].astype(str)

user_id_map = {uid: i for i, uid in enumerate(user_id.unique())}
num_users = len(user_id_map)
item_id_map = {mid: i + num_users for i, mid in enumerate(item_id.unique())}
num_items = len(item_id_map)

num_criteria = len(criteria)
base_ground_truth_ratings = np.zeros((num_users, num_items, num_criteria), dtype=np.int32)

for i, row in data.iterrows():
uid = row['User_ID']
mid = row['Items_ID']
criterion_ratings = [row[criterion] for criterion in criteria]
if uid in user_id_map and mid in item_id_map:
user_idx = user_id_map[uid]
item_idx = item_id_map[mid] - num_users
base_ground_truth_ratings[user_idx, item_idx] = criterion_ratings
return data, user_id_map, item_id_map

return user_id_map, item_id_map, base_ground_truth_ratings

def L_BGNN(file_path, criteria, user_id_map, item_id_map):
graph_data = pd.read_excel(file_path)
def L_BGNN(data, criteria, user_id_map, item_id_map):
matrices = [] # Initialize a list to store the normalized matrices for each criterion
n_nodes = len(user_id_map) + len(item_id_map)

for criterion in criteria:
# TODO: Check if this should be a sparse matrix.
adj_matrix = np.zeros((n_nodes, n_nodes), dtype=np.int32)

for i in range(len(graph_data)):
uid = user_id_map[graph_data['User_ID'][i]]
mid = item_id_map[graph_data['Items_ID'][i]]
rating = graph_data[criterion][i]
for i in range(len(data)):
uid = user_id_map[data['User_ID'][i]]
mid = item_id_map[data['Items_ID'][i]]
rating = data[criterion][i]

adj_matrix[uid][mid] = rating
adj_matrix[mid][uid] = rating
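
The TODO above flags the dense adjacency matrix as a candidate for a sparse representation. A hedged sketch of one option, assuming SciPy as an extra dependency (not part of this commit):

import numpy as np
from scipy.sparse import coo_matrix

def build_sparse_adjacency(data, criterion, user_id_map, item_id_map, n_nodes):
    # Map raw IDs to node indices and keep only the observed ratings.
    rows = data['User_ID'].map(user_id_map).to_numpy()
    cols = data['Items_ID'].map(item_id_map).to_numpy()
    vals = data[criterion].to_numpy()
    # Mirror each entry across the diagonal so the graph stays undirected.
    return coo_matrix(
        (np.concatenate([vals, vals]),
         (np.concatenate([rows, cols]), np.concatenate([cols, rows]))),
        shape=(n_nodes, n_nodes))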
@@ -124,7 +108,7 @@ def forward(self, x, edge_index):
return x

def fusion_embeddings_vectors(self, embeddings_list): # Add self parameter
max_size = max([embedding.size(0) for embedding in embeddings_list])
max_size = max(embedding.size(0) for embedding in embeddings_list)

# Pad embeddings to the maximum size
padded_embeddings = [F.pad(embedding, (0, 0, 0, max_size - embedding.size(0))) for embedding in embeddings_list]
@@ -238,40 +222,20 @@ def train_GAT(self, optimizer, data, embeddings_list, alpha=0.5, beta=0.5, gamma

# -------------Recommendation Section -------------------------

def create_ground_truth_ratings(file_path, criteria):
# data = pd.read_excel(file_path)

# Create a mapping from user/item IDs to unique integer indices
user_id_map = {uid: i for i, uid in enumerate(data['User_ID'].unique())}
item_id_map = {mid: i for i, mid in enumerate(data['Items_ID'].unique())}

num_users = len(user_id_map)
num_items = len(item_id_map)
ground_truth_ratings_matrix = np.zeros((num_users, num_items, 1), dtype=np.float32)

for _, row in data.iterrows():
uid = row['User_ID']
mid = row['Items_ID']
overall_rating = row['Overall_Rating']

if uid in user_id_map and mid in item_id_map:
user_idx = user_id_map[uid]
item_idx = item_id_map[mid]
ground_truth_ratings_matrix[user_idx, item_idx, 0] = overall_rating

return data, ground_truth_ratings_matrix, user_id_map, item_id_map

def Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_func=None, top_k=1):
recommendations_f_items = {}
num_users = len(user_id_map)

# Convert fused embeddings to numpy array
fused_embeddings_np = fused_embeddings.cpu().detach().numpy()
# Convert fused embeddings to numpy array, focusing on users
fused_embeddings_np = fused_embeddings.cpu().detach().numpy()[0:num_users,]

# Compute similarities between embeddings
similarities = cosine_similarity(fused_embeddings_np)

# Iterate over all users
for user_idx, user_id in enumerate(user_id_map.keys()):
# Iterate over all users in order of index
grouped = data.groupby('User_ID')
uids = sorted(user_id_map.items(), key=lambda x: x[1])
for user_id, user_idx in uids:
# Determine threshold value using threshold_func
if threshold_func is not None:
threshold_A = threshold_func(fused_embeddings[user_idx]).item()
@@ -291,27 +255,23 @@ def Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_fu
recommended_items = []

# Retrieve the current user's rating from the data
user_data = data[data['User_ID'] == user_id]
user_data = grouped.get_group(user_id)
if len(user_data) > 0: # Check if there are ratings for this user
current_user_rating = user_data['Overall_Rating'].values[0]

# Get recommended items for the user
for user_idx_2 in similar_users_sorted_idx:
if user_idx_2 >= len(user_id_map.keys()):
continue # Skip if index is out of range
user_id_2 = list(user_id_map.keys())[user_idx_2]
for _, row in data[data['User_ID'] == user_id_2].iterrows():
user_id_2 = uids[user_idx_2][0]
for _, row in grouped.get_group(user_id_2).iterrows():
item_id = row['Items_ID']
overall_rating = row['Overall_Rating']

# Check if overall rating is similar to the current user's rating
if abs(overall_rating - current_user_rating) <= threshold_A:
# Check if overall rating is similar to the
# current user's rating and filter out items
# already rated by the current user
if item_id not in user_data['Items_ID'].values and abs(overall_rating - current_user_rating) <= threshold_A:
recommended_items.append({'item_id': item_id, 'Overall_Rating': overall_rating})

# Filter out items already rated by the current user
recommended_items = [item for item in recommended_items if
item['item_id'] not in user_data['Items_ID'].values]

# Sort recommended items by overall rating
recommended_items = sorted(recommended_items, key=lambda x: x['Overall_Rating'], reverse=True)[:top_k]

@@ -325,13 +285,7 @@ def Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_fu

return recommendations_f_items
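
As an aside on the Pandas usage noted in the commit message: grouping once and calling get_group replaces a full boolean scan of the frame on every loop iteration. A tiny self-contained illustration (made-up rows; column names taken from the data file):

import pandas as pd

df = pd.DataFrame({'User_ID': ['u1', 'u1', 'u2'],
                   'Overall_Rating': [4, 5, 3]})

grouped = df.groupby('User_ID')
u1_rows = grouped.get_group('u1')  # repeated lookups reuse the group index
# equivalent to, but faster in a loop than: df[df['User_ID'] == 'u1']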

def split_and_save_data(file_path, criteria, test_size=0.2, random_state=42):
# Call the read_data function to obtain the data structures
user_id_map, item_id_map, base_ground_truth_ratings = read_data(file_path, criteria)

# Read the data from the file
data = pd.read_excel(file_path)

def split_and_save_data(data, output_path=None, test_size=0.2, random_state=42):
# Convert User_ID and Items_ID columns to string type
data['User_ID'] = data['User_ID'].astype(str)
data['Items_ID'] = data['Items_ID'].astype(str)
@@ -342,19 +296,20 @@ def split_and_save_data(file_path, criteria, test_size=0.2, random_state=42):
# Split the data into train and test sets
train_data, test_data = train_test_split(data_subset, test_size=test_size, random_state=random_state)

# Define file paths for train and test data
train_file_path = os.path.join(os.path.dirname(file_path), 'train_data.xlsx')
test_file_path = os.path.join(os.path.dirname(file_path), 'test_data.xlsx')
if output_path:
# Define file paths for train and test data
train_file_path = os.path.join(output_path, 'train_data.xlsx')
test_file_path = os.path.join(output_path, 'test_data.xlsx')

# Save the train and test sets into separate files
train_data.to_excel(train_file_path, index=False)
test_data.to_excel(test_file_path, index=False)

return train_data, test_data

def evaluate_RS_Model(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, random_state=42):
def evaluate_RS_Model(fused_embeddings, user_id_map, item_id_map, data, output_path, test_size=0.2, random_state=42):
# Split and save the data into train and test sets
train_data, test_data = split_and_save_data(file_path, criteria, test_size=test_size, random_state=random_state)
train_data, test_data = split_and_save_data(data, output_path, test_size=test_size, random_state=random_state)

# Prepare training data
train_X = fused_embeddings.cpu().detach().numpy()[train_data['User_ID'].astype('category').cat.codes]
@@ -405,7 +360,6 @@ def threshold_function(embedding):

# Prepare test data
test_user_ids = test_data['User_ID'].values.astype(str)
test_item_ids = test_data['Items_ID'].values.astype(str)

# Ensure all user IDs in test data are present in user_id_map
for user_id in test_user_ids:
Expand Down Expand Up @@ -438,15 +392,15 @@ def threshold_function(embedding):

return test_mae, test_rmse

def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, num_runs=30):
def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, item_id_map, data, output_path, test_size=0.2, num_runs=30):
# Lists to store MAE and RMSE values from each run
mae_values = []
rmse_values = []

# Perform specified number of runs of the function and collect MAE and RMSE values
for i in range(num_runs):
print("Run", i+1)
mae, rmse = evaluate_RS_Model(fused_embeddings, user_id_map, file_path, criteria, test_size=test_size, random_state=i)
mae, rmse = evaluate_RS_Model(fused_embeddings, user_id_map, item_id_map, data, output_path, test_size=test_size, random_state=i)
mae_values.append(mae)
rmse_values.append(rmse)

@@ -471,6 +425,47 @@ def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, cr

# ---------------------Main Function ---------------------------

def main(file_path, criteria, save_embeddings=False):
# Read data for the selected dataset
print("Reading data...")
data, user_id_map, item_id_map = read_data(file_path)
print("Reading data finished.")

if save_embeddings and type(save_embeddings) != str:
save_embeddings = file_path + '.embed.pt'

if save_embeddings and os.path.isfile(save_embeddings):
embeddings_loaded = True
print("Loading embeddings...")
fused_embeddings = torch.load(save_embeddings)
print("Loading embeddings finished...")
else:
embeddings_loaded = False
print("Constricting sociomatrices...")
matrices = L_BGNN(data, criteria, user_id_map, item_id_map)
print("Constricting sociomatrices finished.")

#---Attention Embedding------
print("Constricting model...")
model = GAT(in_channels=16, out_channels=256)
print("Constricting model finished.")

print("Generating embeddings...")
fused_embeddings = model.Multi_Embd(matrices, num_epochs=100, learning_rate=0.01)
print("Generating embeddings finished.")

if save_embeddings and not embeddings_loaded:
print("Saving embeddings...")
torch.save(fused_embeddings, save_embeddings)
print("Saving embeddings finished...")

# Call the function with the defined threshold function
print("Evaluating...")
Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_func=None, top_k=1)
evaluate_RS_Model(fused_embeddings, user_id_map, item_id_map, data, os.path.dirname(file_path), test_size=0.2, random_state=42)
evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, item_id_map, data, os.path.dirname(file_path), test_size=0.2, num_runs=30)


if __name__ == "__main__":

# Define your file paths for different datasets in Katana Server
@@ -495,52 +490,6 @@ def evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, cr
}

# Define the dataset to run
dataset_to_run = 'BeerAdvocate'

# Read data for the selected dataset

file_path = file_paths[dataset_to_run]
criteria = criteria_mapping[dataset_to_run]
user_id_map, item_id_map, base_ground_truth_ratings = read_data(file_path, criteria)
num_users = len(user_id_map)
num_items = len(item_id_map)
num_criteria = len(criteria)
data = pd.read_excel(file_path)

# Read data from the Excel file and create ID mappings
matrices = L_BGNN(file_path, criteria, user_id_map, item_id_map)

#---Attention Embedding------
model = GAT(in_channels=16, out_channels=256)
fused_embeddings = model.Multi_Embd(matrices, num_epochs=100, learning_rate=0.01)

# Recommendation section
num_samples = fused_embeddings.shape[0]

# Create an instance of the MultiCriteriaRecommender class
output_dim = fused_embeddings.shape[1] # Set output_dim to the number of criteria
DATASET_TO_RUN = 'BeerAdvocate'

# Convert fused_embeddings to a torch tensor
fused_embeddings_tensor = fused_embeddings.clone().detach().to(torch.float32)

# Reshape fused_embeddings_tensor to match the expected shape
num_samples, num_features = fused_embeddings_tensor.shape

# Calculate the total number of features per criterion
num_features_per_criterion = num_features // num_criteria

# Create a DataFrame with user and item identifiers as MultiIndex
df_users_items = pd.DataFrame(index=pd.MultiIndex.from_tuples([(user_id, item_id) for user_id in user_id_map.keys() for item_id in item_id_map.keys()]))

# Call the create_real_ratings function
data, ground_truth_ratings_matrix, user_id_map, item_id_map = create_ground_truth_ratings(file_path, criteria)

# Define the threshold function
def threshold_function(embedding):
return torch.tensor(0.1)

# Call the function with the defined threshold function
recommendations = Recommendation_items_Top_k(fused_embeddings, user_id_map, data, threshold_func=None, top_k=1)
train_data, test_data = split_and_save_data(file_path, criteria)
test_mae, test_rmse=evaluate_RS_Model(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, random_state=42)
mae_std, rmse_std=evaluate_RS_Model_multiple_runs(fused_embeddings, user_id_map, file_path, criteria, test_size=0.2, num_runs=30)
main(file_paths[DATASET_TO_RUN], criteria_mapping[DATASET_TO_RUN], True)
