From d7b01522ddafa5d00abab9a6db46c83c7120a113 Mon Sep 17 00:00:00 2001 From: Santiago Folgueras Date: Tue, 3 Dec 2024 17:02:39 +0100 Subject: [PATCH 01/16] Clean up the code, incorporate some of the dev changes into the TrainModelGraph and add auxiliary functions to the training module. --- test/TrainModelWithNorm.ipynb | 91 +++---- test/model_architecture | 9 + tools/training/TrainModelFromGraph.py | 329 ++++++-------------------- tools/training/transformations.py | 45 ++++ tools/training/validation.py | 95 ++++++++ 5 files changed, 272 insertions(+), 297 deletions(-) create mode 100644 test/model_architecture create mode 100644 tools/training/transformations.py create mode 100644 tools/training/validation.py diff --git a/test/TrainModelWithNorm.ipynb b/test/TrainModelWithNorm.ipynb index 53c26d1..31b5504 100644 --- a/test/TrainModelWithNorm.ipynb +++ b/test/TrainModelWithNorm.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 26, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -44,7 +44,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/08/4qyj2h792lv7kk8b3phm_j000000gn/T/ipykernel_71433/1996300641.py:26: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + "/var/folders/08/4qyj2h792lv7kk8b3phm_j000000gn/T/ipykernel_90950/1996300641.py:26: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", " graph = torch.load(file_path)\n" ] }, @@ -105,37 +105,18 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 3, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total Graphs: 507314\n", - "Total Graphs after filtering: 502362\n", - "Total Graphs after filtering: 502362\n", - "====================================\n", - "Example of data:\n", - "\n", - "tensor([[1.0005e+00, 2.4179e+00, 4.3113e+02, 0.0000e+00, 3.0000e+00],\n", - " [1.1310e+00, 2.3166e+00, 4.1368e+02, 1.0000e+01, 5.0000e+00],\n", - " [1.0657e+00, 2.3225e+00, 4.4868e+02, 1.1000e+01, 5.0000e+00],\n", - " [1.1092e+00, 2.3469e+00, 1.1241e+03, 7.0000e+00, 9.0000e+00],\n", - " [1.0766e+00, 2.3318e+00, 1.1015e+03, 1.6000e+01, 5.0000e+00]],\n", - " dtype=torch.float64)\n", - "tensor([[0, 0, 1, 1, 2, 2, 3, 4],\n", - " [1, 2, 0, 2, 0, 1, 4, 3]])\n", - "tensor([[-0.1012, 0.1305],\n", - " [-0.0954, 0.0653],\n", - " [-0.1012, 0.1305],\n", - " [ 0.0058, -0.0653],\n", - " [-0.0954, 0.0653],\n", - " [ 0.0058, -0.0653],\n", - " [-0.0151, -0.0326],\n", - " [-0.0151, -0.0326]])\n", - "tensor(-0.0836)\n", - "====================================\n" + "ename": "NameError", + "evalue": "name 'graphs' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch_geometric\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtransforms\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mT\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m Graphs_for_training \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msum\u001b[39m(\u001b[43mgraphs\u001b[49m, [])\n\u001b[1;32m 
4\u001b[0m Graphs_for_training_reduced \u001b[38;5;241m=\u001b[39m Graphs_for_training\n\u001b[1;32m 5\u001b[0m Graphs_for_training_filtered \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 6\u001b[0m g \u001b[38;5;28;01mfor\u001b[39;00m g \u001b[38;5;129;01min\u001b[39;00m Graphs_for_training_reduced\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(g\u001b[38;5;241m.\u001b[39my)\u001b[38;5;241m.\u001b[39many() \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misnan(g\u001b[38;5;241m.\u001b[39mx)\u001b[38;5;241m.\u001b[39many()) \u001b[38;5;129;01mand\u001b[39;00m g\u001b[38;5;241m.\u001b[39medge_index\u001b[38;5;241m.\u001b[39msize(\u001b[38;5;241m1\u001b[39m) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 8\u001b[0m ]\n", + "\u001b[0;31mNameError\u001b[0m: name 'graphs' is not defined" ] } ], @@ -178,7 +159,18 @@ "print(Graphs_for_training_filtered[0].edge_index)\n", "print(Graphs_for_training_filtered[0].edge_attr)\n", "print(Graphs_for_training_filtered[0].y)\n", - "print(\"====================================\")" + "print(\"====================================\")\n", + "\n", + "# Save data (not normalized) for later use\n", + "events = len(Graphs_for_training_filtered)\n", + "ntrain = int((events * 0.7) / BatchSize) * BatchSize # to have full batches\n", + "print(f\"Training events: {ntrain}\")\n", + "\n", + "train_dataset_noNORM = Graphs_for_training_filtered[:ntrain]\n", + "test_dataset_noNORM = Graphs_for_training_filtered[ntrain:ntrain * 2]\n", + "\n", + "train_loader_noNORM = DataLoader(train_dataset_noNORM, batch_size=BatchSize, shuffle=True)\n", + "test_loader_noNORM = DataLoader(test_dataset_noNORM, batch_size=BatchSize, shuffle=False)\n" ] }, { @@ -301,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -355,6 +347,7 @@ " break # Only draw the first batch\n", 
"\n", "# Plot histograms for training data\n", + "plot_histograms(train_loader_noNORM)\n", "plot_histograms(train_loader)" ] }, @@ -665,13 +658,6 @@ " torch.save(model_MPNNRegressor.state_dict(), f'{ModelOutDIR}model_MPNNRegressor_{BatchSize}batches_{num_epochs}epochs_4MPL_8Lins.pth')" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -794,12 +780,12 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_results(all_regression, all_prediction, label='Model'):\n", - " fig, axs = plt.subplots(1, 2, figsize=(15, 5))\n", + " fig, axs = plt.subplots(1, 3, figsize=(15, 5))\n", "\n", " print(\"Plotting Regression target\")\n", " axs[0].hist(all_regression, bins=np.arange(-0.5,0,0.006), alpha=0.75, label='Regression target')\n", @@ -810,12 +796,27 @@ " axs[0].legend()\n", "\n", " axs[1].scatter(all_regression, all_prediction, alpha=0.5)\n", - " axs[1].set_xlim(-0.5,0)\n", - " plt.plot([min(all_prediction), max(all_prediction)], [min(all_prediction), max(all_prediction)], color='red', linestyle='--') # Line of equality\n", + " axs[1].plot([min(all_prediction), max(all_prediction)], [min(all_prediction), max(all_prediction)], color='red', linestyle='--') # Line of equality\n", " axs[1].set_title(f'Regression target vs prediction for {label}')\n", " axs[1].set_xlabel('Regression target')\n", " axs[1].set_ylabel('Prediction')\n", "\n", + " axs[2].hist(all_prediction - all_regression, bins=30, alpha=0.75)\n", + " axs[2].set_title(f'Residuals for {label}')\n", + " axs[2].set_xlabel('Residual')\n", + " axs[2].set_ylabel('Frequency')\n", + " \n", + " # Calculate the bias and resolution and plot them in the graph\n", + " bias = np.mean(all_prediction - all_regression)\n", + " resolution = np.std(all_prediction - all_regression)\n", + "\n", + " # Add text box with bias and resolution\n", + " 
textstr = f'Bias: {bias:.4f}\\nResolution: {resolution:.4f}'\n", + " props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)\n", + " axs[1].text(0.95, 0.95, textstr, transform=axs[1].transAxes, fontsize=12,\n", + " verticalalignment='top', horizontalalignment='right', bbox=props)\n", + "\n", + "\n", " plt.tight_layout()\n", " plt.show()\n", "\n" diff --git a/test/model_architecture b/test/model_architecture new file mode 100644 index 0000000..c5c39e4 --- /dev/null +++ b/test/model_architecture @@ -0,0 +1,9 @@ +digraph { + graph [size="12,12"] + node [align=left fontname=monospace fontsize=10 height=0.2 ranksep=0.1 shape=box style=filled] + 2276440280112 [label=" + (1024)" fillcolor=darkolivegreen1] + 2276440280592 [label=" + (1024, 1)" fillcolor=darkolivegreen3] + 2276440280592 -> 2276440280112 [style=dotted] +} diff --git a/tools/training/TrainModelFromGraph.py b/tools/training/TrainModelFromGraph.py index 880b5f0..486cdba 100644 --- a/tools/training/TrainModelFromGraph.py +++ b/tools/training/TrainModelFromGraph.py @@ -1,6 +1,6 @@ import torch from torch_geometric.loader import DataLoader -from torch_geometric.transforms import BaseTransform +from torch_geometric.transforms import Compose from torch_geometric.data import Data import os,sys @@ -8,47 +8,11 @@ import argparse import matplotlib.pyplot as plt from tools.training.models import GATRegressor, GraphSAGEModel, MPLNNRegressor +from tools.training.transformations import NormalizeNodeEdgesAndDropTwoFeatures import pickle import itertools -#import torch._dynamo -#torch._dynamo.config.capture_scalar_outputs = True - -#torch.serialization.add_safe_globals([Data]) - -class NormalizeNodeFeatures(BaseTransform): - def __call__(self, data): - if hasattr(data, 'x'): - data.x = (data.x - data.x.mean(dim=0)) / data.x.std(dim=0) - return data - -class NormalizeEdgeFeatures(BaseTransform): - def __call__(self, data): - if hasattr(data, 'edge_attr'): - data.edge_attr = (data.edge_attr - data.edge_attr.mean(dim=0)) / 
data.edge_attr.std(dim=0) - return data - -class NormalizeTargets(BaseTransform): - def __call__(self, data): - if hasattr(data, 'y'): - data.y = (data.y - data.y.mean(dim=0)) / data.y.std(dim=0) - return data - -class NormalizeSpecificNodeFeatures(BaseTransform): - def __init__(self, column_indices): - self.column_indices = column_indices - - def __call__(self, data): - if hasattr(data, 'x'): - for column_index in self.column_indices: - column = data.x[:, column_index] - mean = column.mean() - std = column.std() - data.x[:, column_index] = (column - mean) / std - return data - - class TrainModelFromGraph: @staticmethod @@ -56,17 +20,16 @@ def add_args(parser): parser.add_argument('--graph_path', type=str, default='graph_folder', help='Path to the graph data') parser.add_argument('--graph_name', type=str, default='vix_graph_13Nov_3_muonQOverPt', help='Name of the graph data') parser.add_argument('--out_path', type=str, default='Bsize_gmp_64_lr5e-4_v3', help='Output path for the results') + parser.add_argument('--save_tag', type=str, default='vix_graph_13Nov_3_muonQOverPt', help='Tag for saving the model') parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') parser.add_argument('--learning_rate', type=float, default=0.0005, help='Learning rate for training') parser.add_argument('--epochs', type=int, default=1000, help='Number of epochs for training') parser.add_argument('--model_path', type=str, default=None, help='Path to the saved model for evaluation') - parser.add_argument('--output_dir', type=str, default=None, help='Output directory for evaluation results') parser.add_argument('--do_validation', action='store_true', help='Evaluate the model') - parser.add_argument('--model_type', type=str, default='GAT', help='Model to use for training') - parser.add_argument('--normalize_features', action='store_true', help='Normalize node features') - parser.add_argument('--normalize_targets', action='store_true', help='Normalize target 
features') - parser.add_argument('--normalize_edge_features', action='store_true', help='Normalize edge features') - parser.add_argument('--normalize_specific_features', type=int, nargs='+', default=None, help='Normalize specific node feature columns') + parser.add_argument('--model_type', type=str, default='SAGE', help='Model to use for training') + parser.add_argument('--do_train', action='store_true', help='Train the model') + parser.add_argument('--hidden_dim', type=int, default=32, help='Hidden dimension for the model') + parser.add_argument('--normalization', type=str, default='NodesAndEdgesAndOnlySpatial', help='Type of normalization to apply') parser.add_argument('--num_files', type=int, default=None, help='Number of graph files to load') return parser @@ -74,56 +37,57 @@ def __init__(self, **kwargs): self.graph_path = kwargs.get('graph_path', 'graph_folder') self.graph_name = kwargs.get('graph_name', 'vix_graph_13Nov_3_muonQOverPt') self.out_path = kwargs.get('out_path', 'Bsize_gmp_64_lr5e-4_v3') - self.batch_size = kwargs.get('batch_size', 64) - self.learning_rate = kwargs.get('learning_rate', 0.0005) + self.save_tag = kwargs.get('save_tag', 'vix_graph_13Nov_3_muonQOverPt') + self.batch_size = kwargs.get('batch_size', 1024) + self.learning_rate = kwargs.get('learning_rate', 0.001) self.epochs = kwargs.get('epochs', 100) self.model_path = kwargs.get('model_path', None) - self.output_dir = kwargs.get('output_dir', None) self.do_validation = kwargs.get('evaluate', False) - self.model_type = kwargs.get('model_type', 'GAT') - self.normalize_features = kwargs.get('normalize_features', False) - self.normalize_targets = kwargs.get('normalize_targets', False) - self.normalize_edge_features = kwargs.get('normalize_edge_features', False) - self.normalize_specific_features = kwargs.get('normalize_specific_features', None) + self.do_train = kwargs.get('do_train', False) + self.hidden_dim = kwargs.get('hidden_dim', 32) + self.model_type = kwargs.get('model_type', 
'SAGE') + self.normalization = kwargs.get('normalization', 'NodesAndEdgesAndOnlySpatial') self.num_files = kwargs.get('num_files', None) # Número de archivos a cargar - self.device = kwargs.get('device', 'cuda') - # Initialize other attributes self.train_loader = None self.test_loader = None self.model = None self.optimizer = None - self.loss_fn = torch.nn.MSELoss(reduction='sum').to(self.device) -# self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device) + self.loss_fn = None self.device = torch.device('cuda' if (torch.cuda.is_available() and self.device == 'cuda') else 'cpu') # Apply transformations if necessary - self.transforms = [] - if self.normalize_features: - self.transforms.append(NormalizeNodeFeatures()) - if self.normalize_edge_features: - self.transforms.append(NormalizeEdgeFeatures()) - if self.normalize_targets: - self.transforms.append(NormalizeTargets()) - if self.normalize_specific_features is not None: - self.transforms.append(NormalizeSpecificNodeFeatures(self.normalize_specific_features)) - - # For evaluation: - self.trained_model = None - self.pt_pred_arr = [] - self.pt_truth_arr = [] - + self.transform = None + if self.normalization == 'NodesAndEdgesAndOnlySpatial': + self.transform = NormalizeNodeEdgesAndDropTwoFeatures() + elif self.normalization == 'NodesAndEdges': + self.transform = Compose([NormalizeNodeFeatures(),NormalizeEdgeFeatures()]) + elif self.normalization == 'Nodes': + self.transform = NormalizeNodeFeatures() + elif self.normalization == 'Edges': + self.transform = NormalizeEdgeFeatures() + elif self.normalization == 'Targets': + self.transform = NormalizeTargets() + elif self.normalization == 'DropLastTwoNodeFeatures': + self.transform = DropLastTwoNodeFeatures() + elif self.normalization == 'None': + print("No normalization applied") + self.transform = None + else: + print("Unknown normalization type, exiting...") + sys.exit(1) + def load_data(self): # Loading data from graph and convert it to DataLoader graphs = [] 
all_files = os.listdir(self.graph_path) # Filter for .pkl files - graph_files = [f for f in all_files if self.graph_name in f] + graph_files = [f for f in all_files if (f.endswith('.pkl') or f.endswith('.pt')) and graph_name in f] if not graph_files: - print("No .pkl files found in the directory.") + print("No .pkl/.pt files found in the directory.") return [] if self.num_files is not None: @@ -139,27 +103,27 @@ def load_data(self): graph = torch.load(file) graphs.append(graph) - Graphs_for_training = list(itertools.chain.from_iterable(graphs)) + Graphs_for_training = sum(graphs, []) print(f"Total Graphs: {len(Graphs_for_training)}") - # Filter out graphs with no nodes - Graphs_for_training_filtered = [g for g in Graphs_for_training if g.edge_index.size(1) > 0] # remove empty graphs + ### NOW FILTER EMPTY GRAPHS... + Graphs_for_training_reduced = Graphs_for_training + Graphs_for_training_filtered = [ + g for g in Graphs_for_training_filtered + if not (torch.isnan(g.x).any() or torch.isnan(g.edge_attr).any() or torch.isnan(g.y).any() or g.edge_index.size(1) == 0) + ] # remove extra dimension in y and put deltaPhi and deltaEta in the data object as edge_attr for i in range(0, len(Graphs_for_training_filtered)): Graphs_for_training_filtered[i].y = Graphs_for_training_filtered[i].y.mean(dim=0) - Graphs_for_training_filtered[i].edge_attr = torch.stack([Graphs_for_training_filtered[i].deltaPhi.float(), Graphs_for_training_filtered[i].deltaEta.float()], dim=1) + Graphs_for_training_filtered[i].edge_attr = torch.stack([Graphs_for_training_filtered[i].deltaPhi.float(), Graphs_for_training_filtered[i].deltaEta.float()], dim=1) - Graphs_for_training_filtered = [ - g for g in Graphs_for_training_filtered - if not torch.isnan(g.y).any() - ] + print(f"Total Graphs: {len(Graphs_for_training)}") print(f"Filtered Graphs: {len(Graphs_for_training_filtered)}") # Apply transformations to the load data... 
- if self.transforms: - for transform in self.transforms: - Graphs_for_training_filtered = [transform(data) for data in Graphs_for_training_filtered] + if self.transform is not None: + Graphs_for_training_filtered = [self.transform(data) for data in Graphs_for_training_filtered] # Train and test split: events = len(Graphs_for_training_filtered) @@ -181,56 +145,8 @@ def load_data(self): self.train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True) self.test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False) - def plot_graph_features(self, data_loader): - feature_names = ["eta", "phi", "R", "layer", "Type"] - for batch in data_loader: - features = batch.x.numpy() - regression = batch.y.numpy() - num_features = features.shape[1] - - fig, axs = plt.subplots(3, 3, figsize=(15, 15)) - axs = axs.flatten() - - # Plot node features - for i in range(num_features): - nbins = 18 if i==3 else 30 - axs[i].hist(features[:, i], bins=nbins, alpha=0.75) - axs[i].set_title(f'Feature {feature_names[i]} Histogram') - axs[i].set_xlabel(f'Feature {feature_names[i]} Value') - axs[i].set_ylabel('Frequency') - - #calculate the average number of edges, dividing by the number of nodes - num_edges = batch.edge_index.size(1)/batch.x.size(0) - axs[num_features].hist(num_edges, bins=30, alpha=0.75) - axs[num_features].set_title('Number of Edges/Node') - axs[num_features].set_ylabel('Count') - - # Plot edge features - for i in range(batch.edge_attr.shape[1]): - axs[i+num_features].hist(batch.edge_attr[:, i], bins=30, alpha=0.75) - axs[i+num_features].set_title(f'Edge Feature {i} Histogram') - axs[i+num_features].set_xlabel(f'Edge Feature {i} Value') - axs[i+num_features].set_ylabel('Frequency') - - #calculate the average number of edges, dividing by the number of nodes - num_edges = batch.edge_index.size(1)/batch.x.size(0) - axs[num_features+2].hist(num_edges, bins=30, alpha=0.75) - axs[num_features+2].set_title('Number of Edges/Node') - 
axs[num_features+2].set_ylabel('Count') - - # Plot regression target - axs[num_features + 3].hist(regression, bins=30, alpha=0.75) - axs[num_features + 3].set_title('Regression Target Histogram') - axs[num_features + 3].set_xlabel('Regression Target Value') - axs[num_features + 3].set_ylabel('Frequency') - - plt.tight_layout() - plt.show() - break # Only draw the first batch - def initialize_model(self): num_node_features = 3 - num_edge_features = 2 hidden_dim = 32 output_dim = 1 ## ONE FEATURE ONLY!!! print(f"Using device: {self.device}") @@ -243,163 +159,70 @@ def initialize_model(self): #self.model = torch_geometric.compile(self.model) self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate) + self.loss_fn = torch.nn.MSELoss() + print("Model initialized") print(self.model) - def train(self): + def train(self, loader): self.model.train() - for data in self.train_loader: - data = data.to(self.device) # Mueve los datos al dispositivo - - # Verificar si hay valores extremadamente grandes o pequeños en los datos de entrada - if torch.max(data.x) > 1e6 or torch.min(data.x) < -1e6: - print("Warning: Extremely large or small values found in input node features") - continue - if torch.max(data.edge_attr) > 1e6 or torch.min(data.edge_attr) < -1e6: - print("Warning: Extremely large or small values found in edge attributes") - continue - if torch.max(data.y) > 1e6 or torch.min(data.y) < -1e6: - print("Warning: Extremely large or small values found in target values") - continue - - + total_loss = 0 + for data in loader: + data = data.to(self.device) # Move data to the device self.optimizer.zero_grad() out = self.model(data) - - # Verificar si hay valores nan o inf en las salidas del modelo - if torch.isnan(out).any() or torch.isinf(out).any(): - print("Warning: NaN or Inf values found in model output") - continue - loss = self.loss_fn(out, data.y.view(out.size())) - - # Verificar si hay valores nan o inf en la pérdida - if torch.isnan(loss) or 
torch.isinf(loss): - print("Warning: NaN or Inf values found in loss") - sys.exit(1) - loss.backward() self.optimizer.step() + total_loss += loss.item() + + return total_loss / len(loader.dataset) @torch.no_grad() def test(self, loader): self.model.eval() total_loss = 0 - total_accuracy = 0 for data in loader: data = data.to(self.device) out = self.model(data) loss = self.loss_fn(out, data.y.view(out.size())) - total_loss += float(loss) - total_accuracy += self.accuracy(out, data.y) - return total_loss / len(loader.dataset), total_accuracy / len(loader.dataset) - - def accuracy(self, predictions, targets, threshold=0.10): - # Calcular la diferencia relativa - relative_diff = torch.abs(predictions - targets) / torch.abs(targets) - - # Comparar con el porcentaje dado - ok = relative_diff < (threshold) - - # Calcular la precisión - acc = ok.sum() - - return int(acc) + total_loss += loss.item() + return total_loss / len(loader.dataset) def Training_loop(self): - train_losses = [] - test_losses = [] - train_accuracies = [] - test_accuracies = [] print(f"Saving results in {self.out_path}") - path = self.out_path - if not os.path.exists(path): - os.makedirs(path) + if not os.path.exists(self.out_path): + os.makedirs(self.out_path) print("Start training...") for epoch in range(self.epochs): - self.train() - train_loss, train_accuracy = self.test(self.train_loader) - test_loss, test_accuracy = self.test(self.test_loader) - train_losses.append(train_loss) - test_losses.append(test_loss) - train_accuracies.append(train_accuracy) - test_accuracies.append(test_accuracy) - if epoch == 0: - torch.save(test_loss, f"{path}/testloss_{epoch + 1}.pt") - torch.save(train_loss, f"{path}/trainloss_{epoch + 1}.pt") - elif (epoch + 1) % 10 == 0: + train_loss = self.train(self.train_loader) + test_loss = self.test(self.test_loader) + if (epoch + 1) % 10 == 0: print(f'Epoch: {epoch + 1:02d}, Train loss: {train_loss:.4f}, Test loss: {test_loss:.4f}, Train accuracy: {train_accuracy:.4f}, Test 
accuracy: {test_accuracy:.4f}') - torch.save(self.model, f"{path}/model_{epoch + 1}.pth") - torch.save(test_loss, f"{path}/testloss_{epoch + 1}.pt") - torch.save(train_loss, f"{path}/trainloss_{epoch + 1}.pt") - - plt.plot(train_losses, "b", label="Train loss") - plt.plot(test_losses, "k", label="Test loss") - plt.plot(train_accuracies, "g", label="Train accuracy") - plt.plot(test_accuracies, "r", label="Test accuracy") - plt.yscale('log') - plt.savefig(f"{path}/loss_accuracy_plot.png") - plt.close() + torch.save(self.model, f"{self.out_path}/model_{self.model_type}_{self.hidden_dim}dim_{epoch+1}epochs_{self.save_tag}.pth") def set_model_path(self, path): self.model_path = path def load_trained_model(self): print(f"Loading model from {self.model_path}") - self.trained_model = torch.load(self.model_path, map_location=torch.device('cpu')) - - @torch.no_grad() - def evaluate(self): - for data in self.test_loader: - out = self.trained_model(data) - for item in range(0, out.size(0)): - vector_pred = out[item] - vector_real = data[item].y - self.pt_pred_arr.append(vector_pred.item()) - self.pt_truth_arr.append(vector_real.item()) - - def plot_regression(self, output_dir): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - plt.clf() - print(f"Plotting regression in {output_dir}") - plt.hist(self.pt_truth_arr, bins=100, color='skyblue', alpha=0.5, label="truth") - plt.hist(self.pt_pred_arr, bins=100, color='g', alpha=0.5, label="prediction") - plt.legend() - plt.savefig(os.path.join(output_dir, "pt_regression.png")) - plt.clf() - - print(f"Plotting scatter in {output_dir}") - plt.plot(self.pt_truth_arr, self.pt_pred_arr, 'o') - plt.xlabel("Truth") - plt.ylabel("Prediction") - plt.savefig(os.path.join(output_dir, "pt_regression_scatter.png")) - plt.clf() - - print(f"Plotting difference in {output_dir}") - # plot difference between truth and prediction - diff = [x - y for x, y in zip(self.pt_truth_arr, self.pt_pred_arr)] - plt.hist(diff, bins=100, color='r', 
alpha=0.5, label="difference") - plt.legend() - plt.savefig(os.path.join(output_dir, "pt_regression_diff.png")) - plt.clf() - + self.model.load_state_dict(torch.load(self.model_path, map_location=self.device)) + def main(): parser = argparse.ArgumentParser(description="Train and evaluate GNN model") parser.add_argument('--graph_path', type=str, default='graph_folder', help='Path to the graph data') parser.add_argument('--graph_name', type=str, default='vix_graph_13Nov_3_muonQOverPt', help='Name of the graph data') parser.add_argument('--out_path', type=str, default='Bsize_gmp_64_lr5e-4_v3', help='Output path for the results') + parser.add_argument('--save_tag', type=str, default='vix_graph_13Nov_3_muonQOverPt', help='Tag for saving the model') parser.add_argument('--batch_size', type=int, default=1024, help='Batch size for training') parser.add_argument('--model_type', type=str, default='SAGE', help='Model to use for training') + parser.add_argument('--hidden_dim', type=int, default=32, help='Hidden dimension for the model') parser.add_argument('--plot_graph_features', action='store_true', help='Plot the graph features') - parser.add_argument('--normalize_features', action='store_true', help='Normalize node features') - parser.add_argument('--normalize_targets', action='store_true', help='Normalize target features') - parser.add_argument('--normalize_edge_features', action='store_true', help='Normalize edge features') - parser.add_argument('--normalize_specific_features', type=int, nargs='+', default=None, help='Normalize specific node feature columns') + parser.add_argument('--normalization', type=str, default='NodesAndEdgesAndOnlySpatial', help='Type of normalization to apply') parser.add_argument('--num_files', type=int, default=None, help='Number of graph files to load') parser.add_argument('--learning_rate', type=float, default=0.005, help='Learning rate for training') parser.add_argument('--epochs', type=int, default=100, help='Number of epochs for training') 
@@ -407,24 +230,26 @@ def main(): parser.add_argument('--output_dir', type=str, default='Bsize_gmp_64_lr5e-4_v3', help='Output directory for evaluation results') parser.add_argument('--do_train', action='store_true', help='Train the model') parser.add_argument('--do_validation', action='store_true', help='Evaluate the model') - parser.add_argument('--device', type=str, default='cuda', help='Device to use for training') args = parser.parse_args() # For training: trainer = TrainModelFromGraph(**vars(args)) trainer.load_data() + trainer.initialize_model() + if args.plot_graph_features: - trainer.plot_graph_features(trainer.train_loader) + from tools.training.validation import plot_graph_feature_histograms + plot_graph_feature_histograms(trainer.train_loader) if args.do_train: - trainer.initialize_model() trainer.Training_loop() if args.do_validation: trainer.load_trained_model() - trainer.evaluate() - trainer.plot_regression(output_dir=args.output_dir) + from tools.training.validation import plot_prediction_results, evaluate_model + regression,prediction = evaluate_model(trainer.model, trainer.test_loader, trainer.device) + plot_prediction_results(regression, prediction, output_dir=args.output_dir,label=trainer.model_type) if __name__ == "__main__": diff --git a/tools/training/transformations.py b/tools/training/transformations.py new file mode 100644 index 0000000..c652b8d --- /dev/null +++ b/tools/training/transformations.py @@ -0,0 +1,45 @@ +from torch_geometric.transforms import BaseTransform, Compose + +class NormalizeNodeFeatures(BaseTransform): + def __call__(self, data): + if hasattr(data, 'x'): + data.x = (data.x - data.x.mean(dim=0)) / data.x.std(dim=0) + return data + +class NormalizeEdgeFeatures(BaseTransform): + def __call__(self, data): + if hasattr(data, 'edge_attr'): + data.edge_attr = (data.edge_attr - data.edge_attr.mean(dim=0)) / data.edge_attr.std(dim=0) + return data + +class NormalizeTargets(BaseTransform): + def __call__(self, data): + if 
hasattr(data, 'y'): + data.y = (data.y - data.y.mean(dim=0)) / data.y.std(dim=0) + return data + +class DropLastTwoNodeFeatures(BaseTransform): + def __call__(self, data): + if hasattr(data, 'x'): + data.x = data.x[:, :-2] # Eliminar las dos últimas columnas + return data + +class NormalizeSpecificNodeFeatures(BaseTransform): + def __init__(self, column_indices): + self.column_indices = column_indices + + def __call__(self, data): + if hasattr(data, 'x'): + for column_index in self.column_indices: + column = data.x[:, column_index] + mean = column.mean() + std = column.std() + data.x[:, column_index] = (column - mean) / std + return data + +# Definir las transformaciones +NormalizeNodeEdgesAndDropTwoFeatures = Compose([ + NormalizeNodeFeatures(), + NormalizeEdgeFeatures(), + DropLastTwoNodeFeatures() # Aplicar la transformación para eliminar las dos últimas características +]) diff --git a/tools/training/validation.py b/tools/training/validation.py new file mode 100644 index 0000000..6632497 --- /dev/null +++ b/tools/training/validation.py @@ -0,0 +1,95 @@ +import os +import numpy as np +import matplotlib.pyplot as plt + +def plot_graph_feature_histograms(data_loader): + feature_names = ["eta", "phi", "R", "deltaPhi", "deltaEta","Q/pt"] + for batch in data_loader: + features = batch.x.numpy() + regression = batch.y.numpy() + num_features = features.shape[1] + fig, axs = plt.subplots(2, 3, figsize=(15, 15)) + axs = axs.flatten() + + # Plot node features + for i in range(num_features): + axs[i].hist(features[:, i], bins=30, alpha=0.75) + axs[i].set_title(f'Feature {feature_names[i]} Histogram') + axs[i].set_xlabel(f'Feature {feature_names[i]} Value') + axs[i].set_ylabel('Frequency') + + + # plot the number of edges of each graph + for i in range(batch.edge_attr.shape[1]): + axs[i+num_features].hist(batch.edge_attr[:, i], bins=30, alpha=0.75) + axs[i+num_features].set_title(f'Feature {feature_names[i+num_features]} Histogram') + 
axs[i+num_features].set_xlabel(f'Feature {feature_names[i+num_features]} Value') + axs[i+num_features].set_ylabel('Frequency') + + # Plot regression target + axs[num_features + (batch.edge_attr.shape[1])].hist(regression, bins=30, alpha=0.75) + axs[num_features + (batch.edge_attr.shape[1])].set_title(f'Regression target {feature_names[-1]} Histogram') + axs[num_features + (batch.edge_attr.shape[1])].set_xlabel(f'Regression target {feature_names[-1]} Value') + axs[num_features + (batch.edge_attr.shape[1])].set_ylabel('Frequency') + + plt.tight_layout() + plt.show() + break # Only draw the first batch + +@torch.no_grad() +def evaluate_model(model, test_loader, device): + model.eval() + total_loss = 0 + all_regression = [] + all_prediction = [] + for data in test_loader: + #only one batch + data = data.to(device) + out = model(data) + all_regression.append(data.y.cpu().numpy()) + all_prediction.append(out.cpu().numpy()) + + # Concatenar todas las predicciones y valores objetivo + all_regression = np.concatenate(all_regression, axis=0) + all_prediction = np.concatenate(all_prediction, axis=0) + + return all_regression, all_prediction + +def plot_prediction_results(regression, prediction, output_dir='Test', label='Model'): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + fig, axs = plt.subplots(1, 3, figsize=(15, 5)) + + print("Plotting Regression target") + axs[0].hist(regression, bins=np.arange(-0.5,0,0.006), alpha=0.75, label='Regression target') + axs[0].hist(prediction, bins=np.arange(-0.5,0,0.006), alpha=0.75, label='Prediction') + axs[0].set_title(f'Regression target and prediction for {label}') + axs[0].set_xlabel('Value') + axs[0].set_ylabel('Frequency') + axs[0].legend() + + axs[1].scatter(regression, prediction, alpha=0.5) + axs[1].plot([min(prediction), max(prediction)], [min(prediction), max(prediction)], color='red', linestyle='--') # Line of equality + axs[1].set_title(f'Regression target vs prediction for {label}') + 
axs[1].set_xlabel('Regression target') + axs[1].set_ylabel('Prediction') + + axs[2].hist(prediction - regression, bins=30, alpha=0.75) + axs[2].set_title(f'Residuals for {label}') + axs[2].set_xlabel('Residual') + axs[2].set_ylabel('Frequency') + + # Calculate the bias and resolution and plot them in the graph + bias = np.mean(prediction - regression) + resolution = np.std(prediction - regression) + + # Add text box with bias and resolution + textstr = f'Bias: {bias:.4f}\nResolution: {resolution:.4f}' + props = dict(boxstyle='round', facecolor='wheat', alpha=0.5) + axs[1].text(0.95, 0.95, textstr, transform=axs[1].transAxes, fontsize=12, + verticalalignment='top', horizontalalignment='right', bbox=props) + + plt.tight_layout() + fig.savefig(os.path.join(output_dir, f'{label}_prediction_results.png')) + From 39adc4581c574a6d3fde46336886bdfa6d66315a Mon Sep 17 00:00:00 2001 From: Santiago Folgueras Date: Tue, 3 Dec 2024 17:17:15 +0100 Subject: [PATCH 02/16] Adaptation to the submitJobs for running multiple tests in parallel --- test/submitJobs_training.py | 37 +++++++++++++++------------ tools/training/TrainModelFromGraph.py | 6 ++--- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/test/submitJobs_training.py b/test/submitJobs_training.py index 2776054..c23b49e 100644 --- a/test/submitJobs_training.py +++ b/test/submitJobs_training.py @@ -8,6 +8,8 @@ queue = "workday" # give bsub queue -- 8nm (8 minutes), 1nh (1 hour), 8nh, 1nd (1day), 2nd, 1nw (1 week), 2nw OutputDir = "/eos/cms/store/user/folguera/L1TMuon/INTREPID/Model_v240725_Bsize64_lr5e-4_NOnormNodes_GAT_241106/" WORKDIR = "/afs/cern.ch/user/f/folguera/workdir/INTREPID/tmp/TrainingModel/" +ModelTypes = ['SAGE', 'MPNN'] +NormalizationTypes = ['DropLastTwoNodeFeatures', 'NodesAndEdgesAndOnlySpatial'] ######## customization end ######### path = os.getcwd() @@ -30,21 +32,24 @@ print("OutputDir: %s" %(OutputDir)) ##### creating job ##### - -with open('%s/exec/job_train_model.sh' %(WORKDIR), 'w') as 
fout: - fout.write("#!/bin/sh\n") - fout.write("echo\n") - fout.write("echo\n") - fout.write("echo 'START---------------'\n") - fout.write("echo 'WORKDIR ' ${PWD}\n") - fout.write("cd "+str(path)+"\n") - fout.write("source pyenv/bin/activate\n") - fout.write("echo 'Saving Model in %s' \n" %(OutputDir)) - fout.write("python tools/training/TrainModelFromGraph.py --graph_path %s --out_path %s --do_train \n" %(InputFolder, OutputDir)) - fout.write("echo 'STOP---------------'\n") - fout.write("echo\n") - fout.write("echo\n") -os.system("chmod 755 %s/exec/job_train_model.sh" %(WORKDIR)) +file_count = 1 +for model in ModelTypes: + for normalization in NormalizationTypes: + SaveTag = model + "_" + normalization + "_Bsize1024_lr1e-3_241203" + with open('%s/exec/job_train_model_%02d.sh' %(WORKDIR, file_count), 'w') as fout: + fout.write("#!/bin/sh\n") + fout.write("echo\n") + fout.write("echo\n") + fout.write("echo 'START---------------'\n") + fout.write("echo 'WORKDIR ' ${PWD}\n") + fout.write("cd "+str(path)+"\n") + fout.write("source pyenv/bin/activate\n") + fout.write("echo 'Saving Model in %s' \n" %(OutputDir)) + fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --out_path %s --do_train --save_tag %s --batch_size 1024 --learning_rate 0.001\n" %(model, normalization, InputFolder, OutputDir, SaveTag)) + fout.write("echo 'STOP---------------'\n") + fout.write("echo\n") + fout.write("echo\n") + os.system("chmod 755 %s/exec/job_train_model_%02d.sh" %(WORKDIR, file_count)) ###### create submit.sub file #### with open('submit.sub', 'w') as fout: @@ -60,7 +65,7 @@ ###### sends bjobs ###### os.system("echo submit.sub") -os.system("condor_submit submit.sub") +#os.system("condor_submit submit.sub") print() print("your jobs:") diff --git a/tools/training/TrainModelFromGraph.py b/tools/training/TrainModelFromGraph.py index 486cdba..0549d25 100644 --- a/tools/training/TrainModelFromGraph.py +++ 
b/tools/training/TrainModelFromGraph.py @@ -22,7 +22,7 @@ def add_args(parser): parser.add_argument('--out_path', type=str, default='Bsize_gmp_64_lr5e-4_v3', help='Output path for the results') parser.add_argument('--save_tag', type=str, default='vix_graph_13Nov_3_muonQOverPt', help='Tag for saving the model') parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') - parser.add_argument('--learning_rate', type=float, default=0.0005, help='Learning rate for training') + parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate for training') parser.add_argument('--epochs', type=int, default=1000, help='Number of epochs for training') parser.add_argument('--model_path', type=str, default=None, help='Path to the saved model for evaluation') parser.add_argument('--do_validation', action='store_true', help='Evaluate the model') @@ -147,7 +147,7 @@ def load_data(self): def initialize_model(self): num_node_features = 3 - hidden_dim = 32 + hidden_dim = self.hidden_dim output_dim = 1 ## ONE FEATURE ONLY!!! 
print(f"Using device: {self.device}") if self.model_type == 'GAT': @@ -224,7 +224,7 @@ def main(): parser.add_argument('--plot_graph_features', action='store_true', help='Plot the graph features') parser.add_argument('--normalization', type=str, default='NodesAndEdgesAndOnlySpatial', help='Type of normalization to apply') parser.add_argument('--num_files', type=int, default=None, help='Number of graph files to load') - parser.add_argument('--learning_rate', type=float, default=0.005, help='Learning rate for training') + parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate for training') parser.add_argument('--epochs', type=int, default=100, help='Number of epochs for training') parser.add_argument('--model_path', type=str, default='Bsize_gmp_64_lr5e-4_v3/model_1000.pth', help='Path to the saved model for evaluation') parser.add_argument('--output_dir', type=str, default='Bsize_gmp_64_lr5e-4_v3', help='Output directory for evaluation results') From 1ef8ba41c35d504e72aa1ffa42ebf110f577a052 Mon Sep 17 00:00:00 2001 From: Santiago Folgueras Date: Tue, 3 Dec 2024 17:30:42 +0100 Subject: [PATCH 03/16] adapt the submitJobs_training.py to test different connections --- test/submitJobs_training.py | 43 ++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/test/submitJobs_training.py b/test/submitJobs_training.py index c23b49e..7751ddb 100644 --- a/test/submitJobs_training.py +++ b/test/submitJobs_training.py @@ -4,12 +4,13 @@ print('START\n') ######## YOU ONLY NEED TO FILL THE AREA BELOW ######### ######## customization area ######### -InputFolder = "/eos/cms/store/user/folguera/L1TMuon/INTREPID/Graphs_v240725_241106/3neighbours_muonQOverPt/" # list with all the file directories +InputFolder = "/eos/cms/store/user/folguera/L1TMuon/INTREPID/Graphs_v240725_241106/" # list with all the file directories queue = "workday" # give bsub queue -- 8nm (8 minutes), 1nh (1 hour), 8nh, 1nd (1day), 2nd, 1nw 
(1 week), 2nw OutputDir = "/eos/cms/store/user/folguera/L1TMuon/INTREPID/Model_v240725_Bsize64_lr5e-4_NOnormNodes_GAT_241106/" WORKDIR = "/afs/cern.ch/user/f/folguera/workdir/INTREPID/tmp/TrainingModel/" ModelTypes = ['SAGE', 'MPNN'] NormalizationTypes = ['DropLastTwoNodeFeatures', 'NodesAndEdgesAndOnlySpatial'] +InputGraphs = ["3neighbours_muonQOverPt/", "all_connections_muonQOverPt/"] ######## customization end ######### path = os.getcwd() @@ -32,24 +33,32 @@ print("OutputDir: %s" %(OutputDir)) ##### creating job ##### -file_count = 1 +file_count = 0 for model in ModelTypes: for normalization in NormalizationTypes: - SaveTag = model + "_" + normalization + "_Bsize1024_lr1e-3_241203" - with open('%s/exec/job_train_model_%02d.sh' %(WORKDIR, file_count), 'w') as fout: - fout.write("#!/bin/sh\n") - fout.write("echo\n") - fout.write("echo\n") - fout.write("echo 'START---------------'\n") - fout.write("echo 'WORKDIR ' ${PWD}\n") - fout.write("cd "+str(path)+"\n") - fout.write("source pyenv/bin/activate\n") - fout.write("echo 'Saving Model in %s' \n" %(OutputDir)) - fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --out_path %s --do_train --save_tag %s --batch_size 1024 --learning_rate 0.001\n" %(model, normalization, InputFolder, OutputDir, SaveTag)) - fout.write("echo 'STOP---------------'\n") - fout.write("echo\n") - fout.write("echo\n") - os.system("chmod 755 %s/exec/job_train_model_%02d.sh" %(WORKDIR, file_count)) + for input_graph in InputGraphs: + file_count += 1 + print("Creating job for model %s with normalization %s and input graphs %s" %(model, normalization, input_graph)) + SaveTag = model + "_" + normalization + "_Bsize64_lr5e-4_241106_20files_" + if "all" in input_graph: + SaveTag = SaveTag + "allConnections" + else: + SaveTag = SaveTag + "3neighbours" + + with open('%s/exec/job_train_model_%02d.sh' %(WORKDIR, file_count), 'w') as fout: + fout.write("#!/bin/sh\n") + 
fout.write("echo\n") + fout.write("echo\n") + fout.write("echo 'START---------------'\n") + fout.write("echo 'WORKDIR ' ${PWD}\n") + fout.write("cd "+str(path)+"\n") + fout.write("source pyenv/bin/activate\n") + fout.write("echo 'Saving Model in %s' \n" %(OutputDir)) + fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --out_path %s --do_train --save_tag %s --batch_size 1024 --learning_rate 0.001 --num_files 20\n" %(model, normalization, InputFolder, OutputDir, SaveTag)) + fout.write("echo 'STOP---------------'\n") + fout.write("echo\n") + fout.write("echo\n") + os.system("chmod 755 %s/exec/job_train_model_%02d.sh" %(WORKDIR, file_count)) ###### create submit.sub file #### with open('submit.sub', 'w') as fout: From 49fcb2fc23593fd721a6f0c8393d56843c320a61 Mon Sep 17 00:00:00 2001 From: Santiago Folgueras Date: Tue, 3 Dec 2024 17:33:40 +0100 Subject: [PATCH 04/16] modify output folder --- test/submitJobs_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/submitJobs_training.py b/test/submitJobs_training.py index 7751ddb..3ec7c14 100644 --- a/test/submitJobs_training.py +++ b/test/submitJobs_training.py @@ -6,7 +6,7 @@ ######## customization area ######### InputFolder = "/eos/cms/store/user/folguera/L1TMuon/INTREPID/Graphs_v240725_241106/" # list with all the file directories queue = "workday" # give bsub queue -- 8nm (8 minutes), 1nh (1 hour), 8nh, 1nd (1day), 2nd, 1nw (1 week), 2nw -OutputDir = "/eos/cms/store/user/folguera/L1TMuon/INTREPID/Model_v240725_Bsize64_lr5e-4_NOnormNodes_GAT_241106/" +OutputDir = "/eos/cms/store/user/folguera/L1TMuon/INTREPID/Model_Graphsv240725_QOverPtRegression_241203/" WORKDIR = "/afs/cern.ch/user/f/folguera/workdir/INTREPID/tmp/TrainingModel/" ModelTypes = ['SAGE', 'MPNN'] NormalizationTypes = ['DropLastTwoNodeFeatures', 'NodesAndEdgesAndOnlySpatial'] From 1cce3702a008c61fea44038f2a071315165cba53 Mon Sep 17 00:00:00 2001 From: 
Santiago Folgueras Date: Tue, 3 Dec 2024 17:52:19 +0100 Subject: [PATCH 05/16] include device argument --- test/submitJobs_training.py | 2 +- tools/training/TrainModelFromGraph.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/test/submitJobs_training.py b/test/submitJobs_training.py index 3ec7c14..10e812f 100644 --- a/test/submitJobs_training.py +++ b/test/submitJobs_training.py @@ -54,7 +54,7 @@ fout.write("cd "+str(path)+"\n") fout.write("source pyenv/bin/activate\n") fout.write("echo 'Saving Model in %s' \n" %(OutputDir)) - fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --out_path %s --do_train --save_tag %s --batch_size 1024 --learning_rate 0.001 --num_files 20\n" %(model, normalization, InputFolder, OutputDir, SaveTag)) + fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --out_path %s --do_train --save_tag %s --batch_size 1024 --learning_rate 0.001 --num_files 20\n" %(model, normalization, InputFolder+input_graph, OutputDir, SaveTag)) fout.write("echo 'STOP---------------'\n") fout.write("echo\n") fout.write("echo\n") diff --git a/tools/training/TrainModelFromGraph.py b/tools/training/TrainModelFromGraph.py index 0549d25..6f22172 100644 --- a/tools/training/TrainModelFromGraph.py +++ b/tools/training/TrainModelFromGraph.py @@ -31,6 +31,7 @@ def add_args(parser): parser.add_argument('--hidden_dim', type=int, default=32, help='Hidden dimension for the model') parser.add_argument('--normalization', type=str, default='NodesAndEdgesAndOnlySpatial', help='Type of normalization to apply') parser.add_argument('--num_files', type=int, default=None, help='Number of graph files to load') + parser.add_argument('--device', type=str, default='cuda', help='Device to use for training') return parser def __init__(self, **kwargs): @@ -48,6 +49,7 @@ def __init__(self, **kwargs): self.model_type = 
kwargs.get('model_type', 'SAGE') self.normalization = kwargs.get('normalization', 'NodesAndEdgesAndOnlySpatial') self.num_files = kwargs.get('num_files', None) # Número de archivos a cargar + self.device = kwargs.get('device', 'cuda') # Initialize other attributes self.train_loader = None @@ -230,6 +232,7 @@ def main(): parser.add_argument('--output_dir', type=str, default='Bsize_gmp_64_lr5e-4_v3', help='Output directory for evaluation results') parser.add_argument('--do_train', action='store_true', help='Train the model') parser.add_argument('--do_validation', action='store_true', help='Evaluate the model') + parser.add_argument('--device', type=str, default='cuda', help='Device to use for training') args = parser.parse_args() From 0ef96d1cb125313c00f5da5ddab2c690f57ef774 Mon Sep 17 00:00:00 2001 From: Santiago Folgueras Date: Tue, 3 Dec 2024 17:52:59 +0100 Subject: [PATCH 06/16] remove tools.training path --- tools/training/TrainModelFromGraph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/training/TrainModelFromGraph.py b/tools/training/TrainModelFromGraph.py index 6f22172..14c4700 100644 --- a/tools/training/TrainModelFromGraph.py +++ b/tools/training/TrainModelFromGraph.py @@ -7,8 +7,8 @@ import argparse import matplotlib.pyplot as plt -from tools.training.models import GATRegressor, GraphSAGEModel, MPLNNRegressor -from tools.training.transformations import NormalizeNodeEdgesAndDropTwoFeatures +from models import GATRegressor, GraphSAGEModel, MPLNNRegressor +from transformations import NormalizeNodeEdgesAndDropTwoFeatures import pickle import itertools From 7e8d0a9f8c6dd28bfac4e257d76203856beb49ce Mon Sep 17 00:00:00 2001 From: Santiago Folgueras Date: Tue, 3 Dec 2024 17:54:27 +0100 Subject: [PATCH 07/16] incorporate transformation --- tools/training/TrainModelFromGraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/training/TrainModelFromGraph.py b/tools/training/TrainModelFromGraph.py index 
14c4700..8aef881 100644 --- a/tools/training/TrainModelFromGraph.py +++ b/tools/training/TrainModelFromGraph.py @@ -8,7 +8,7 @@ import argparse import matplotlib.pyplot as plt from models import GATRegressor, GraphSAGEModel, MPLNNRegressor -from transformations import NormalizeNodeEdgesAndDropTwoFeatures +from transformations import NormalizeNodeEdgesAndDropTwoFeatures,DropLastTwoNodeFeatures,NormalizeNodeFeatures,NormalizeEdgeFeatures,NormalizeTargets import pickle import itertools From 6fdd4e89f091e902235b74145e32b9fc63a8c3ec Mon Sep 17 00:00:00 2001 From: Santiago Folgueras Date: Tue, 3 Dec 2024 17:56:40 +0100 Subject: [PATCH 08/16] fix graph name in submitJobs_training --- test/submitJobs_training.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/submitJobs_training.py b/test/submitJobs_training.py index 10e812f..d8708e3 100644 --- a/test/submitJobs_training.py +++ b/test/submitJobs_training.py @@ -11,6 +11,7 @@ ModelTypes = ['SAGE', 'MPNN'] NormalizationTypes = ['DropLastTwoNodeFeatures', 'NodesAndEdgesAndOnlySpatial'] InputGraphs = ["3neighbours_muonQOverPt/", "all_connections_muonQOverPt/"] +GraphName = "vix_graph_6Nov" ######## customization end ######### path = os.getcwd() @@ -54,7 +55,7 @@ fout.write("cd "+str(path)+"\n") fout.write("source pyenv/bin/activate\n") fout.write("echo 'Saving Model in %s' \n" %(OutputDir)) - fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --out_path %s --do_train --save_tag %s --batch_size 1024 --learning_rate 0.001 --num_files 20\n" %(model, normalization, InputFolder+input_graph, OutputDir, SaveTag)) + fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --out_path %s --do_train --save_tag %s --batch_size 1024 --learning_rate 0.001 --num_files 20 --graph_name %s\n" %(model, normalization, InputFolder+input_graph, OutputDir, SaveTag, GraphName)) 
fout.write("echo 'STOP---------------'\n") fout.write("echo\n") fout.write("echo\n") From d063b6a79dc36ea6812e06a0887d8b2bfc59d943 Mon Sep 17 00:00:00 2001 From: Santiago Folgueras Date: Tue, 3 Dec 2024 17:58:26 +0100 Subject: [PATCH 09/16] fix bug in TrainModelFromGraph.py --- tools/training/TrainModelFromGraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/training/TrainModelFromGraph.py b/tools/training/TrainModelFromGraph.py index 8aef881..d350010 100644 --- a/tools/training/TrainModelFromGraph.py +++ b/tools/training/TrainModelFromGraph.py @@ -87,7 +87,7 @@ def load_data(self): all_files = os.listdir(self.graph_path) # Filter for .pkl files - graph_files = [f for f in all_files if (f.endswith('.pkl') or f.endswith('.pt')) and graph_name in f] + graph_files = [f for f in all_files if (f.endswith('.pkl') or f.endswith('.pt')) and self.graph_name in f] if not graph_files: print("No .pkl/.pt files found in the directory.") return [] From 744603e5fa6fd7e8a8264c4572171f7be5b148d3 Mon Sep 17 00:00:00 2001 From: Santiago Folgueras Date: Tue, 3 Dec 2024 21:39:28 +0100 Subject: [PATCH 10/16] fix bug in training script --- test/submitJobs_training.py | 3 ++- tools/training/TrainModelFromGraph.py | 31 ++++++++++++++++++--------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/test/submitJobs_training.py b/test/submitJobs_training.py index d8708e3..3c4bb4b 100644 --- a/test/submitJobs_training.py +++ b/test/submitJobs_training.py @@ -12,6 +12,7 @@ NormalizationTypes = ['DropLastTwoNodeFeatures', 'NodesAndEdgesAndOnlySpatial'] InputGraphs = ["3neighbours_muonQOverPt/", "all_connections_muonQOverPt/"] GraphName = "vix_graph_6Nov" +Epochs = 50 ######## customization end ######### path = os.getcwd() @@ -55,7 +56,7 @@ fout.write("cd "+str(path)+"\n") fout.write("source pyenv/bin/activate\n") fout.write("echo 'Saving Model in %s' \n" %(OutputDir)) - fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 
32 --normalization %s --graph_path %s --out_path %s --do_train --save_tag %s --batch_size 1024 --learning_rate 0.001 --num_files 20 --graph_name %s\n" %(model, normalization, InputFolder+input_graph, OutputDir, SaveTag, GraphName)) + fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --out_path %s --do_train --save_tag %s --batch_size 1024 --learning_rate 0.001 --num_files 20 --graph_name %s --epochs %d\n" %(model, normalization, InputFolder+input_graph, OutputDir, SaveTag, GraphName, Epochs)) fout.write("echo 'STOP---------------'\n") fout.write("echo\n") fout.write("echo\n") diff --git a/tools/training/TrainModelFromGraph.py b/tools/training/TrainModelFromGraph.py index d350010..9d96dbf 100644 --- a/tools/training/TrainModelFromGraph.py +++ b/tools/training/TrainModelFromGraph.py @@ -111,15 +111,21 @@ def load_data(self): ### NOW FILTER EMPTY GRAPHS... Graphs_for_training_reduced = Graphs_for_training Graphs_for_training_filtered = [ - g for g in Graphs_for_training_filtered - if not (torch.isnan(g.x).any() or torch.isnan(g.edge_attr).any() or torch.isnan(g.y).any() or g.edge_index.size(1) == 0) + g for g in Graphs_for_training_reduced + if not (torch.isnan(g.y).any() or torch.isnan(g.x).any()) and g.edge_index.size(1) > 0 ] - + # remove extra dimension in y and put deltaPhi and deltaEta in the data object as edge_attr for i in range(0, len(Graphs_for_training_filtered)): Graphs_for_training_filtered[i].y = Graphs_for_training_filtered[i].y.mean(dim=0) Graphs_for_training_filtered[i].edge_attr = torch.stack([Graphs_for_training_filtered[i].deltaPhi.float(), Graphs_for_training_filtered[i].deltaEta.float()], dim=1) + + Graphs_for_training_filtered = [ + g for g in Graphs_for_training_filtered + if not (torch.isnan(g.x).any() or torch.isnan(g.edge_attr).any() or torch.isnan(g.y).any()) + ] + print(f"Total Graphs: {len(Graphs_for_training)}") print(f"Filtered Graphs: 
{len(Graphs_for_training_filtered)}") @@ -127,6 +133,11 @@ def load_data(self): if self.transform is not None: Graphs_for_training_filtered = [self.transform(data) for data in Graphs_for_training_filtered] + Graphs_for_training_filtered = [ + g for g in Graphs_for_training_filtered + if not (torch.isnan(g.x).any() or torch.isnan(g.edge_attr).any() or torch.isnan(g.y).any()) + ] + # Train and test split: events = len(Graphs_for_training_filtered) ntrain = int((events * 0.7) / self.batch_size) * self.batch_size # to have full batches @@ -166,7 +177,7 @@ def initialize_model(self): print("Model initialized") print(self.model) - def train(self, loader): + def train_model(self, loader): self.model.train() total_loss = 0 for data in loader: @@ -178,10 +189,10 @@ def train(self, loader): self.optimizer.step() total_loss += loss.item() - return total_loss / len(loader.dataset) + return total_loss / len(loader) @torch.no_grad() - def test(self, loader): + def test_model(self, loader): self.model.eval() total_loss = 0 @@ -190,7 +201,7 @@ def test(self, loader): out = self.model(data) loss = self.loss_fn(out, data.y.view(out.size())) total_loss += loss.item() - return total_loss / len(loader.dataset) + return total_loss / len(loader) def Training_loop(self): @@ -200,10 +211,10 @@ def Training_loop(self): print("Start training...") for epoch in range(self.epochs): - train_loss = self.train(self.train_loader) - test_loss = self.test(self.test_loader) + train_loss = self.train_model(self.train_loader) + test_loss = self.test_model(self.test_loader) if (epoch + 1) % 10 == 0: - print(f'Epoch: {epoch + 1:02d}, Train loss: {train_loss:.4f}, Test loss: {test_loss:.4f}, Train accuracy: {train_accuracy:.4f}, Test accuracy: {test_accuracy:.4f}') + print(f'Epoch: {epoch + 1:02d}, Train loss: {train_loss:.4f}, Test loss: {test_loss:.4f}') torch.save(self.model, f"{self.out_path}/model_{self.model_type}_{self.hidden_dim}dim_{epoch+1}epochs_{self.save_tag}.pth") def set_model_path(self, 
path): From 8e54eebdf1e54a717dacbef55d539ba25407d1d6 Mon Sep 17 00:00:00 2001 From: Santiago Folgueras Date: Tue, 3 Dec 2024 21:51:07 +0100 Subject: [PATCH 11/16] fix bug in transformations --- tools/training/TrainModelFromGraph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/training/TrainModelFromGraph.py b/tools/training/TrainModelFromGraph.py index 9d96dbf..09a32b1 100644 --- a/tools/training/TrainModelFromGraph.py +++ b/tools/training/TrainModelFromGraph.py @@ -8,7 +8,7 @@ import argparse import matplotlib.pyplot as plt from models import GATRegressor, GraphSAGEModel, MPLNNRegressor -from transformations import NormalizeNodeEdgesAndDropTwoFeatures,DropLastTwoNodeFeatures,NormalizeNodeFeatures,NormalizeEdgeFeatures,NormalizeTargets +from transformations import DropLastTwoNodeFeatures,NormalizeNodeFeatures,NormalizeEdgeFeatures,NormalizeTargets import pickle import itertools @@ -63,7 +63,7 @@ def __init__(self, **kwargs): # Apply transformations if necessary self.transform = None if self.normalization == 'NodesAndEdgesAndOnlySpatial': - self.transform = NormalizeNodeEdgesAndDropTwoFeatures() + self.transform = Compose([NormalizeNodeFeatures(),NormalizeEdgeFeatures(),DropLastTwoNodeFeatures()]) elif self.normalization == 'NodesAndEdges': self.transform = Compose([NormalizeNodeFeatures(),NormalizeEdgeFeatures()]) elif self.normalization == 'Nodes': From e32ecb2a218f67af507e81d214857722479009b8 Mon Sep 17 00:00:00 2001 From: folguera Date: Thu, 5 Dec 2024 13:06:32 +0100 Subject: [PATCH 12/16] Add plotting of results to the training scripts --- test/submitJobs_plot_results.py | 110 ++++++++++++++++++++++++++ test/submitJobs_training.py | 2 +- tools/training/TrainModelFromGraph.py | 53 ++++++++++--- tools/training/validation.py | 22 ++++-- 4 files changed, 170 insertions(+), 17 deletions(-) create mode 100644 test/submitJobs_plot_results.py diff --git a/test/submitJobs_plot_results.py b/test/submitJobs_plot_results.py new file mode 
100644 index 0000000..2cb93d5 --- /dev/null +++ b/test/submitJobs_plot_results.py @@ -0,0 +1,110 @@ +import os,sys + +print('START\n') +######## YOU ONLY NEED TO FILL THE AREA BELOW ######### +######## customization area ######### +GraphFolder = "/eos/cms/store/user/folguera/L1TMuon/INTREPID/Graphs_v240725_241106/" # list with all the file directories +ModelFolder = "/eos/cms/store/user/folguera/L1TMuon/INTREPID/Model_Graphsv240725_QOverPtRegression_241203/" +ModelTypes = ['SAGE', 'MPNN'] +NormalizationTypes = ['DropLastTwoNodeFeatures', 'NodesAndEdgesAndOnlySpatial'] +InputGraphs = ["3neighbours_muonQOverPt/", "all_connections_muonQOverPt/"] +GraphName = "vix_graph_6Nov" +Epochs = 50 +OutputDir = "/eos/user/f/folguera/www/INTREPID/2024_12_04_GNN_QOverPtRegression/" +JustPrint = True +######## customization end ######### + + +if JustPrint: + print("##########################") + print("source pyenv/bin/activate\n") + + if not os.path.exists(OutputDir): + print("OutputDir %s does not exist" %(OutputDir)) + os.system("mkdir %s" %(OutputDir)) + + for model in ModelTypes: + for normalization in NormalizationTypes: + for input_graph in InputGraphs: + SaveTag = model + "_" + normalization + "_Bsize64_lr5e-4_241106_20files_" + if "all" in input_graph: + SaveTag = SaveTag + "allConnections" + else: + SaveTag = SaveTag + "3neighbours" + ModelFile = f'model_{model}_32dim_50epochs_{SaveTag}.pth' + + print("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --output_dir %s --do_validation --save_tag %s --batch_size 1024 --learning_rate 0.001 --num_files 5 --graph_name %s --epochs %d --model_path %s/%s &\n" %(model, normalization, GraphFolder+input_graph, OutputDir, SaveTag, GraphName, Epochs, ModelFolder,ModelFile)) + + + + print("##########################") + sys.exit() + +### NOW SUBMIT THE JOBS +queue = "microcentury" +WORKDIR = "/afs/cern.ch/user/f/folguera/workdir/INTREPID/tmp/PlotModel/" + +path = os.getcwd() 
+print('do not worry about folder creation:\n') +os.system("rm -rf %s" %(WORKDIR)) +os.system("mkdir %s" %(WORKDIR)) +os.system("mkdir %s/exec" %(WORKDIR)) +os.system("mkdir %s/batchlogs" %(WORKDIR)) + +if not os.path.exists(OutputDir): + print("OutputDir %s does not exist" %(OutputDir)) + os.system("mkdir %s" %(OutputDir)) +else : + print("Warning: OutputDir already exists. It will be overwritten\n") + print("OutputDir: %s" %(OutputDir)) + + +file_count = 0 +for model in ModelTypes: + for normalization in NormalizationTypes: + for input_graph in InputGraphs: + file_count += 1 + SaveTag = model + "_" + normalization + "_Bsize64_lr5e-4_241106_20files_" + if "all" in input_graph: + SaveTag = SaveTag + "allConnections" + else: + SaveTag = SaveTag + "3neighbours" + ModelFile = f'model_{model}_32dim_50epochs_{SaveTag}.pth' + + with open('%s/exec/job_plot_model_%02d.sh' %(WORKDIR, file_count), 'w') as fout: + fout.write("#!/bin/sh\n") + fout.write("echo\n") + fout.write("echo\n") + fout.write("echo 'START---------------'\n") + fout.write("echo 'WORKDIR ' ${PWD}\n") + fout.write("cd "+str(path)+"\n") + fout.write("source pyenv/bin/activate\n") + fout.write("echo 'Saving Model in %s' \n" %(OutputDir)) + fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --output_dir --plot_graph_features %s --do_validation --save_tag %s --batch_size 1024 --learning_rate 0.001 --num_files 5 --graph_name %s --epochs %d --model_path %s/%s\n" %(model, normalization, GraphFolder+input_graph, OutputDir, SaveTag, GraphName, Epochs, ModelFolder,ModelFile)) + fout.write("echo 'STOP---------------'\n") + fout.write("echo\n") + fout.write("echo\n") + os.system("chmod 755 %s/exec/job_train_model_%02d.sh" %(WORKDIR, file_count)) + +###### create submit.sub file #### +with open('submit.sub', 'w') as fout: + fout.write("executable = $(filename)\n") + fout.write("arguments = $(ClusterId)$(ProcId)\n") + fout.write("output = 
%s/batchlogs/$(ClusterId).$(ProcId).out\n" %(WORKDIR)) + fout.write("error = %s/batchlogs/$(ClusterId).$(ProcId).err\n" %(WORKDIR)) + fout.write("log = %s/batchlogs/$(ClusterId).log\n" %(WORKDIR)) + fout.write("request_gpus = 1\n") + fout.write('+JobFlavour = "%s"\n' %(queue)) + fout.write("\n") + fout.write("queue filename matching (%s/exec/job_*sh)\n" %(WORKDIR)) + +###### sends bjobs ###### +os.system("echo submit.sub") +#os.system("condor_submit submit.sub") + +print() +print("your jobs:") +os.system("condor_q") +print() +print('END') +print() diff --git a/test/submitJobs_training.py b/test/submitJobs_training.py index 3c4bb4b..cc62477 100644 --- a/test/submitJobs_training.py +++ b/test/submitJobs_training.py @@ -76,7 +76,7 @@ ###### sends bjobs ###### os.system("echo submit.sub") -#os.system("condor_submit submit.sub") +os.system("condor_submit submit.sub") print() print("your jobs:") diff --git a/tools/training/TrainModelFromGraph.py b/tools/training/TrainModelFromGraph.py index 09a32b1..ade9101 100644 --- a/tools/training/TrainModelFromGraph.py +++ b/tools/training/TrainModelFromGraph.py @@ -19,7 +19,7 @@ class TrainModelFromGraph: def add_args(parser): parser.add_argument('--graph_path', type=str, default='graph_folder', help='Path to the graph data') parser.add_argument('--graph_name', type=str, default='vix_graph_13Nov_3_muonQOverPt', help='Name of the graph data') - parser.add_argument('--out_path', type=str, default='Bsize_gmp_64_lr5e-4_v3', help='Output path for the results') + parser.add_argument('--out_model_path', type=str, default='Bsize_gmp_64_lr5e-4_v3', help='Output path for the results') parser.add_argument('--save_tag', type=str, default='vix_graph_13Nov_3_muonQOverPt', help='Tag for saving the model') parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate for training') @@ -37,7 +37,7 @@ def add_args(parser): def 
__init__(self, **kwargs): self.graph_path = kwargs.get('graph_path', 'graph_folder') self.graph_name = kwargs.get('graph_name', 'vix_graph_13Nov_3_muonQOverPt') - self.out_path = kwargs.get('out_path', 'Bsize_gmp_64_lr5e-4_v3') + self.out_model_path = kwargs.get('out_model_path', 'Bsize_gmp_64_lr5e-4_v3') self.save_tag = kwargs.get('save_tag', 'vix_graph_13Nov_3_muonQOverPt') self.batch_size = kwargs.get('batch_size', 1024) self.learning_rate = kwargs.get('learning_rate', 0.001) @@ -80,7 +80,38 @@ def __init__(self, **kwargs): else: print("Unknown normalization type, exiting...") sys.exit(1) - + ### Add setter functions for all parameters: + def set_graph_path(self, path): + self.graph_path = path + def set_graph_name(self, name): + self.graph_name = name + def set_out_model_path(self, path): + self.out_model_path = path + def set_save_tag(self, tag): + self.save_tag = tag + def set_batch_size(self, batch_size): + self.batch_size = batch_size + def set_learning_rate(self, learning_rate): + self.learning_rate = learning_rate + def set_epochs(self, epochs): + self.epochs = epochs + def set_model_path(self, model_path): + self.model_path = model_path + def set_do_validation(self, do_validation): + self.do_validation = do_validation + def set_do_train(self, do_train): + self.do_train = do_train + def set_hidden_dim(self, hidden_dim): + self.hidden_dim = hidden_dim + def set_model_type(self, model_type): + self.model_type = model_type + def set_normalization(self, normalization): + self.normalization = normalization + def set_num_files(self, num_files): + self.num_files = num_files + def set_device(self, device): + self.device = device + def load_data(self): # Loading data from graph and convert it to DataLoader graphs = [] @@ -215,14 +246,18 @@ def Training_loop(self): test_loss = self.test_model(self.test_loader) if (epoch + 1) % 10 == 0: print(f'Epoch: {epoch + 1:02d}, Train loss: {train_loss:.4f}, Test loss: {test_loss:.4f}') - torch.save(self.model, 
f"{self.out_path}/model_{self.model_type}_{self.hidden_dim}dim_{epoch+1}epochs_{self.save_tag}.pth") + torch.save(self.model.state_dict(), f"{self.out_path}/model_{self.model_type}_{self.hidden_dim}dim_{epoch+1}epochs_{self.save_tag}.pth") def set_model_path(self, path): self.model_path = path def load_trained_model(self): print(f"Loading model from {self.model_path}") - self.model.load_state_dict(torch.load(self.model_path, map_location=self.device)) + # load the model, first try state_dict then the model itself + try: + self.model.load_state_dict(torch.load(self.model_path, map_location=self.device)) + except: + self.model = torch.load(self.model_path, map_location=self.device) def main(): @@ -253,17 +288,17 @@ def main(): trainer.initialize_model() if args.plot_graph_features: - from tools.training.validation import plot_graph_feature_histograms - plot_graph_feature_histograms(trainer.train_loader) + from validation import plot_graph_feature_histograms + plot_graph_feature_histograms(trainer.train_loader, output_dir=args.output_dir,label=trainer.model_type+"_"+trainer.save_tag) if args.do_train: trainer.Training_loop() if args.do_validation: trainer.load_trained_model() - from tools.training.validation import plot_prediction_results, evaluate_model + from validation import plot_prediction_results, evaluate_model regression,prediction = evaluate_model(trainer.model, trainer.test_loader, trainer.device) - plot_prediction_results(regression, prediction, output_dir=args.output_dir,label=trainer.model_type) + plot_prediction_results(regression, prediction, output_dir=args.output_dir,model=trainer.model_type, label=trainer.save_tag) if __name__ == "__main__": diff --git a/tools/training/validation.py b/tools/training/validation.py index 6632497..d716c8a 100644 --- a/tools/training/validation.py +++ b/tools/training/validation.py @@ -1,8 +1,9 @@ import os import numpy as np import matplotlib.pyplot as plt +import torch -def plot_graph_feature_histograms(data_loader): 
+def plot_graph_feature_histograms(data_loader, output_dir='Train', label='Model'): feature_names = ["eta", "phi", "R", "deltaPhi", "deltaEta","Q/pt"] for batch in data_loader: features = batch.x.numpy() @@ -33,7 +34,12 @@ def plot_graph_feature_histograms(data_loader): axs[num_features + (batch.edge_attr.shape[1])].set_ylabel('Frequency') plt.tight_layout() - plt.show() + if not os.path.exists(output_dir): + os.makedirs(output_dir) + fig.savefig(os.path.join(output_dir, f'{label}_inputFeatures.png')) + fig.savefig(os.path.join(output_dir, f'{label}_inputFeatures.pdf')) + fig.savefig(os.path.join(output_dir, f'{label}_inputFeatures.eps')) + break # Only draw the first batch @torch.no_grad() @@ -55,7 +61,7 @@ def evaluate_model(model, test_loader, device): return all_regression, all_prediction -def plot_prediction_results(regression, prediction, output_dir='Test', label='Model'): +def plot_prediction_results(regression, prediction, output_dir='Test', model='model', label='SaveModel'): if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -64,19 +70,19 @@ def plot_prediction_results(regression, prediction, output_dir='Test', label='Mo print("Plotting Regression target") axs[0].hist(regression, bins=np.arange(-0.5,0,0.006), alpha=0.75, label='Regression target') axs[0].hist(prediction, bins=np.arange(-0.5,0,0.006), alpha=0.75, label='Prediction') - axs[0].set_title(f'Regression target and prediction for {label}') + axs[0].set_title(f'Regression target and prediction for {model}') axs[0].set_xlabel('Value') axs[0].set_ylabel('Frequency') axs[0].legend() axs[1].scatter(regression, prediction, alpha=0.5) axs[1].plot([min(prediction), max(prediction)], [min(prediction), max(prediction)], color='red', linestyle='--') # Line of equality - axs[1].set_title(f'Regression target vs prediction for {label}') + axs[1].set_title(f'Regression target vs prediction for {model}') axs[1].set_xlabel('Regression target') axs[1].set_ylabel('Prediction') axs[2].hist(prediction - 
regression, bins=30, alpha=0.75) - axs[2].set_title(f'Residuals for {label}') + axs[2].set_title(f'Residuals for {model}') axs[2].set_xlabel('Residual') axs[2].set_ylabel('Frequency') @@ -87,9 +93,11 @@ def plot_prediction_results(regression, prediction, output_dir='Test', label='Mo # Add text box with bias and resolution textstr = f'Bias: {bias:.4f}\nResolution: {resolution:.4f}' props = dict(boxstyle='round', facecolor='wheat', alpha=0.5) - axs[1].text(0.95, 0.95, textstr, transform=axs[1].transAxes, fontsize=12, + axs[2].text(0.95, 0.95, textstr, transform=axs[1].transAxes, fontsize=12, verticalalignment='top', horizontalalignment='right', bbox=props) plt.tight_layout() fig.savefig(os.path.join(output_dir, f'{label}_prediction_results.png')) + fig.savefig(os.path.join(output_dir, f'{label}_prediction_results.pdf')) + fig.savefig(os.path.join(output_dir, f'{label}_prediction_results.eps')) From eea588e950a7e891db37fd7233613ea19d23a9c9 Mon Sep 17 00:00:00 2001 From: folguera Date: Thu, 5 Dec 2024 13:13:02 +0100 Subject: [PATCH 13/16] Minor fixes --- test/submitJobs_plot_results.py | 5 ++--- tools/training/validation.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/test/submitJobs_plot_results.py b/test/submitJobs_plot_results.py index 2cb93d5..5706075 100644 --- a/test/submitJobs_plot_results.py +++ b/test/submitJobs_plot_results.py @@ -84,7 +84,7 @@ fout.write("echo 'STOP---------------'\n") fout.write("echo\n") fout.write("echo\n") - os.system("chmod 755 %s/exec/job_train_model_%02d.sh" %(WORKDIR, file_count)) + os.system("chmod 755 %s/exec/job_plot_model_%02d.sh" %(WORKDIR, file_count)) ###### create submit.sub file #### with open('submit.sub', 'w') as fout: @@ -93,14 +93,13 @@ fout.write("output = %s/batchlogs/$(ClusterId).$(ProcId).out\n" %(WORKDIR)) fout.write("error = %s/batchlogs/$(ClusterId).$(ProcId).err\n" %(WORKDIR)) fout.write("log = %s/batchlogs/$(ClusterId).log\n" %(WORKDIR)) - fout.write("request_gpus = 1\n") 
fout.write('+JobFlavour = "%s"\n' %(queue)) fout.write("\n") fout.write("queue filename matching (%s/exec/job_*sh)\n" %(WORKDIR)) ###### sends bjobs ###### os.system("echo submit.sub") -#os.system("condor_submit submit.sub") +os.system("condor_submit submit.sub") print() print("your jobs:") diff --git a/tools/training/validation.py b/tools/training/validation.py index d716c8a..7bba479 100644 --- a/tools/training/validation.py +++ b/tools/training/validation.py @@ -93,7 +93,7 @@ def plot_prediction_results(regression, prediction, output_dir='Test', model='mo # Add text box with bias and resolution textstr = f'Bias: {bias:.4f}\nResolution: {resolution:.4f}' props = dict(boxstyle='round', facecolor='wheat', alpha=0.5) - axs[2].text(0.95, 0.95, textstr, transform=axs[1].transAxes, fontsize=12, + axs[2].text(0.95, 0.95, textstr, transform=axs[2].transAxes, fontsize=12, verticalalignment='top', horizontalalignment='right', bbox=props) plt.tight_layout() From e9f519126c0e210eb445a8f33de7ce01a1706808 Mon Sep 17 00:00:00 2001 From: folguera Date: Thu, 5 Dec 2024 15:33:39 +0100 Subject: [PATCH 14/16] saving smaller names --- tools/training/TrainModelFromGraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/training/TrainModelFromGraph.py b/tools/training/TrainModelFromGraph.py index ade9101..2b415cc 100644 --- a/tools/training/TrainModelFromGraph.py +++ b/tools/training/TrainModelFromGraph.py @@ -289,7 +289,7 @@ def main(): if args.plot_graph_features: from validation import plot_graph_feature_histograms - plot_graph_feature_histograms(trainer.train_loader, output_dir=args.output_dir,label=trainer.model_type+"_"+trainer.save_tag) + plot_graph_feature_histograms(trainer.train_loader, output_dir=args.output_dir,label=trainer.save_tag) if args.do_train: trainer.Training_loop() From 3eb035a6c685c49a4336b8f2502f0f111089aade Mon Sep 17 00:00:00 2001 From: folguera Date: Thu, 5 Dec 2024 15:37:02 +0100 Subject: [PATCH 15/16] fix typo in 
submitJobs_plot_results.py --- test/submitJobs_plot_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/submitJobs_plot_results.py b/test/submitJobs_plot_results.py index 5706075..c63451d 100644 --- a/test/submitJobs_plot_results.py +++ b/test/submitJobs_plot_results.py @@ -80,7 +80,7 @@ fout.write("cd "+str(path)+"\n") fout.write("source pyenv/bin/activate\n") fout.write("echo 'Saving Model in %s' \n" %(OutputDir)) - fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --output_dir --plot_graph_features %s --do_validation --save_tag %s --batch_size 1024 --learning_rate 0.001 --num_files 5 --graph_name %s --epochs %d --model_path %s/%s\n" %(model, normalization, GraphFolder+input_graph, OutputDir, SaveTag, GraphName, Epochs, ModelFolder,ModelFile)) + fout.write("python tools/training/TrainModelFromGraph.py --model_type %s --hidden_dim 32 --normalization %s --graph_path %s --output_dir %s --plot_graph_features --do_validation --save_tag %s --batch_size 1024 --learning_rate 0.001 --num_files 5 --graph_name %s --epochs %d --model_path %s/%s\n" %(model, normalization, GraphFolder+input_graph, OutputDir, SaveTag, GraphName, Epochs, ModelFolder,ModelFile)) fout.write("echo 'STOP---------------'\n") fout.write("echo\n") fout.write("echo\n") From 37875c80ece05a3deee44be15800b5d36175141a Mon Sep 17 00:00:00 2001 From: folguera Date: Thu, 5 Dec 2024 15:41:48 +0100 Subject: [PATCH 16/16] change default job name --- test/submitJobs_plot_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/submitJobs_plot_results.py b/test/submitJobs_plot_results.py index c63451d..7f2bef2 100644 --- a/test/submitJobs_plot_results.py +++ b/test/submitJobs_plot_results.py @@ -41,7 +41,7 @@ sys.exit() ### NOW SUBMIT THE JOBS -queue = "microcentury" +queue = "espresso" WORKDIR = "/afs/cern.ch/user/f/folguera/workdir/INTREPID/tmp/PlotModel/" path = os.getcwd()