From de5f06e5c96e7566da4fd027d4138740b0a41e1a Mon Sep 17 00:00:00 2001
From: ferugit
Date: Tue, 28 Feb 2023 17:36:35 +0100
Subject: [PATCH] initial commit

---
 requirements.txt                     |   0
 src/keyword_spotting/hydra_config.py |  35 ++
 src/keyword_spotting/loader.py       | 280 +++++++++++++++++++
 src/keyword_spotting/model.py        |  78 +++++
 src/keyword_spotting/trainer.py      | 409 +++++++++++++++++++++++++++
 src/train.py                         |  13 ++
 6 files changed, 815 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 src/keyword_spotting/hydra_config.py
 create mode 100644 src/keyword_spotting/loader.py
 create mode 100644 src/keyword_spotting/model.py
 create mode 100644 src/keyword_spotting/trainer.py
 create mode 100644 src/train.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e69de29
diff --git a/src/keyword_spotting/hydra_config.py b/src/keyword_spotting/hydra_config.py
new file mode 100644
index 0000000..0a7216c
--- /dev/null
+++ b/src/keyword_spotting/hydra_config.py
@@ -0,0 +1,35 @@
+#
+#
+# LeNet Audio Hydra configuration
+#
+#
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class LenetAudioHydraConf:
+
+    optimizer: str = 'adam'
+    momentum: float = 0.9
+    weight_decay: float = 0.0
+    seed: int = 2021
+    epochs: int = 80
+    batch_size: int = 12
+    patience: int = 10
+    lr: float = 0.001
+
+    window_size: float = 3.0  # seconds
+    sampling_rate: int = 16000  # Hz
+
+    cuda: bool = True
+    balance: bool = True
+
+    num_classes: int = 5
+
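+    # `_target_` lets Hydra build this trainer via hydra.utils.instantiate.
+    # A minimal sketch (assuming a Hydra app has already composed a config
+    # node `cfg` from this dataclass):
+    #     trainer = hydra.utils.instantiate(cfg)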
+    _target_: str = field(init=False, repr=False, default="gymnos.audio.audio_classification.lenet_audio."
+                                                          "trainer.LenetAudioTrainer")
\ No newline at end of file
diff --git a/src/keyword_spotting/loader.py b/src/keyword_spotting/loader.py
new file mode 100644
index 0000000..78bd36d
--- /dev/null
+++ b/src/keyword_spotting/loader.py
@@ -0,0 +1,280 @@
+# Fernando López Gavilánez, 2023
+
+import random
+import json
+import os
+
+import torch
+import torchaudio
+import numpy as np
+import pandas as pd
+
+
+def load_train_partitions(path, window_size=24000, fs=16000):
+
+    # Augments (enabled for training and validation)
+    augments = ['white_noise']
+    augments = {key: True for key in augments}
+
+    # Read class index
+    with open(path + '/metadata/classes_index.json') as f:
+        class_index = json.load(f)
+
+    # Load train set
+    train_df = pd.read_csv(path + '/metadata/train.tsv', sep='\t')
+    train_ID_list = list(train_df['Sample_ID'])
+
+    # Load validation set
+    validation_df = pd.read_csv(path + '/metadata/dev.tsv', sep='\t')
+    validation_ID_list = list(validation_df['Sample_ID'])
+
+    # Generate Datasets
+    train_dataset = AudioDataset(
+        path,
+        train_ID_list,
+        train_df,
+        class_index,
+        window_size=window_size,
+        fs=fs,
+        augments=augments
+    )
+    validation_dataset = AudioDataset(
+        path,
+        validation_ID_list,
+        validation_df,
+        class_index,
+        window_size=window_size,
+        fs=fs,
+        augments=augments
+    )
+
+    return train_dataset, validation_dataset
+
+
+def load_test_partition(path, window_size=24000, fs=16000, augments=None):
+
+    # Augments (disabled by default for evaluation)
+    if augments is None:
+        augments = {'white_noise': False}
+
+    # Read class index
+    with open(path + '/metadata/classes_index.json') as f:
+        class_index = json.load(f)
+
+    # Load test set
+    test_df = pd.read_csv(path + '/metadata/test.tsv', sep='\t')
+    test_ID_list = list(test_df['Sample_ID'])
+
+    # Generate Datasets
+    test_dataset = AudioDataset(
+        path,
+        test_ID_list,
+        test_df,
+        class_index,
+        window_size=window_size,
+        fs=fs,
+        augments=augments
+    )
+
+    return test_dataset
+
+
+class AudioDataset(torch.utils.data.Dataset):
+    """
+    Torch dataset for lazy loading.
+    """
+    def __init__(self, path, list_IDs, dataframe, class_index, window_size=24000, fs=16000, augments=None):
+
+        # Data path
+        self.path = path
+
+        self.window_size = window_size
+        self.fs = fs  # Hz
+
+        # Data information
+        self.list_IDs = list_IDs
+        self.dataframe = dataframe
+        self.n_samples = len(list_IDs)
+        self.class_index = class_index
+
+        # Data augments
+        self.augments = [k for k, v in (augments or {}).items() if v]
+        self.white_noise = bool(augments and augments.get('white_noise'))
+
+        # Number of classes
+        self.classes = self.get_unique_classes()
+        self.n_classes = len(self.classes)
+
+    def __len__(self):
+        """
+        Return the number of samples in the dataset.
+        """
+        return len(self.list_IDs)
+
+    def get_unique_classes(self):
+        """
+        Get unique classes from a multilabel dataset.
+        """
+        return list(self.dataframe['Label'].unique())
+
+    def get_class_weights(self):
+        """
+        Get class weights.
+        """
+        # Calculate the real value counts for each class
+        unique_classes = self.get_unique_classes()
+        real_counts = dict.fromkeys(unique_classes, 0)
+        value_counts = self.dataframe['Label'].value_counts().to_dict()
+
+        for unique_class in unique_classes:
+            for value_count in value_counts:
+                if unique_class in value_count:
+                    real_counts[unique_class] += value_counts[value_count]
+
+        # Calculate class weights
+        class_weights = dict.fromkeys(unique_classes, 1.0)
+        for unique_class in unique_classes:
+            class_weights[unique_class] /= real_counts[unique_class]
+
+        return class_weights
+
+    def get_sample_weights(self):
+        class_weights = self.get_class_weights()
+        samples_labels = self.dataframe['Label_Index']
+
+        # Get a dict with the class indexes and weights
+        index_weights = {}
+        for index in self.class_index:
+            index_weights[index] = class_weights[self.class_index[index]]
+
+        samples_weight = np.zeros(len(samples_labels))
+
+        iterator = 0
+        for sample_label in samples_labels:
+            samples_weight[iterator] += index_weights[str(sample_label)]
+            iterator += 1
+
+        return samples_weight
+
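+    # The sample weights above pair with torch.utils.data.WeightedRandomSampler
+    # to rebalance the classes; a minimal sketch (mirroring trainer.py):
+    #
+    #     sampler = torch.utils.data.WeightedRandomSampler(
+    #         torch.from_numpy(dataset.get_sample_weights()),
+    #         num_samples=1000, replacement=True)  # num_samples: illustrative
+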
+    def get_number_of_classes(self):
+        return self.n_classes
+
+    def __repr__(self):
+        """
+        Data information
+        """
+        repr_str = (
+            "Number of samples: " + str(self.n_samples) + "\n"
+            "Window size: " + str(self.window_size) + "\n"
+            "Augments: " + str(self.augments) + "\n"
+            "Sampling rate: " + str(self.fs) + "\n"
+        )
+        return repr_str
+
+    def __getitem__(self, index):
+        """
+        Get a single sample
+        Args:
+            index: index to recover a single sample
+        Returns:
+            ID, x, y: sample identifier, extracted features and label
+        """
+        # Select sample
+        ID = self.list_IDs[index]
+
+        # Read audio
+        start = 0.0
+        end = self.dataframe.set_index('Sample_ID').at[ID, 'Audio_Length']
+        audio_path = self.dataframe.set_index('Sample_ID').at[ID, 'Sample_Path']
+        audio = self.__read_wav(audio_path, start, end)
+
+        # Prepare audio
+        audio = self.__prepare_audio(audio)
+
+        # Get label
+        label = self.dataframe.set_index('Sample_ID').at[ID, 'Label_Index']
+
+        # One-hot encoding
+        target = self.__one_hot_encoding(label)
+
+        return ID, audio, target
+
+    def __one_hot_encoding(self, label):
+        target = torch.eye(len(self.class_index))[int(label)]
+        return target.float()
+
+    def __read_wav(self, filepath, start, end):
+        """
+        Read an audio file, normalizing with respect to the maximum of the signal
+        Args:
+            filepath: audio file path
+        Returns:
+            audio_signal: numpy array containing the audio signal
+        """
+        audio_signal, _ = torchaudio.load(
+            os.path.join(self.path, filepath),
+            frame_offset=int(start*self.fs),
+            num_frames=int((end-start)*self.fs)
+        )
+        audio_signal = audio_signal[0].numpy()
+
+        if np.max(np.abs(audio_signal)) == 0.0:
+            print('Problem with audio: {} start at {} and end at {}'.format(filepath, start, end))
+
+        audio_signal = self.__normalize_audio(audio_signal)
+        return audio_signal
+
+    def __normalize_audio(self, audio, eps=0.001):
+        """
+        Peak normalization.
+        """
+        return (audio.astype(np.float32) / float(np.amax(np.abs(audio)))) + eps
+
+    def __prepare_audio(self, audio_signal):
+        """
+        Adapt an audio clip to the window size: crop if larger, pad if shorter.
+        """
+
+        # Adapt sample to window size
+        audio_length = audio_signal.shape[0]
+        if audio_length >= self.window_size:
+
+            # If the audio is bigger than the window size, use a random crop (random shift)
+            left_bound = random.randint(0, audio_length - self.window_size)
+            right_bound = left_bound + self.window_size
+            audio_signal = audio_signal[left_bound:right_bound]
+
+        else:
+            # If the audio is smaller than the window size, pad the original signal with zeros
+            padding = self.window_size - audio_length
+            bounds_sizes = np.random.multinomial(padding, np.ones(2)/2, size=1)[0]
+            audio_signal = np.pad(
+                audio_signal,
+                (bounds_sizes[0], bounds_sizes[1]),
+                'constant',
+                constant_values=(0, 0)
+            )
+
+        # Add white noise
+        if self.white_noise:
+            noise = (np.random.normal(0, 1.3, len(audio_signal))*32).astype('int16')
+            noise = noise.astype(np.float32) / float(np.iinfo(np.int16).max)
+            audio_signal += noise
+
+        return audio_signal
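+
+
+if __name__ == '__main__':
+    # Minimal smoke test (a sketch): assumes a dataset root './data' with
+    # metadata/train.tsv, metadata/dev.tsv and metadata/classes_index.json,
+    # laid out as the loaders above expect.
+    train_dataset, validation_dataset = load_train_partitions('./data')
+    sample_id, audio, target = train_dataset[0]
+    print(train_dataset)
+    print(audio.shape, target.shape)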
diff --git a/src/keyword_spotting/model.py b/src/keyword_spotting/model.py
new file mode 100644
index 0000000..67901a3
--- /dev/null
+++ b/src/keyword_spotting/model.py
@@ -0,0 +1,78 @@
+# Fernando López Gavilánez, 2023
+
+import math
+import torch
+import torch.nn as nn
+import torchaudio
+
+
+class SerializableModule(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+
+    def save(self, filename):
+        torch.save(self.state_dict(), filename + '.pt')
+
+    def save_entire_model(self, filename):
+        torch.save(self, filename + '_entire.pt')
+
+    def save_scripted(self, filename):
+        scripted_module = torch.jit.script(self)
+        scripted_module.save(filename + '.zip')
+
+    def load(self, filename):
+        self.load_state_dict(torch.load(filename, map_location=lambda storage, loc: storage))
+
+
+class LeNetAudio(SerializableModule):
+
+    def __init__(self, num_classes, window_size=24000):
+        super().__init__()
+
+        # Mel-Spectrogram
+        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
+            sample_rate=16000,
+            n_fft=512,
+            win_length=320,
+            hop_length=160,
+            n_mels=40
+        )
+
+        self.features = nn.Sequential(
+            nn.InstanceNorm2d(1),
+            nn.Conv2d(1, 16, kernel_size=5),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+            nn.Conv2d(16, 32, kernel_size=5),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+            nn.Dropout(0.2)
+        )
+
+        f_bins = 40
+        t_bins = int(window_size/160) + 1
+        f_r = self.__get_size(f_bins)
+        t_r = self.__get_size(t_bins)
+
+        self.classifier = nn.Sequential(
+            nn.Linear(32 * f_r * t_r, 100),
+            nn.Dropout(0.5),
+            nn.ReLU(),
+            nn.Linear(100, num_classes)
+        )
+
+    def forward(self, x):
+        x = x.unsqueeze(1)  # [b, ch, t]
+        x = self.mel_spectrogram(x)  # [b, ch, f_b, t_b]
+        x = self.features(x)
+        x = x.view(x.shape[0], -1)  # [b, ch*f_b*t_b]
+        x = torch.sigmoid(self.classifier(x))  # [b, num_classes]
+        return x
+
+    def __get_size(self, in_dim):
+        return int(math.floor((((in_dim-4)/2)-4)/2))
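+
+    # Worked example of __get_size: 40 mel bins -> (40-4)/2 = 18 -> (18-4)/2
+    # gives f_r = 7; the default 24000-sample window gives t_bins = 151 and
+    # t_r = 34, so the flattened feature size is 32 * 7 * 34 = 7616.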
diff --git a/src/keyword_spotting/trainer.py b/src/keyword_spotting/trainer.py
new file mode 100644
index 0000000..7de5be3
--- /dev/null
+++ b/src/keyword_spotting/trainer.py
@@ -0,0 +1,409 @@
+# Fernando López Gavilánez, 2023
+
+import torch
+import numpy as np
+import mlflow
+from sklearn.metrics import roc_curve, roc_auc_score, classification_report
+
+import logging
+import inspect
+import time
+import random
+
+from dataclasses import dataclass
+
+from ....base import BaseTrainer
+from .hydra_config import LenetAudioHydraConf
+from . import loader
+from .model import LeNetAudio
+
+
+@dataclass
+class LenetAudioTrainer(LenetAudioHydraConf, BaseTrainer):
+    """
+    Trainer for the LeNetAudio keyword-spotting model (prepare, train, test).
+    """
+
+    def __post_init__(self):
+
+        self._set_seed()
+
+        self._model = LeNetAudio(
+            self.num_classes,
+            window_size=int(self.window_size*self.sampling_rate)
+        )
+
+        # Select device
+        if torch.cuda.is_available() and self.cuda:
+            self._device = torch.device("cuda")
+        else:
+            self._device = torch.device("cpu")
+
+        self._model.to(self._device)
+
+    def prepare_data(self, root):
+        self.train_dataset, self.dev_dataset = loader.load_train_partitions(
+            root,
+            window_size=int(self.window_size*self.sampling_rate)
+        )
+        self.test_dataset = loader.load_test_partition(
+            root,
+            window_size=int(self.window_size*self.sampling_rate)
+        )
+
+    def train(self):
+
+        logger = logging.getLogger(__name__)
+
+        logger.info(inspect.cleandoc(f'''Starting training:
+            Epochs: {self.epochs}
+            Batch size: {self.batch_size}
+            Learning rate: {self.lr}
+            Training samples: {self.train_dataset.n_samples}
+            Validation samples: {self.dev_dataset.n_samples}
+            Device: {self._device.type}
+            Optimizer: {self.optimizer}
+            Dataset classes: {self.train_dataset.classes}
+            Balance: {self.balance}
+            Patience: {self.patience}
+            Cuda: {self.cuda}
+        '''))
+
+        # Optimizer
+        if self.optimizer.lower() == 'adam':
+            optimizer = torch.optim.Adam(self._model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
+        elif self.optimizer.lower() == 'rmsprop':
+            optimizer = torch.optim.RMSprop(self._model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
+        elif 'sgd' in self.optimizer.lower():
+            optimizer = torch.optim.SGD(self._model.parameters(), lr=self.lr, momentum=self.momentum, weight_decay=self.weight_decay)
+        else:
+            raise ValueError('Unknown optimizer: {}'.format(self.optimizer))
+
+        # Loss function
+        criterion = torch.nn.CrossEntropyLoss()
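+        # Note: targets arrive one-hot encoded from the dataset; CrossEntropyLoss
+        # accepts such class-probability targets (PyTorch >= 1.10).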
+
+        # If balancing the dataset: use a weighted random sampler
+        if self.balance:
+            samples_per_class = 100  # empirical value
+            sampler = torch.utils.data.WeightedRandomSampler(
+                torch.from_numpy(self.train_dataset.get_sample_weights()),
+                num_samples=self.train_dataset.n_classes*samples_per_class,
+                replacement=True
+            )
+            shuffle_train = False
+        else:
+            sampler = None
+            shuffle_train = True
+
+        # Generate data loaders
+        train_loader = torch.utils.data.DataLoader(
+            self.train_dataset,
+            shuffle=shuffle_train,
+            batch_size=self.batch_size,
+            drop_last=False,
+            sampler=sampler
+        )
+
+        validation_loader = torch.utils.data.DataLoader(
+            self.dev_dataset,
+            shuffle=True,
+            batch_size=self.batch_size,
+            drop_last=False
+        )
+
+        # Metrics
+        train_losses = []
+        train_accuracies = []
+        validation_losses = []
+        validation_accuracies = []
+        epoch_times = []
+
+        # Early stopping
+        best_loss_validation = np.inf
+        patience_counter = 0
+
+        # Get trainable parameters
+        trainable_params = sum(p.numel() for p in self._model.parameters() if p.requires_grad)
+        logger.info('Number of trainable parameters: ' + str(trainable_params))
+
+        # To present intermediate information
+        n_intermediate_steps = int(len(train_loader)/3)
+
+        # Start training loop
+        logger.info('Starting training...')
+        for epoch in range(1, self.epochs+1):
+            start_time = time.process_time()
+
+            # Train model
+            train_loss = 0.0
+            train_accuracy = 0.0
+            counter = 0
+
+            self._model.train()
+            for _, x, target in train_loader:
+                counter += 1
+                self._model.zero_grad()
+
+                # Model forward
+                out = self._model(x.to(self._device).float())
+
+                # Backward and optimization
+                loss = criterion(out, target.to(self._device))
+                loss.backward()
+                optimizer.step()
+
+                # Store metrics
+                train_loss += loss.item()
+                train_accuracy += self._right_predictions(out, target)
+
+                # Present intermediate results
+                if counter % n_intermediate_steps == 0:
+                    logger.info("Epoch {}......Step: {}/{}....... Average Loss for Step: {} | Accuracy: {}".format(
+                        epoch,
+                        counter,
+                        len(train_loader),
+                        round(train_loss/counter, 4),
+                        round(train_accuracy/(counter*self.batch_size), 4)
+                    ))
+
+            # Validate model
+            validation_loss = 0.0
+            validation_accuracy = 0.0
+
+            with torch.no_grad():
+                self._model.eval()
+                for _, x, target in validation_loader:
+
+                    out = self._model(x.to(self._device).float())
+
+                    # Store metrics: loss
+                    loss = criterion(out, target.to(self._device))
+                    validation_loss += loss.item()
+                    validation_accuracy += self._right_predictions(out, target)
+
+            # Calculate average losses
+            train_loss = train_loss/len(train_loader)
+            train_accuracy = train_accuracy/len(train_loader.sampler)
+            validation_loss = validation_loss/len(validation_loader)
+            validation_accuracy = validation_accuracy/len(validation_loader.sampler)
+            train_losses.append(train_loss)
+            train_accuracies.append(train_accuracy)
+            validation_losses.append(validation_loss)
+            validation_accuracies.append(validation_accuracy)
+
+            mlflow.log_metrics({
+                "train/loss": train_loss,
+                "train/accuracy": train_accuracy,
+                "val/loss": validation_loss,
+                "val/accuracy": validation_accuracy,
+            }, step=epoch)
+            mlflow.log_metric("epoch", epoch)
+
+            # Print epoch information
+            current_time = time.process_time()
+            logger.info("")
+            logger.info("Epoch {}/{} Done.".format(epoch, self.epochs))
+            logger.info("\t Train Loss: {} | Train Accuracy: {}".format(train_loss, train_accuracy))
+            logger.info("\t Validation Loss: {} | Validation Accuracy: {}".format(validation_loss, validation_accuracy))
+            logger.info("\t Time Elapsed for Epoch: {} seconds".format(str(current_time-start_time)))
+            logger.info("")
+            epoch_times.append(current_time-start_time)
+
+            # Early stopping
+            if best_loss_validation <= validation_loss:
+                patience_counter += 1
+
+                logger.info('Validation loss did not improve {:.3f} vs {:.3f}. Patience {}/{}.'.format(
+                    validation_loss,
+                    best_loss_validation,
+                    patience_counter,
+                    self.patience
+                ))
+                logger.info("")
+
+                if patience_counter == self.patience:
+                    logger.info('Breaking train loop: out of patience')
+                    logger.info("")
+                    break
+            else:
+                # Reinitialize the patience counter and save the model
+                patience_counter = 0
+                best_loss_validation = validation_loss
+                self._model.save("lenet")
+                mlflow.log_artifact("lenet.pt")
+
+        logger.info("Total Training Time: {} seconds".format(str(sum(epoch_times))))
+
+    def test(self):
+
+        logger = logging.getLogger(__name__)
+
+        # Loss function
+        criterion = torch.nn.CrossEntropyLoss()
+
+        test_loader = torch.utils.data.DataLoader(
+            self.test_dataset,
+            shuffle=False,
+            batch_size=self.batch_size,
+            drop_last=False
+        )
+
+        # Metrics initialization
+        test_loss = 0.0
+        test_accuracy = 0.0
+
+        labels = []
+        predictions = []
+
+        # To present intermediate information
+        n_intermediate_steps = int(len(test_loader)/2)
+        counter = 0
+
+        with torch.no_grad():
+
+            self._model.eval()
+
+            for ID, x, target in test_loader:
+                counter += 1
+
+                # Model forward
+                out = self._model(x.to(self._device).float())
+
+                # Store metrics: loss and accuracy
+                loss = criterion(out, target.to(self._device))
+                test_loss += loss.item()
+                test_accuracy += self._right_predictions(out, target)
+
+                # Present intermediate results
+                if counter % n_intermediate_steps == 0:
+                    logger.info("Testing......Step: {}/{}....... Average Loss for Step: {} | Accuracy: {}".format(
+                        counter,
+                        len(test_loader),
+                        round(test_loss/counter, 4),
+                        round(test_accuracy/(counter*self.batch_size), 4)
+                    ))
+
+                # Store labels and predictions
+                labels += target.squeeze().tolist()
+                predictions += out.squeeze().tolist()
+
+        # Calculate average losses and accuracies
+        test_loss = test_loss/len(test_loader)
+        test_accuracy = test_accuracy/len(test_loader.sampler)
+
+        mlflow.log_metrics({
+            "test/loss": test_loss,
+            "test/accuracy": test_accuracy,
+        })
+
+        # Classification report
+        target_names = []
+        classes_index = self.test_dataset.class_index
+        for index in range(len(classes_index)):
+            target_names.append(classes_index[str(index)])
+        report = self._get_metrics(labels, predictions, target_names=target_names)
+        mlflow.log_dict(report, "classification_report.json")
+
+    def _right_predictions(self, out, label):
+        """
+        Count the samples whose outputs, thresholded at 0.5,
+        exactly match the (one-hot) target labels:
+        a multi-label exact-match criterion.
+        """
+        bool_out = out.data.cpu() > 0.5
+        bool_label = label > 0.5
+
+        counter = 0
+        for i in range(out.shape[0]):
+
+            if torch.all(torch.eq(bool_out[i], bool_label[i])):
+                counter += 1
+
+        return counter
+
+    def _get_metrics(self, labels, outputs, target_names=None, optimal_thresholds=None):
+        """
+        Get classification metrics
+        Args:
+            - labels: target labels
+            - outputs: scores obtained from the model
+            - target_names: used for the classification report
+            - optimal_thresholds: if defined, thresholds aren't recalculated
+        Returns:
+            - metrics: dict with metrics
+            - opt_thresholds: optimal thresholds
+        """
+        auc_list = []
+        opt_thresholds = []
+        labels_array = np.array(labels)
+        outputs_array = np.array(outputs)
+        n_classes = labels_array.shape[1]
+
+        # Get the AUC and the optimal threshold for each class
+        for i in range(n_classes):
+            true_labels, pred_scores = labels_array[:, i], outputs_array[:, i]
+            fpr, tpr, thresholds, auc = self._compute_auc(true_labels, pred_scores)
+            optimal_threshold, _ = self._get_youden_threshold(tpr, fpr, thresholds)
+            auc_list.append(auc)
+            opt_thresholds.append(optimal_threshold)
+
+        # If thresholds are already given (arg), ignore the calculated ones
+        if optimal_thresholds:
+            opt_thresholds = optimal_thresholds
+
+        # Apply thresholds
+        predicted_labels = self._scores_to_labels(outputs_array, opt_thresholds)
+
+        # Classification Report
+        report = classification_report(labels, predicted_labels, target_names=target_names, output_dict=True)
+
+        # Add the optimal threshold and AUC to the metrics dict
+        for index in range(n_classes):
+            report[target_names[index]]['AUC'] = auc_list[index]
+            report[target_names[index]]['Optimal_threshold'] = opt_thresholds[index]
+
+        return report
+
+    def _compute_auc(self, true_labels, pred_scores):
+        """
+        Computes the ROC AUC
+        Args:
+            true_labels: target labels
+            pred_scores: model output
+        """
+        fpr, tpr, thresholds = roc_curve(true_labels, pred_scores)
+        roc_auc = roc_auc_score(true_labels, pred_scores)
+
+        return fpr, tpr, thresholds, roc_auc
+
+    def _get_youden_threshold(self, tpr, fpr, thresholds):
+        "Calculates the optimal threshold using Youden's index"
+        J = tpr - fpr
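+        # e.g. tpr=[0.2, 0.8, 0.9] and fpr=[0.1, 0.3, 0.7] give J=[0.1, 0.5, 0.2],
+        # so the threshold at the second operating point is chosen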
+        threshold_index = np.argmax(J)
+        optimal_threshold = thresholds[threshold_index]
+        return optimal_threshold, threshold_index
+
+    def _scores_to_labels(self, outputs_array, thresholds):
+        """
+        Transform scores into labels using a different threshold per class.
+        """
+        predicted_labels = torch.zeros(outputs_array.shape).float()
+        for i in range(outputs_array.shape[1]):
+            predicted_labels[:, i] = (torch.tensor(outputs_array[:, i]) > thresholds[i]).float()
+
+        return predicted_labels.data.numpy().tolist()
+
+    def _set_seed(self):
+        """
+        Fix the seed of torch, numpy and random.
+        """
+        torch.manual_seed(self.seed)
+        np.random.seed(self.seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed(self.seed)
+            torch.backends.cudnn.deterministic = True
+        random.seed(self.seed)
\ No newline at end of file
diff --git a/src/train.py b/src/train.py
new file mode 100644
index 0000000..2c95df2
--- /dev/null
+++ b/src/train.py
@@ -0,0 +1,13 @@
+
+
+# Use here the rest of the elements to train the model:
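+#
+# A minimal sketch (assumes this package is importable as keyword_spotting and
+# that a dataset root './data' follows the layout expected by the loader):
+#
+#     from keyword_spotting.trainer import LenetAudioTrainer
+#
+#     trainer = LenetAudioTrainer()
+#     trainer.prepare_data('./data')
+#     trainer.train()
+#     trainer.test()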