From d8d977e31cae023568e3023e87fc164920441564 Mon Sep 17 00:00:00 2001
From: Aleksey Zhelo
Date: Mon, 24 Aug 2020 22:36:57 +0300
Subject: [PATCH] Added format-based dataset class switch, removed scp: prefix from paths

---
 .../calculate_accuracy.py                     | 12 +++--
 acoustic_word_embeddings/calculate_ap.py      | 20 ++++----
 .../core/average_precision.py                 |  2 +-
 .../core/gru_classifier.py                    |  2 +-
 .../core/lstm_classifier.py                   |  2 +-
 acoustic_word_embeddings/core/siamese_gru.py  |  4 +-
 acoustic_word_embeddings/core/siamese_lstm.py |  4 +-
 .../core/util/net_util.py                     |  6 +--
 acoustic_word_embeddings/gen_embeddings.py    | 16 ++++---
 acoustic_word_embeddings/nets/gru_fc_base.py  |  4 +-
 acoustic_word_embeddings/nets/lstm_fc_base.py |  2 +-
 acoustic_word_embeddings/nets/model.py        |  2 +-
 acoustic_word_embeddings/train_classifier.py  | 20 +++++---
 acoustic_word_embeddings/train_siamese.py     | 10 ++--
 auto_rating/collect_manual_rating_data.py     |  2 +-
 .../rs_accuracy_analysis_by_example_count.py  |  3 --
 auto_rating/rs_analysis.py                    |  3 --
 base/common.py                                |  2 +-
 base/data_io/dataset.py                       | 31 ++++++++++++-
 base/data_io/dataset2lmdb.py                  |  1 -
 base/data_io/kaldi_dataset.py                 | 46 ++++++++-----------
 base/data_io/lmdb_dataset.py                  |  3 +-
 .../collect_independent_words.py              |  4 +-
 .../prepare_independent_words_dataset.py      |  4 +-
 dataset_prep/core/cleaning.py                 |  2 +-
 dataset_prep/split_train_dev_test.py          |  4 +-
 26 files changed, 120 insertions(+), 91 deletions(-)

diff --git a/acoustic_word_embeddings/calculate_accuracy.py b/acoustic_word_embeddings/calculate_accuracy.py
index 56a065f..1d5c73e 100644
--- a/acoustic_word_embeddings/calculate_accuracy.py
+++ b/acoustic_word_embeddings/calculate_accuracy.py
@@ -1,13 +1,13 @@
 import numpy as np
 from sklearn.neighbors import KNeighborsClassifier
 
-from acoustic_word_embeddings.core.util.common import load_embeddings
 from acoustic_word_embeddings.core.loss.embedding_loss import loss_name2class
+from acoustic_word_embeddings.core.util.common import load_embeddings
 from acoustic_word_embeddings.core.util.net_util import read_embedding_loss, load_net
 from acoustic_word_embeddings.gen_embeddings import get_or_generate_embeddings
 from acoustic_word_embeddings.train_classifier import process_classifier_epoch
 from base.common import get_dataset_paths
-from base.data_io.kaldi_dataset import KaldiDataset
+from base.data_io.dataset import get_dataset_class_for_path
 from conf import current_dataset
 
 
@@ -91,15 +91,17 @@ def do_calculate_accuracy(run_dir, epoch, is_classifier, dataset=None, partition
     if dataset is None:
         dataset = current_dataset
     train_path, dev_path, test_path = get_dataset_paths(dataset)
+    # noinspection PyPep8Naming
+    DatasetClass = get_dataset_class_for_path(train_path, logger=None)
 
     if partition == 'train':
-        dataset = KaldiDataset('scp:' + train_path, parent_dataset_path=train_scp, training=False, logger=None,
+        dataset = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
                                mean_subtraction=mean_sub, variance_normalization=var_norm)
     if partition == 'dev':
-        dataset = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_scp, training=False, logger=None,
+        dataset = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
                                mean_subtraction=mean_sub, variance_normalization=var_norm)
     if partition == 'test':
-        dataset = KaldiDataset('scp:' + test_path, parent_dataset_path=train_scp, training=False, logger=None,
+        dataset = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
                                mean_subtraction=mean_sub, variance_normalization=var_norm)
 
     # TODO: no automatic detection for batch_first and data_parallel
diff --git a/acoustic_word_embeddings/calculate_ap.py b/acoustic_word_embeddings/calculate_ap.py
index 8084e90..cf26653 100644
--- a/acoustic_word_embeddings/calculate_ap.py
+++ b/acoustic_word_embeddings/calculate_ap.py
@@ -2,12 +2,12 @@
 
 import numpy as np
 
-from acoustic_word_embeddings.core.util.args_util import parse_load_epoch_args
 from acoustic_word_embeddings.core.average_precision import average_precision
+from acoustic_word_embeddings.core.util.args_util import parse_load_epoch_args
 from acoustic_word_embeddings.core.util.net_util import load_net
 from acoustic_word_embeddings.gen_embeddings import get_siamese_embeddings, get_classifier_embeddings
 from base.common import get_dataset_paths
-from base.data_io.kaldi_dataset import KaldiDataset
+from base.data_io.dataset import get_dataset_class_for_path
 from conf import current_dataset
 
 
@@ -35,6 +35,8 @@ def do_calculate_ap(run_dir, epoch, dataset=None, partition='dev'):
         dataset = current_dataset
     train_path, dev_path, test_path = get_dataset_paths(dataset)
+    # noinspection PyPep8Naming
+    DatasetClass = get_dataset_class_for_path(train_path, logger=None)
 
     if len(checkpoints) == 0:
         print('No checkpoints found in {0} for run {1}'.format(checkpoint_dir, run_dir))
@@ -42,18 +44,18 @@
         sys.exit(-1)
 
     if partition == 'train':
-        data_train = KaldiDataset('scp:' + train_path, parent_dataset_path=train_scp, training=False, logger=None,
+        data_train = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
                                   mean_subtraction=mean_sub, variance_normalization=var_norm)
 
         return get_epoch_ap(net, config, checkpoints, loss, data_train, epoch, get_embeddings, subsample_size=3000)
 
     if partition == 'dev':
-        data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_scp, training=False, logger=None,
+        data_dev = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
                                 mean_subtraction=mean_sub, variance_normalization=var_norm)
 
         return get_epoch_ap(net, config, checkpoints, loss, data_dev, epoch, get_embeddings)
 
     if partition == 'test':
-        data_test = KaldiDataset('scp:' + test_path, parent_dataset_path=train_scp, training=False, logger=None,
+        data_test = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
                                  mean_subtraction=mean_sub, variance_normalization=var_norm)
 
         return get_epoch_ap(net, config, checkpoints, loss, data_test, epoch, get_embeddings)
@@ -85,6 +87,8 @@ def __main(run_dir, dataset=None, for_epochs=None, gen_train=False, gen_dev=True
         dataset = current_dataset
     train_path, dev_path, test_path = get_dataset_paths(dataset)
+    # noinspection PyPep8Naming
+    DatasetClass = get_dataset_class_for_path(train_path, logger=None)
 
     if len(checkpoints) == 0:
         print('No checkpoints found in {0} for run {1}'.format(checkpoint_dir, run_dir))
@@ -95,18 +99,18 @@
         for_epochs = sorted(list(checkpoints.keys()))
 
     if gen_train:
-        data_train = KaldiDataset('scp:' + train_path, parent_dataset_path=train_scp, training=False, logger=None,
+        data_train = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
                                   mean_subtraction=mean_sub, variance_normalization=var_norm)
 
         _print_ap_per_epoch(net, config, checkpoints, loss, data_train, 'train', for_epochs, get_embeddings,
                             subsample_size=3000)
 
     if gen_dev:
-        data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_scp, training=False, logger=None,
+        data_dev = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
                                 mean_subtraction=mean_sub, variance_normalization=var_norm)
 
         _print_ap_per_epoch(net, config, checkpoints, loss, data_dev, 'dev', for_epochs, get_embeddings)
 
     if gen_test:
-        data_test = KaldiDataset('scp:' + test_path, parent_dataset_path=train_scp, training=False, logger=None,
+        data_test = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
                                  mean_subtraction=mean_sub, variance_normalization=var_norm)
 
         _print_ap_per_epoch(net, config, checkpoints, loss, data_test, 'test', for_epochs, get_embeddings)
diff --git a/acoustic_word_embeddings/core/average_precision.py b/acoustic_word_embeddings/core/average_precision.py
index 093fe75..5f3f578 100644
--- a/acoustic_word_embeddings/core/average_precision.py
+++ b/acoustic_word_embeddings/core/average_precision.py
@@ -1,6 +1,6 @@
 import numpy as np
-from scipy.special import comb
 from scipy.spatial.distance import pdist
+from scipy.special import comb
 
 
 def test_chance_level(labels):
diff --git a/acoustic_word_embeddings/core/gru_classifier.py b/acoustic_word_embeddings/core/gru_classifier.py
index 0b892ed..478c792 100644
--- a/acoustic_word_embeddings/core/gru_classifier.py
+++ b/acoustic_word_embeddings/core/gru_classifier.py
@@ -1,9 +1,9 @@
 import torch
 import torch.nn as nn
 
-from base.settings import Settings
 from acoustic_word_embeddings.nets.common import hidden2fc_input
 from acoustic_word_embeddings.nets.gru_fc_base import GRU_FC_base
+from base.settings import Settings
 
 
 class GRUClassifier(GRU_FC_base):
diff --git a/acoustic_word_embeddings/core/lstm_classifier.py b/acoustic_word_embeddings/core/lstm_classifier.py
index 4a1dc68..9c050bc 100644
--- a/acoustic_word_embeddings/core/lstm_classifier.py
+++ b/acoustic_word_embeddings/core/lstm_classifier.py
@@ -1,9 +1,9 @@
 import torch
 import torch.nn as nn
 
-from base.settings import Settings
 from acoustic_word_embeddings.nets.common import hidden2fc_input
 from acoustic_word_embeddings.nets.lstm_fc_base import LSTM_FC_base
+from base.settings import Settings
 
 
 class LSTMClassifier(LSTM_FC_base):
diff --git a/acoustic_word_embeddings/core/siamese_gru.py b/acoustic_word_embeddings/core/siamese_gru.py
index 98691c9..988b5fe 100644
--- a/acoustic_word_embeddings/core/siamese_gru.py
+++ b/acoustic_word_embeddings/core/siamese_gru.py
@@ -2,10 +2,10 @@
 import torch
 
 from acoustic_word_embeddings.core.loss.embedding_loss import triplet_loss_offline
-from base import util
-from base.settings import Settings
 from acoustic_word_embeddings.nets.common import hidden2fc_input
 from acoustic_word_embeddings.nets.gru_fc_base import GRU_FC_base
+from base import util
+from base.settings import Settings
 
 
 class SiameseGRU(GRU_FC_base):
diff --git a/acoustic_word_embeddings/core/siamese_lstm.py b/acoustic_word_embeddings/core/siamese_lstm.py
index e2af18f..73cb123 100644
--- a/acoustic_word_embeddings/core/siamese_lstm.py
+++ b/acoustic_word_embeddings/core/siamese_lstm.py
@@ -2,10 +2,10 @@
 import torch
 
 from acoustic_word_embeddings.core.loss.embedding_loss import triplet_loss_offline
-from base import util
-from base.settings import Settings
 from acoustic_word_embeddings.nets.common import hidden2fc_input
 from acoustic_word_embeddings.nets.lstm_fc_base import LSTM_FC_base
+from base import util
+from base.settings import Settings
 
 
 class SiameseLSTM(LSTM_FC_base):
diff --git a/acoustic_word_embeddings/core/util/net_util.py b/acoustic_word_embeddings/core/util/net_util.py
index 43f05d6..c82c29a 100644
--- a/acoustic_word_embeddings/core/util/net_util.py
+++ b/acoustic_word_embeddings/core/util/net_util.py
@@ -7,15 +7,15 @@
 
 import numpy as np
 
-from acoustic_word_embeddings.core.util.args_util import parse_training_args
-from acoustic_word_embeddings.core.loss.embedding_loss import DistanceBasedLoss
 from acoustic_word_embeddings.core.gru_classifier import GRUClassifier
+from acoustic_word_embeddings.core.loss.embedding_loss import DistanceBasedLoss
 from acoustic_word_embeddings.core.lstm_classifier import LSTMClassifier
 from acoustic_word_embeddings.core.siamese_gru import SiameseGRU
 from acoustic_word_embeddings.core.siamese_lstm import SiameseLSTM
+from acoustic_word_embeddings.core.util.args_util import parse_training_args
+from acoustic_word_embeddings.nets.common import torch_load_unwrapped
 from base import util
 from base.settings import Settings
-from acoustic_word_embeddings.nets.common import torch_load_unwrapped
 from base.util import create_logger
 from conf import awe_runs_dir
 
diff --git a/acoustic_word_embeddings/gen_embeddings.py b/acoustic_word_embeddings/gen_embeddings.py
index 485614e..fd19f60 100644
--- a/acoustic_word_embeddings/gen_embeddings.py
+++ b/acoustic_word_embeddings/gen_embeddings.py
@@ -8,10 +8,10 @@
 from acoustic_word_embeddings.core.util.args_util import parse_gen_args
 from acoustic_word_embeddings.core.util.common import embeddings_dir2dict
 from acoustic_word_embeddings.core.util.net_util import load_net
+from acoustic_word_embeddings.nets.common import torch_load_unwrapped
 from base import util
 from base.common import get_dataset_paths
-from base.data_io.kaldi_dataset import KaldiDataset
-from acoustic_word_embeddings.nets.common import torch_load_unwrapped
+from base.data_io.dataset import get_dataset_class_for_path
 from conf import current_dataset, new_path, processed_data_dir
 
 
@@ -98,22 +98,24 @@ def generate_embeddings(run_dir, dataset=None, gen_train=False, gen_dev=False, g
     if dataset is None:
         dataset = current_dataset
     train_path, dev_path, test_path = get_dataset_paths(dataset)
+    # noinspection PyPep8Naming
+    DatasetClass = get_dataset_class_for_path(train_path, logger=None)
 
     if gen_train:
-        data_train = KaldiDataset('scp:' + train_path, parent_dataset_path=train_scp, training=False, logger=None,
+        data_train = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
                                   mean_subtraction=mean_sub, variance_normalization=var_norm)
     if gen_dev:
-        data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_scp, training=False, logger=None,
+        data_dev = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
                                 mean_subtraction=mean_sub, variance_normalization=var_norm)
     if gen_test:
-        data_test = KaldiDataset('scp:' + test_path, parent_dataset_path=train_scp, training=False, logger=None,
+        data_test = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
                                  mean_subtraction=mean_sub, variance_normalization=var_norm)
     if gen_new:
-        data_new = KaldiDataset('scp:' + new_path, parent_dataset_path=train_scp, training=False, logger=None,
+        data_new = DatasetClass(new_path, parent_dataset_path=train_scp, training=False, logger=None,
                                 mean_subtraction=mean_sub, variance_normalization=var_norm)
     if gen_background:
         background_path = os.path.join(processed_data_dir, 'background_train_v4', 'background_data.scp')
-        data_background = KaldiDataset('scp:' + background_path, parent_dataset_path=train_scp, training=False, logger=None,
+        data_background = DatasetClass(background_path, parent_dataset_path=train_scp, training=False, logger=None,
                                        mean_subtraction=mean_sub, variance_normalization=var_norm)
 
     train_embeddings_dir = os.path.join(run_dir, 'train_embeddings')
diff --git a/acoustic_word_embeddings/nets/gru_fc_base.py b/acoustic_word_embeddings/nets/gru_fc_base.py
index 8626960..8f119aa 100644
--- a/acoustic_word_embeddings/nets/gru_fc_base.py
+++ b/acoustic_word_embeddings/nets/gru_fc_base.py
@@ -3,10 +3,10 @@
 import torch
 import torch.nn as nn
 
-from base.settings import Settings
 from acoustic_word_embeddings.nets.common import hidden2fc_input
-from acoustic_word_embeddings.nets.modules_experimental.custom_gru import CustomGRU, BiRNNMode
 from acoustic_word_embeddings.nets.model import Model
+from acoustic_word_embeddings.nets.modules_experimental.custom_gru import CustomGRU, BiRNNMode
+from base.settings import Settings
 
 
 # noinspection PyPep8Naming
diff --git a/acoustic_word_embeddings/nets/lstm_fc_base.py b/acoustic_word_embeddings/nets/lstm_fc_base.py
index 32f87c4..0e14cda 100644
--- a/acoustic_word_embeddings/nets/lstm_fc_base.py
+++ b/acoustic_word_embeddings/nets/lstm_fc_base.py
@@ -3,9 +3,9 @@
 import torch
 import torch.nn as nn
 
-from base.settings import Settings
 from acoustic_word_embeddings.nets.common import hidden2fc_input
 from acoustic_word_embeddings.nets.model import Model
+from base.settings import Settings
 
 
 # noinspection PyPep8Naming
diff --git a/acoustic_word_embeddings/nets/model.py b/acoustic_word_embeddings/nets/model.py
index 16ea683..200ddf8 100644
--- a/acoustic_word_embeddings/nets/model.py
+++ b/acoustic_word_embeddings/nets/model.py
@@ -1,7 +1,7 @@
 import torch
 
-from base import util
 from acoustic_word_embeddings.nets.common import torch_load_unwrapped
+from base import util
 
 
 class Model(torch.nn.Module):
diff --git a/acoustic_word_embeddings/train_classifier.py b/acoustic_word_embeddings/train_classifier.py
index 33f2e39..af21db6 100644
--- a/acoustic_word_embeddings/train_classifier.py
+++ b/acoustic_word_embeddings/train_classifier.py
@@ -4,11 +4,11 @@
 import numpy as np
 import torch
 
-from acoustic_word_embeddings.core.util.net_util import setup_training_run
 from acoustic_word_embeddings.core.gru_classifier import GRUClassifier
 from acoustic_word_embeddings.core.lstm_classifier import LSTMClassifier
+from acoustic_word_embeddings.core.util.net_util import setup_training_run
 from base.common import get_dataset_paths
-from base.data_io.kaldi_dataset import KaldiDataset
+from base.data_io.dataset import get_dataset_class_for_path
 from conf import current_dataset
 
 
@@ -67,10 +67,13 @@ def __main():
     supplement_rare = getattr(config.general_training, 'supplement_rare_with_noisy', False)
     supplement_seed = getattr(config.general_training, 'supplement_seed', 112)
     train_path, dev_path, _ = get_dataset_paths(current_dataset)
-    data_train = KaldiDataset('scp:' + train_path, logger=logger, noise_multiplier=noise_mult, noise_prob=noise_prob,
+    # noinspection PyPep8Naming
+    DatasetClass = get_dataset_class_for_path(train_path, logger=logger)
+
+    data_train = DatasetClass(train_path, logger=logger, noise_multiplier=noise_mult, noise_prob=noise_prob,
                               mean_subtraction=mean_sub, variance_normalization=var_norm,
                               supplement_rare_with_noisy=supplement_rare, supplement_seed=supplement_seed)
-    data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_path, training=False, logger=logger,
+    data_dev = DatasetClass(dev_path, parent_dataset_path=train_path, training=False, logger=logger,
                             mean_subtraction=mean_sub, variance_normalization=var_norm)
 
     data_parallel = args.gpu_count > 1
@@ -94,7 +97,8 @@ def __main():
                     verbose=True)
 
     # log initial performance level
-    dev_losses, dev_accuracy = process_classifier_epoch(net, config, optimizer, data_dev, batch_first, data_parallel, train=False)
+    dev_losses, dev_accuracy = process_classifier_epoch(net, config, optimizer, data_dev, batch_first, data_parallel,
+                                                        train=False)
     logger.info('Initial avg dev loss = {0:.4f}, dev accuracy = {1:.4f}'.format(np.mean(dev_losses), dev_accuracy))
 
     for epoch in range(config.classifier_training.train_epochs):
@@ -102,9 +106,11 @@ def __main():
                     .format(epoch, [group['lr'] for group in optimizer.param_groups][0]))
 
         start = time.time()
-        train_losses, train_accuracy = process_classifier_epoch(net, config, optimizer, data_train, batch_first, data_parallel,
+        train_losses, train_accuracy = process_classifier_epoch(net, config, optimizer, data_train, batch_first,
+                                                                data_parallel,
                                                                 train=True)
-        dev_losses, dev_accuracy = process_classifier_epoch(net, config, optimizer, data_dev, batch_first, data_parallel,
+        dev_losses, dev_accuracy = process_classifier_epoch(net, config, optimizer, data_dev, batch_first,
+                                                            data_parallel,
                                                             train=False)
 
         if config.classifier_training.lr_schedule:
diff --git a/acoustic_word_embeddings/train_siamese.py b/acoustic_word_embeddings/train_siamese.py
index 8b729de..14205e4 100644
--- a/acoustic_word_embeddings/train_siamese.py
+++ b/acoustic_word_embeddings/train_siamese.py
@@ -7,11 +7,11 @@
 
 from acoustic_word_embeddings.core.average_precision import average_precision
 from acoustic_word_embeddings.core.loss.embedding_loss import margin_loss
-from acoustic_word_embeddings.core.util.net_util import setup_training_run, create_embedding_loss
 from acoustic_word_embeddings.core.siamese_gru import SiameseGRU
 from acoustic_word_embeddings.core.siamese_lstm import SiameseLSTM
+from acoustic_word_embeddings.core.util.net_util import setup_training_run, create_embedding_loss
 from base.common import get_dataset_paths
-from base.data_io.kaldi_dataset import KaldiDataset
+from base.data_io.dataset import get_dataset_class_for_path
 from conf import current_dataset
 
 
@@ -94,10 +94,12 @@ def __main():
     supplement_rare = getattr(config.general_training, 'supplement_rare_with_noisy', False)
     supplement_seed = getattr(config.general_training, 'supplement_seed', 112)
     train_path, dev_path, _ = get_dataset_paths(current_dataset)
-    data_train = KaldiDataset('scp:' + train_path, logger=logger, noise_multiplier=noise_mult, noise_prob=noise_prob,
+    # noinspection PyPep8Naming
+    DatasetClass = get_dataset_class_for_path(train_path, logger=logger)
+    data_train = DatasetClass(train_path, logger=logger, noise_multiplier=noise_mult, noise_prob=noise_prob,
                               mean_subtraction=mean_sub, variance_normalization=var_norm,
                               supplement_rare_with_noisy=supplement_rare, supplement_seed=supplement_seed)
-    data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_path, training=False, logger=logger,
+    data_dev = DatasetClass(dev_path, parent_dataset_path=train_path, training=False, logger=logger,
                             mean_subtraction=mean_sub, variance_normalization=var_norm)
 
     loss_fn = create_embedding_loss(config, len(data_train.word2id))
diff --git a/auto_rating/collect_manual_rating_data.py b/auto_rating/collect_manual_rating_data.py
index b990a55..a3d8e07 100644
--- a/auto_rating/collect_manual_rating_data.py
+++ b/auto_rating/collect_manual_rating_data.py
@@ -50,7 +50,7 @@ def collect_csv_ratings(dataset_scp, include_missing=True):
     output_name = os.path.splitext(dataset_scp)[0] + '_ratings' + '_full' if include_missing else ''
     data_path = os.path.join(processed_data_dir, dataset_scp)
     # training set to True to avoid providing parent path
-    dataset = KaldiDataset('scp:' + data_path, training=True, logger=None)
+    dataset = KaldiDataset(data_path, training=True, logger=None)
 
     ratings = get_snodgrass_ratings_for_keys(dataset) if not include_missing \
         else get_snodgrass_ratings_for_patients(dataset)
diff --git a/auto_rating/rs_accuracy_analysis_by_example_count.py b/auto_rating/rs_accuracy_analysis_by_example_count.py
index c61df0c..874240a 100644
--- a/auto_rating/rs_accuracy_analysis_by_example_count.py
+++ b/auto_rating/rs_accuracy_analysis_by_example_count.py
@@ -133,8 +133,5 @@ def word2example_count_group(word):
     group_analysis(net_rated_words, word2example_count_group, 6)
 
 
-
-
-
 if __name__ == '__main__':
     __main()
diff --git a/auto_rating/rs_analysis.py b/auto_rating/rs_analysis.py
index d45ecdd..9547be4 100644
--- a/auto_rating/rs_analysis.py
+++ b/auto_rating/rs_analysis.py
@@ -1,9 +1,6 @@
 import os
 from typing import List
 
-import os
-from typing import List
-
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
diff --git a/base/common.py b/base/common.py
index 6879236..3d4f69a 100644
--- a/base/common.py
+++ b/base/common.py
@@ -6,7 +6,7 @@
 from conf import processed_data_dir, res_dir
 
 
-def get_dataset_paths(dataset, fmt='scp'):
+def get_dataset_paths(dataset, fmt='lmdb'):
     """fmt values: scp, lmdb"""
     train_path = os.path.join(processed_data_dir, '{0}_train.{format}'.format(dataset, format=fmt))
     dev_path = os.path.join(processed_data_dir, '{0}_dev.{format}'.format(dataset, format=fmt))
diff --git a/base/data_io/dataset.py b/base/data_io/dataset.py
index a1bbba0..556579b 100644
--- a/base/data_io/dataset.py
+++ b/base/data_io/dataset.py
@@ -3,6 +3,7 @@
 import pickle
 import sys
 from itertools import islice, cycle
+from typing import Type
 
 import numpy as np
 
@@ -15,7 +16,7 @@ class Dataset(metaclass=abc.ABCMeta):
     def __init__(self, data_path, parent_dataset_path=None, training=True, logger=None,
                  variance_normalization=False, noise_multiplier=0, noise_prob=1, mean_subtraction=False,
                  supplement_rare_with_noisy=False, supplement_seed=112):
-        self.data_path = data_path if not data_path.startswith('scp:') else data_path[4:]
+        self.data_path = data_path
         self.word2idxs = {}
         self.idx2word = []
         self.idx2source_dataset = []
@@ -357,3 +358,31 @@ def load_derived_data(self, source_data_path):
     def _raw_data_iterator(self):
         """Must produce (key, features) pairs."""
         return
+
+
+def _print_patients(data_train, data_dev, data_test):
+    from base.common import snodgrass_key2patient
+    for ds in [data_train, data_dev, data_test]:
+        patients = np.unique([snodgrass_key2patient(ds.idx2key[i]) for i in range(ds.data.shape[0]) if
+                              ds.idx2source_dataset[i] == 'snodgrass'])
+        print(patients)
+
+
+def get_dataset_class_for_format(fmt, logger=None) -> Type[Dataset]:
+    if fmt == 'scp':
+        from base.data_io.kaldi_dataset import KaldiDataset
+        util.warn_or_print(logger, 'Selecting KaldiDataset for data handling')
+        return KaldiDataset
+    elif fmt == 'lmdb':
+        from base.data_io.lmdb_dataset import LMDBDataset
+        util.warn_or_print(logger, 'Selecting LMDBDataset for data handling')
+        return LMDBDataset
+    else:
+        msg = 'Unsupported data format: {0}'.format(fmt)
+        util.warn_or_print(logger, msg)
+        raise RuntimeError(msg)
+
+
+def get_dataset_class_for_path(dataset_path, logger) -> Type[Dataset]:
+    # get extension, remove leading dot, and pass as the format name
+    return get_dataset_class_for_format(os.path.splitext(dataset_path)[1][1:], logger)
diff --git a/base/data_io/dataset2lmdb.py b/base/data_io/dataset2lmdb.py
index 6760e2b..4f44dbe 100644
--- a/base/data_io/dataset2lmdb.py
+++ b/base/data_io/dataset2lmdb.py
@@ -1,5 +1,4 @@
 import lmdb
-
 import numpy as np
 
 import base.data_io.proto.tensor_pb2 as tensor_pb2
diff --git a/base/data_io/kaldi_dataset.py b/base/data_io/kaldi_dataset.py
index 316b645..eca9a2c 100644
--- a/base/data_io/kaldi_dataset.py
+++ b/base/data_io/kaldi_dataset.py
@@ -6,7 +6,7 @@
 import numpy as np
 
 from base.common import get_dataset_paths, snodgrass_key2patient, snodgrass_key2date, key2word
-from base.data_io.dataset import Dataset
+from base.data_io.dataset import Dataset, _print_patients
 from base.sound_util import frames2time
 from conf import current_dataset, processed_data_dir
 
@@ -23,23 +23,15 @@ class KaldiDataset(Dataset):
     def _raw_data_iterator(self):
         return kaldi_io.read_mat_scp(self.data_path)
 
 
-def _print_patients(data_train, data_dev, data_test):
-    from base.common import snodgrass_key2patient
-    for ds in [data_train, data_dev, data_test]:
-        patients = np.unique([snodgrass_key2patient(ds.idx2key[i]) for i in range(ds.data.shape[0]) if
-                              ds.idx2source_dataset[i] == 'snodgrass'])
-        print(patients)
-
-
 def __main():
     start = time.time()
 
-    train_path, dev_path, test_path = get_dataset_paths(current_dataset)
-    data_train = KaldiDataset('scp:' + train_path, noise_multiplier=1.0, noise_prob=0.5,
+    train_path, dev_path, test_path = get_dataset_paths(current_dataset, fmt='scp')
+    data_train = KaldiDataset(train_path, noise_multiplier=1.0, noise_prob=0.5,
                               supplement_rare_with_noisy=False, supplement_seed=112)
-    data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_path, training=False)
-    data_test = KaldiDataset('scp:' + test_path, parent_dataset_path=train_path, training=False)
+    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path, training=False)
+    data_test = KaldiDataset(test_path, parent_dataset_path=train_path, training=False)
 
     _print_patients(data_train, data_dev, data_test)
 
@@ -51,7 +43,7 @@ def __main():
 
 def __main_snodgrass_test():
     snodgrass_path = '/home/aleks/data/speech_processed/snodgrass_words_cleaned_v3/snodgrass_data_v3.scp'
-    data_snodgrass = KaldiDataset('scp:' + snodgrass_path)
+    data_snodgrass = KaldiDataset(snodgrass_path)
 
     patients = np.unique([snodgrass_key2patient(x) for x in data_snodgrass.idx2key])
     sessions = np.unique([snodgrass_key2date(x) for x in data_snodgrass.idx2key])
@@ -65,7 +57,7 @@ def __main_snodgrass_test():
 
 def __main_external_test():
     external_path = os.path.join(processed_data_dir, 'external_snodgrass_words.scp')
-    data_external = KaldiDataset('scp:' + external_path)
+    data_external = KaldiDataset(external_path)
 
     total_seconds_of_data = np.sum(frames2time(x.shape[0]) for x in data_external.data)
     print('Hours of data: {0:.3f}'.format(total_seconds_of_data / 60 / 60))
@@ -73,14 +65,14 @@ def __main_external_test():
 
 def __main_independent_test():
     swc_path = '/home/aleks/data/speech_processed/independent_test_v2/SWC_independent_test.scp'
-    data_swc = KaldiDataset('scp:' + swc_path)
+    data_swc = KaldiDataset(swc_path)
 
     print(data_swc.counts)
 
-    train_path, dev_path, test_path = get_dataset_paths('independent_cleaned_v3')
-    data_train = KaldiDataset('scp:' + train_path)
-    data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_path)
-    data_test = KaldiDataset('scp:' + test_path, parent_dataset_path=train_path)
+    train_path, dev_path, test_path = get_dataset_paths('independent_cleaned_v3', fmt='scp')
+    data_train = KaldiDataset(train_path)
+    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path)
+    data_test = KaldiDataset(test_path, parent_dataset_path=train_path)
 
     print(data_dev.counts)
 
@@ -108,16 +100,16 @@ def dump_to_dir(dataset, out_dir, dataset_name):
 
     start = time.time()
 
-    train_path, dev_path, test_path = get_dataset_paths(current_dataset)
-    data_train = KaldiDataset('scp:' + train_path, noise_multiplier=1.0, noise_prob=0.5,
+    train_path, dev_path, test_path = get_dataset_paths(current_dataset, fmt='scp')
+    data_train = KaldiDataset(train_path, noise_multiplier=1.0, noise_prob=0.5,
                               supplement_rare_with_noisy=False, supplement_seed=112)
 
     dump_to_dir(data_train, current_dataset, 'train')
 
-    data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_path, training=False)
+    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path, training=False)
     dump_to_dir(data_dev, current_dataset, 'dev')
 
-    data_test = KaldiDataset('scp:' + test_path, parent_dataset_path=train_path, training=False)
+    data_test = KaldiDataset(test_path, parent_dataset_path=train_path, training=False)
     dump_to_dir(data_test, current_dataset, 'test')
 
     print('dump: {0}'.format(time.time() - start))
@@ -130,15 +122,15 @@ def __dump_lmdb():
     train_path, dev_path, test_path = get_dataset_paths(current_dataset, fmt='scp')
     train_path_lmdb, dev_path_lmdb, test_path_lmdb = get_dataset_paths(current_dataset, fmt='lmdb')
 
-    data_train = KaldiDataset('scp:' + train_path, noise_multiplier=1.0, noise_prob=0.5,
+    data_train = KaldiDataset(train_path, noise_multiplier=1.0, noise_prob=0.5,
                               supplement_rare_with_noisy=False, supplement_seed=112)
     dataset2lmdb(data_train, train_path_lmdb)
 
-    data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_path, training=False)
+    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path, training=False)
     dataset2lmdb(data_dev, dev_path_lmdb)
 
-    data_test = KaldiDataset('scp:' + test_path, parent_dataset_path=train_path, training=False)
+    data_test = KaldiDataset(test_path, parent_dataset_path=train_path, training=False)
     dataset2lmdb(data_test, test_path_lmdb)
 
     print('dump to LMDB: {0}'.format(time.time() - start))
diff --git a/base/data_io/lmdb_dataset.py b/base/data_io/lmdb_dataset.py
index 7963d44..deaab02 100644
--- a/base/data_io/lmdb_dataset.py
+++ b/base/data_io/lmdb_dataset.py
@@ -3,8 +3,7 @@
 import lmdb
 
 from base.common import get_dataset_paths
-from base.data_io.dataset import Dataset
-from base.data_io.kaldi_dataset import _print_patients
+from base.data_io.dataset import Dataset, _print_patients
 from base.data_io.proto import tensor_pb2, utils
 from conf import current_dataset
 
diff --git a/dataset_prep/clean_dataset/collect_independent_words.py b/dataset_prep/clean_dataset/collect_independent_words.py
index e200323..e1ff562 100644
--- a/dataset_prep/clean_dataset/collect_independent_words.py
+++ b/dataset_prep/clean_dataset/collect_independent_words.py
@@ -72,7 +72,7 @@ def get_dataset_word_counts(scp_path):
     out_path = os.path.splitext(os.path.basename(scp_path))[0] + '_word_counts.pckl'
 
     if not os.path.exists(out_path):
-        dataset = KaldiDataset('scp:' + scp_path)
+        dataset = KaldiDataset(scp_path)
         with open(out_path, 'wb') as f:
             pickle.dump(dataset.counts, f)
         return dataset.counts
@@ -91,7 +91,7 @@ def select_independent_words():
     counts_per_emu_db = get_emu_word_counts()
     counts_emu_total = collapse_nested_dict(counts_per_emu_db)
     counts_swc = get_swc_word_counts()
-    train_path, dev_path, _ = get_dataset_paths('all_snodgrass_cleaned_v5')
+    train_path, dev_path, _ = get_dataset_paths('all_snodgrass_cleaned_v5', fmt='scp')
     counts_train = get_dataset_word_counts(train_path)
     counts_dev = get_dataset_word_counts(dev_path)
 
diff --git a/dataset_prep/clean_dataset/prepare_independent_words_dataset.py b/dataset_prep/clean_dataset/prepare_independent_words_dataset.py
index aab5b60..b538b6f 100644
--- a/dataset_prep/clean_dataset/prepare_independent_words_dataset.py
+++ b/dataset_prep/clean_dataset/prepare_independent_words_dataset.py
@@ -100,10 +100,10 @@ def compose_test_from_non_validation_words(swc_path, dev_path, test_path):
 if __name__ == '__main__':
     # train_scp, dev_scp, test_scp = split_independent_words('independent_cleaned_v3', 'independent_test_v2',
     #                                                        'all_snodgrass_cleaned_v4')
-    # train_data = KaldiDataset('scp:' + train_scp)
+    # train_data = KaldiDataset(train_scp)
     # train_data.dump_derived_data()
 
-    train_path, dev_path, test_path = get_dataset_paths('independent_cleaned_v3')
+    train_path, dev_path, test_path = get_dataset_paths('independent_cleaned_v3', fmt='scp')
     swc_path = '/home/aleks/data/speech_processed/independent_test_v2/SWC_independent_test.scp'
 
     compose_test_from_non_validation_words(swc_path, dev_path, test_path)
diff --git a/dataset_prep/core/cleaning.py b/dataset_prep/core/cleaning.py
index b35990b..dff07f3 100644
--- a/dataset_prep/core/cleaning.py
+++ b/dataset_prep/core/cleaning.py
@@ -1,7 +1,7 @@
 from base import util
 
 
-def reject_by_duration_sec(duration_sec, dataset, key, logger=None, verbose=True): 
+def reject_by_duration_sec(duration_sec, dataset, key, logger=None, verbose=True):
     if duration_sec < 0.200:
         if verbose:
             util.warn_or_print(logger,
diff --git a/dataset_prep/split_train_dev_test.py b/dataset_prep/split_train_dev_test.py
index cff14af..49be3ac 100644
--- a/dataset_prep/split_train_dev_test.py
+++ b/dataset_prep/split_train_dev_test.py
@@ -40,7 +40,7 @@ def split_snodgrass_dataset(source_sub_dir, snodgrass_file, same_split_as=None):
         patients_test = knapsack(patients_left, len(lines) / 4)[1]
         patients_dev = remove_all(patients_left, patients_test)
     else:
-        train_path, dev_path, test_path = get_dataset_paths(same_split_as)
+        train_path, dev_path, test_path = get_dataset_paths(same_split_as, fmt='scp')
         patients_train = scp2snodgrass_patients(train_path)
         patients_test = scp2snodgrass_patients(test_path)
         patients_dev = scp2snodgrass_patients(dev_path)
@@ -119,7 +119,7 @@ def __main_v5():
                                  same_split_as='all_snodgrass_cleaned_v3')
 
     # Dump the training dataset mean and word2id for future use
-    train_data = KaldiDataset('scp:' + train_scp)
+    train_data = KaldiDataset(train_scp)
     train_data.dump_derived_data()
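Reviewer note, not part of the patch: a minimal sketch of how the new switch is meant to be used together, assuming only the functions introduced above; the dataset name is borrowed from the diff and stands in for any real one.

    from base.common import get_dataset_paths
    from base.data_io.dataset import get_dataset_class_for_path

    # get_dataset_paths now defaults to fmt='lmdb'; pass fmt='scp' explicitly
    # wherever the Kaldi .scp copy of the data is still required.
    train_path, dev_path, _ = get_dataset_paths('all_snodgrass_cleaned_v5')

    # The dataset class is chosen from the file extension:
    # '.scp' -> KaldiDataset, '.lmdb' -> LMDBDataset, anything else -> RuntimeError.
    DatasetClass = get_dataset_class_for_path(train_path, logger=None)

    # Paths are passed as-is: Dataset.__init__ no longer strips a 'scp:' prefix,
    # so callers must not prepend one.
    data_train = DatasetClass(train_path, logger=None)
    data_dev = DatasetClass(dev_path, parent_dataset_path=train_path, training=False, logger=None)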