Skip to content

Commit

Permalink
Added format-based dataset class switch, removed scp: prefix from paths
Browse files (browse the repository at this point in the history)
  • Loading branch information
AleksZhelo committed Aug 24, 2020
1 parent d98b964 commit d8d977e
Show file tree
Hide file tree
Showing 26 changed files with 120 additions and 91 deletions.
12 changes: 7 additions & 5 deletions acoustic_word_embeddings/calculate_accuracy.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

from acoustic_word_embeddings.core.util.common import load_embeddings
from acoustic_word_embeddings.core.loss.embedding_loss import loss_name2class
from acoustic_word_embeddings.core.util.common import load_embeddings
from acoustic_word_embeddings.core.util.net_util import read_embedding_loss, load_net
from acoustic_word_embeddings.gen_embeddings import get_or_generate_embeddings
from acoustic_word_embeddings.train_classifier import process_classifier_epoch
from base.common import get_dataset_paths
from base.data_io.kaldi_dataset import KaldiDataset
from base.data_io.dataset import get_dataset_class_for_path
from conf import current_dataset


Expand Down Expand Up @@ -91,15 +91,17 @@ def do_calculate_accuracy(run_dir, epoch, is_classifier, dataset=None, partition
if dataset is None:
dataset = current_dataset
train_path, dev_path, test_path = get_dataset_paths(dataset)
# noinspection PyPep8Naming
DatasetClass = get_dataset_class_for_path(train_path, logger=None)

if partition == 'train':
dataset = KaldiDataset('scp:' + train_path, parent_dataset_path=train_scp, training=False, logger=None,
dataset = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
if partition == 'dev':
dataset = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_scp, training=False, logger=None,
dataset = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
if partition == 'test':
dataset = KaldiDataset('scp:' + test_path, parent_dataset_path=train_scp, training=False, logger=None,
dataset = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)

# TODO: no automatic detection for batch_first and data_parallel
Expand Down
20 changes: 12 additions & 8 deletions acoustic_word_embeddings/calculate_ap.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@

import numpy as np

from acoustic_word_embeddings.core.util.args_util import parse_load_epoch_args
from acoustic_word_embeddings.core.average_precision import average_precision
from acoustic_word_embeddings.core.util.args_util import parse_load_epoch_args
from acoustic_word_embeddings.core.util.net_util import load_net
from acoustic_word_embeddings.gen_embeddings import get_siamese_embeddings, get_classifier_embeddings
from base.common import get_dataset_paths
from base.data_io.kaldi_dataset import KaldiDataset
from base.data_io.dataset import get_dataset_class_for_path
from conf import current_dataset


Expand Down Expand Up @@ -35,25 +35,27 @@ def do_calculate_ap(run_dir, epoch, dataset=None, partition='dev'):
dataset = current_dataset

train_path, dev_path, test_path = get_dataset_paths(dataset)
# noinspection PyPep8Naming
DatasetClass = get_dataset_class_for_path(train_path, logger=None)

if len(checkpoints) == 0:
print('No checkpoints found in {0} for run {1}'.format(checkpoint_dir, run_dir))
print('Exiting')
sys.exit(-1)

if partition == 'train':
data_train = KaldiDataset('scp:' + train_path, parent_dataset_path=train_scp, training=False, logger=None,
data_train = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
return get_epoch_ap(net, config, checkpoints, loss, data_train, epoch, get_embeddings,
subsample_size=3000)

if partition == 'dev':
data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_scp, training=False, logger=None,
data_dev = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
return get_epoch_ap(net, config, checkpoints, loss, data_dev, epoch, get_embeddings)

if partition == 'test':
data_test = KaldiDataset('scp:' + test_path, parent_dataset_path=train_scp, training=False, logger=None,
data_test = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
return get_epoch_ap(net, config, checkpoints, loss, data_test, epoch, get_embeddings)

Expand Down Expand Up @@ -85,6 +87,8 @@ def __main(run_dir, dataset=None, for_epochs=None, gen_train=False, gen_dev=True
dataset = current_dataset

train_path, dev_path, test_path = get_dataset_paths(dataset)
# noinspection PyPep8Naming
DatasetClass = get_dataset_class_for_path(train_path, logger=None)

if len(checkpoints) == 0:
print('No checkpoints found in {0} for run {1}'.format(checkpoint_dir, run_dir))
Expand All @@ -95,18 +99,18 @@ def __main(run_dir, dataset=None, for_epochs=None, gen_train=False, gen_dev=True
for_epochs = sorted(list(checkpoints.keys()))

if gen_train:
data_train = KaldiDataset('scp:' + train_path, parent_dataset_path=train_scp, training=False, logger=None,
data_train = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
_print_ap_per_epoch(net, config, checkpoints, loss, data_train, 'train', for_epochs, get_embeddings,
subsample_size=3000)

if gen_dev:
data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_scp, training=False, logger=None,
data_dev = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
_print_ap_per_epoch(net, config, checkpoints, loss, data_dev, 'dev', for_epochs, get_embeddings)

if gen_test:
data_test = KaldiDataset('scp:' + test_path, parent_dataset_path=train_scp, training=False, logger=None,
data_test = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
_print_ap_per_epoch(net, config, checkpoints, loss, data_test, 'test', for_epochs, get_embeddings)

Expand Down
2 changes: 1 addition & 1 deletion acoustic_word_embeddings/core/average_precision.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np
from scipy.special import comb
from scipy.spatial.distance import pdist
from scipy.special import comb


def test_chance_level(labels):
Expand Down
2 changes: 1 addition & 1 deletion acoustic_word_embeddings/core/gru_classifier.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import torch
import torch.nn as nn

from base.settings import Settings
from acoustic_word_embeddings.nets.common import hidden2fc_input
from acoustic_word_embeddings.nets.gru_fc_base import GRU_FC_base
from base.settings import Settings


class GRUClassifier(GRU_FC_base):
Expand Down
2 changes: 1 addition & 1 deletion acoustic_word_embeddings/core/lstm_classifier.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import torch
import torch.nn as nn

from base.settings import Settings
from acoustic_word_embeddings.nets.common import hidden2fc_input
from acoustic_word_embeddings.nets.lstm_fc_base import LSTM_FC_base
from base.settings import Settings


class LSTMClassifier(LSTM_FC_base):
Expand Down
4 changes: 2 additions & 2 deletions acoustic_word_embeddings/core/siamese_gru.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
import torch

from acoustic_word_embeddings.core.loss.embedding_loss import triplet_loss_offline
from base import util
from base.settings import Settings
from acoustic_word_embeddings.nets.common import hidden2fc_input
from acoustic_word_embeddings.nets.gru_fc_base import GRU_FC_base
from base import util
from base.settings import Settings


class SiameseGRU(GRU_FC_base):
Expand Down
4 changes: 2 additions & 2 deletions acoustic_word_embeddings/core/siamese_lstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
import torch

from acoustic_word_embeddings.core.loss.embedding_loss import triplet_loss_offline
from base import util
from base.settings import Settings
from acoustic_word_embeddings.nets.common import hidden2fc_input
from acoustic_word_embeddings.nets.lstm_fc_base import LSTM_FC_base
from base import util
from base.settings import Settings


class SiameseLSTM(LSTM_FC_base):
Expand Down
6 changes: 3 additions & 3 deletions acoustic_word_embeddings/core/util/net_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@

import numpy as np

from acoustic_word_embeddings.core.util.args_util import parse_training_args
from acoustic_word_embeddings.core.loss.embedding_loss import DistanceBasedLoss
from acoustic_word_embeddings.core.gru_classifier import GRUClassifier
from acoustic_word_embeddings.core.loss.embedding_loss import DistanceBasedLoss
from acoustic_word_embeddings.core.lstm_classifier import LSTMClassifier
from acoustic_word_embeddings.core.siamese_gru import SiameseGRU
from acoustic_word_embeddings.core.siamese_lstm import SiameseLSTM
from acoustic_word_embeddings.core.util.args_util import parse_training_args
from acoustic_word_embeddings.nets.common import torch_load_unwrapped
from base import util
from base.settings import Settings
from acoustic_word_embeddings.nets.common import torch_load_unwrapped
from base.util import create_logger
from conf import awe_runs_dir

Expand Down
16 changes: 9 additions & 7 deletions acoustic_word_embeddings/gen_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
from acoustic_word_embeddings.core.util.args_util import parse_gen_args
from acoustic_word_embeddings.core.util.common import embeddings_dir2dict
from acoustic_word_embeddings.core.util.net_util import load_net
from acoustic_word_embeddings.nets.common import torch_load_unwrapped
from base import util
from base.common import get_dataset_paths
from base.data_io.kaldi_dataset import KaldiDataset
from acoustic_word_embeddings.nets.common import torch_load_unwrapped
from base.data_io.dataset import get_dataset_class_for_path
from conf import current_dataset, new_path, processed_data_dir


Expand Down Expand Up @@ -98,22 +98,24 @@ def generate_embeddings(run_dir, dataset=None, gen_train=False, gen_dev=False, g
if dataset is None:
dataset = current_dataset
train_path, dev_path, test_path = get_dataset_paths(dataset)
# noinspection PyPep8Naming
DatasetClass = get_dataset_class_for_path(train_path, logger=None)

if gen_train:
data_train = KaldiDataset('scp:' + train_path, parent_dataset_path=train_scp, training=False, logger=None,
data_train = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
if gen_dev:
data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_scp, training=False, logger=None,
data_dev = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
if gen_test:
data_test = KaldiDataset('scp:' + test_path, parent_dataset_path=train_scp, training=False, logger=None,
data_test = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
if gen_new:
data_new = KaldiDataset('scp:' + new_path, parent_dataset_path=train_scp, training=False, logger=None,
data_new = DatasetClass(new_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)
if gen_background:
background_path = os.path.join(processed_data_dir, 'background_train_v4', 'background_data.scp')
data_background = KaldiDataset('scp:' + background_path, parent_dataset_path=train_scp, training=False, logger=None,
data_background = DatasetClass(background_path, parent_dataset_path=train_scp, training=False, logger=None,
mean_subtraction=mean_sub, variance_normalization=var_norm)

train_embeddings_dir = os.path.join(run_dir, 'train_embeddings')
Expand Down
4 changes: 2 additions & 2 deletions acoustic_word_embeddings/nets/gru_fc_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
import torch
import torch.nn as nn

from base.settings import Settings
from acoustic_word_embeddings.nets.common import hidden2fc_input
from acoustic_word_embeddings.nets.modules_experimental.custom_gru import CustomGRU, BiRNNMode
from acoustic_word_embeddings.nets.model import Model
from acoustic_word_embeddings.nets.modules_experimental.custom_gru import CustomGRU, BiRNNMode
from base.settings import Settings


# noinspection PyPep8Naming
Expand Down
2 changes: 1 addition & 1 deletion acoustic_word_embeddings/nets/lstm_fc_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import torch
import torch.nn as nn

from base.settings import Settings
from acoustic_word_embeddings.nets.common import hidden2fc_input
from acoustic_word_embeddings.nets.model import Model
from base.settings import Settings


# noinspection PyPep8Naming
Expand Down
2 changes: 1 addition & 1 deletion acoustic_word_embeddings/nets/model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import torch

from base import util
from acoustic_word_embeddings.nets.common import torch_load_unwrapped
from base import util


class Model(torch.nn.Module):
Expand Down
20 changes: 13 additions & 7 deletions acoustic_word_embeddings/train_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
import numpy as np
import torch

from acoustic_word_embeddings.core.util.net_util import setup_training_run
from acoustic_word_embeddings.core.gru_classifier import GRUClassifier
from acoustic_word_embeddings.core.lstm_classifier import LSTMClassifier
from acoustic_word_embeddings.core.util.net_util import setup_training_run
from base.common import get_dataset_paths
from base.data_io.kaldi_dataset import KaldiDataset
from base.data_io.dataset import get_dataset_class_for_path
from conf import current_dataset


Expand Down Expand Up @@ -67,10 +67,13 @@ def __main():
supplement_rare = getattr(config.general_training, 'supplement_rare_with_noisy', False)
supplement_seed = getattr(config.general_training, 'supplement_seed', 112)
train_path, dev_path, _ = get_dataset_paths(current_dataset)
data_train = KaldiDataset('scp:' + train_path, logger=logger, noise_multiplier=noise_mult, noise_prob=noise_prob,
# noinspection PyPep8Naming
DatasetClass = get_dataset_class_for_path(train_path, logger=logger)

data_train = DatasetClass(train_path, logger=logger, noise_multiplier=noise_mult, noise_prob=noise_prob,
mean_subtraction=mean_sub, variance_normalization=var_norm,
supplement_rare_with_noisy=supplement_rare, supplement_seed=supplement_seed)
data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_path, training=False, logger=logger,
data_dev = DatasetClass(dev_path, parent_dataset_path=train_path, training=False, logger=logger,
mean_subtraction=mean_sub, variance_normalization=var_norm)

data_parallel = args.gpu_count > 1
Expand All @@ -94,17 +97,20 @@ def __main():
verbose=True)

# log initial performance level
dev_losses, dev_accuracy = process_classifier_epoch(net, config, optimizer, data_dev, batch_first, data_parallel, train=False)
dev_losses, dev_accuracy = process_classifier_epoch(net, config, optimizer, data_dev, batch_first, data_parallel,
train=False)
logger.info('Initial avg dev loss = {0:.4f}, dev accuracy = {1:.4f}'.format(np.mean(dev_losses), dev_accuracy))

for epoch in range(config.classifier_training.train_epochs):
logger.info('Starting epoch {0}, learning_rate = {1}'
.format(epoch, [group['lr'] for group in optimizer.param_groups][0]))

start = time.time()
train_losses, train_accuracy = process_classifier_epoch(net, config, optimizer, data_train, batch_first, data_parallel,
train_losses, train_accuracy = process_classifier_epoch(net, config, optimizer, data_train, batch_first,
data_parallel,
train=True)
dev_losses, dev_accuracy = process_classifier_epoch(net, config, optimizer, data_dev, batch_first, data_parallel,
dev_losses, dev_accuracy = process_classifier_epoch(net, config, optimizer, data_dev, batch_first,
data_parallel,
train=False)

if config.classifier_training.lr_schedule:
Expand Down
10 changes: 6 additions & 4 deletions acoustic_word_embeddings/train_siamese.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@

from acoustic_word_embeddings.core.average_precision import average_precision
from acoustic_word_embeddings.core.loss.embedding_loss import margin_loss
from acoustic_word_embeddings.core.util.net_util import setup_training_run, create_embedding_loss
from acoustic_word_embeddings.core.siamese_gru import SiameseGRU
from acoustic_word_embeddings.core.siamese_lstm import SiameseLSTM
from acoustic_word_embeddings.core.util.net_util import setup_training_run, create_embedding_loss
from base.common import get_dataset_paths
from base.data_io.kaldi_dataset import KaldiDataset
from base.data_io.dataset import get_dataset_class_for_path
from conf import current_dataset


Expand Down Expand Up @@ -94,10 +94,12 @@ def __main():
supplement_rare = getattr(config.general_training, 'supplement_rare_with_noisy', False)
supplement_seed = getattr(config.general_training, 'supplement_seed', 112)
train_path, dev_path, _ = get_dataset_paths(current_dataset)
data_train = KaldiDataset('scp:' + train_path, logger=logger, noise_multiplier=noise_mult, noise_prob=noise_prob,
# noinspection PyPep8Naming
DatasetClass = get_dataset_class_for_path(train_path, logger=logger)
data_train = DatasetClass(train_path, logger=logger, noise_multiplier=noise_mult, noise_prob=noise_prob,
mean_subtraction=mean_sub, variance_normalization=var_norm,
supplement_rare_with_noisy=supplement_rare, supplement_seed=supplement_seed)
data_dev = KaldiDataset('scp:' + dev_path, parent_dataset_path=train_path, training=False, logger=logger,
data_dev = DatasetClass(dev_path, parent_dataset_path=train_path, training=False, logger=logger,
mean_subtraction=mean_sub, variance_normalization=var_norm)

loss_fn = create_embedding_loss(config, len(data_train.word2id))
Expand Down
2 changes: 1 addition & 1 deletion auto_rating/collect_manual_rating_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def collect_csv_ratings(dataset_scp, include_missing=True):
output_name = os.path.splitext(dataset_scp)[0] + '_ratings' + '_full' if include_missing else ''
data_path = os.path.join(processed_data_dir, dataset_scp)
# `training` is set to True to avoid having to provide a parent dataset path
dataset = KaldiDataset('scp:' + data_path, training=True, logger=None)
dataset = KaldiDataset(data_path, training=True, logger=None)

ratings = get_snodgrass_ratings_for_keys(dataset) if not include_missing \
else get_snodgrass_ratings_for_patients(dataset)
Expand Down
3 changes: 0 additions & 3 deletions auto_rating/rs_accuracy_analysis_by_example_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,5 @@ def word2example_count_group(word):
group_analysis(net_rated_words, word2example_count_group, 6)





if __name__ == '__main__':
__main()
3 changes: 0 additions & 3 deletions auto_rating/rs_analysis.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import os
from typing import List

import os
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
Expand Down
2 changes: 1 addition & 1 deletion base/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from conf import processed_data_dir, res_dir


def get_dataset_paths(dataset, fmt='scp'):
def get_dataset_paths(dataset, fmt='lmdb'):
"""fmt values: scp, lmdb"""
train_path = os.path.join(processed_data_dir, '{0}_train.{format}'.format(dataset, format=fmt))
dev_path = os.path.join(processed_data_dir, '{0}_dev.{format}'.format(dataset, format=fmt))
Expand Down
Loading

0 comments on commit d8d977e

Please sign in to comment.