-
Notifications
You must be signed in to change notification settings - Fork 0
/
db.py
70 lines (63 loc) · 2.54 KB
/
db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# -*- coding: utf-8 -*-
import os
import pandas as pd
class TIMIT:
    """Lookup helper over the ``train_data.csv`` / ``test_data.csv`` manifests of a TIMIT directory.

    Both manifests are read once at construction time and kept as DataFrames
    indexed by the manifest's ``index`` column. Rows whose index is missing
    are dropped.
    """

    # Manifest columns that are loaded; other columns in the CSV are ignored.
    _COLUMNS = (
        'index', 'speaker_id', 'filename', 'path_from_data_dir', 'is_converted_audio',
        'is_audio', 'is_word_file',
        'is_phonetic_file', 'is_sentence_file',
    )

    # Nullable dtypes are used so rows with empty cells can still be parsed.
    # NOTE(review): 'is_audio' is loaded but has no explicit dtype here, so
    # pandas infers it -- confirm whether it should be 'boolean' as well.
    _DTYPES = dict(
        index='Int16',
        speaker_id=str,
        filename=str,
        path_from_data_dir=str,
        is_converted_audio='boolean',
        is_word_file='boolean',
        is_phonetic_file='boolean',
        is_sentence_file='boolean',
    )

    def __init__(self, timit_path: str):
        """Load both manifests found directly under *timit_path*.

        Args:
            timit_path: Directory containing ``train_data.csv`` and
                ``test_data.csv``. Audio paths returned by :meth:`audio`
                are resolved against its ``data`` subdirectory.
        """
        self.timit_path = timit_path
        # Manifest rows of the training split.
        self.data_train = self._load_manifest('train_data.csv')
        # Manifest rows of the test split.
        self.data_test = self._load_manifest('test_data.csv')

    def _load_manifest(self, csv_name: str) -> pd.DataFrame:
        """Read one manifest CSV and drop the rows that have no index value."""
        frame = pd.read_csv(
            os.path.join(self.timit_path, csv_name),
            index_col='index', skip_blank_lines=True,
            # Fresh copies so pandas never sees the shared class-level objects.
            usecols=list(self._COLUMNS), dtype=dict(self._DTYPES),
        )
        return frame[frame.index.notna()]

    def _select_split(self, split: str) -> pd.DataFrame:
        """Return the manifest for *split*; raise ``ValueError`` for unknown names."""
        if split == 'train':
            return self.data_train
        if split == 'test':
            return self.data_test
        raise ValueError('Split must be "train" or "test"')

    def speakers(self, split: str):
        """Return the distinct ``speaker_id`` values of a split.

        Args:
            split: ``'train'`` or ``'test'``.

        Returns:
            Array of unique speaker ids, in order of first appearance.

        Raises:
            ValueError: If *split* is neither ``'train'`` nor ``'test'``.
        """
        return self._select_split(split).speaker_id.unique()

    def audio(self, split: str, speaker_id: str) -> list:
        """Return the converted-audio file paths of one speaker.

        Args:
            split: ``'train'`` or ``'test'``.
            speaker_id: Value matched exactly against the ``speaker_id`` column.

        Returns:
            List of paths built as ``<timit_path>/data/<path_from_data_dir>``
            for rows flagged ``is_converted_audio``; empty if nothing matches.

        Raises:
            ValueError: If *split* is neither ``'train'`` nor ``'test'``.
        """
        data = self._select_split(split)
        # Build one mask on the frame being indexed, so no cross-frame index
        # alignment is needed. A missing flag (NA) counts as "not converted".
        wanted = data.is_converted_audio & (data.speaker_id == speaker_id)
        wanted = wanted.fillna(False).astype(bool)
        relative_paths = data.loc[wanted, 'path_from_data_dir'].to_list()
        return [os.path.join(self.timit_path, 'data', rel) for rel in relative_paths]
if __name__ == '__main__':
    # Manual smoke run: print the converted-audio paths of one training speaker.
    dataset = TIMIT('/datasets/TIMIT')
    speaker_files = dataset.audio('train', 'MMDM0')
    print(speaker_files)