# data_preprocessing.py
import os
from tqdm import tqdm
import torch
from torch.utils.data import Dataset
import pandas as pd
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from create_csv import create_csv
import random
import re
class COVID_dataset(Dataset):
    '''
    Custom COVID-19 audio dataset: loads cough/breath recordings listed in a
    cross-validation CSV, converts them to log spectrograms, and optionally
    applies masking, pitch shifting and cough/breath pairing.
    '''
def __init__(self, dset, folds, eval_type='random',
transform=None, task='all',
window_size=1,
sample_rate=48000,
hop_length=512,
n_fft=2048,
masking=False,
pitch_shift=False,
cross_val=False,
breathcough=False):
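        '''
        dset: split name; 'train', or any split without eval_type ==
            'maj_vote', returns a single window per file, while 'maj_vote'
            evaluation returns every chunk of a file for majority voting.
        folds: list-like of fold ids to keep from the cross-validation CSV.
        window_size: window length in seconds (stored in samples below).
        breathcough: if True, pair each sample with the same user's matching
            cough/breath recording and concatenate the two.
        '''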
df = pd.read_csv(os.path.join('paths/cross_val', task+'.csv'))
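        # keep only the rows belonging to the requested folds, in random order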
rows = df[df.fold.isin(folds)].index.tolist()
np.random.shuffle(rows)
self.data_index = df.iloc[rows]
self.dset = dset
self.root_dir = '/vol/bitbucket/hgc19/COVID_Audio_Diagnosis/KDD_paper_data'
self.window_size = window_size * sample_rate
self.sample_rate = sample_rate
self.hop_length = hop_length
self.n_fft = n_fft
self.transform = transform
self.eval_type = eval_type
self.masking = masking
self.pitch_shift = pitch_shift
self.breathcough = breathcough
def __len__(self):
return len(self.data_index.index)
def custom_transform(self, signal):
"""
create log spectrograph of signal
"""
stft = librosa.stft(signal, n_fft=self.n_fft, hop_length=self.hop_length)
spectrogram = np.abs(stft)
log_spectrogram = librosa.amplitude_to_db(spectrogram)
if self.masking:
log_spectrogram = self.spec_augment(log_spectrogram)
if self.transform:
log_spectrogram = self.transform(log_spectrogram)
return log_spectrogram
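    # Worked example of the shapes (assuming the defaults above): a 1 s window
    # at 48 kHz is 48 000 samples; with n_fft=2048 and hop_length=512 the STFT
    # has 1 + 2048 // 2 = 1025 frequency bins and 1 + 48000 // 512 = 94 frames,
    # so custom_transform returns a (1025, 94) array before self.transform.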
    def pad(self, signal):
        # zero-pad the signal up to one full window
        sample_signal = np.zeros((self.window_size,))
        sample_signal[:signal.shape[0]] = signal
        return sample_signal
def __getitem__(self, index):
# get path of chosen index
audio_path = self.data_index['path'].iloc[index]
label = self.data_index['label'].iloc[index]
chunks = self.load_process(audio_path)
        # get the path of a cough or breath sample provided by the same user:
        # if a cough sample was drawn we need its breath sample, and vice versa
        if self.breathcough:
            # the flag ensures that cough and breath are always passed to the
            # model in the same order (breath first, cough second)
            audio_path_2, label2, flag = self.return_pair(audio_path)
            if label2 is not None:
                assert label == label2, 'paired samples have mismatching labels; investigate!'
            if audio_path_2 is None:  # no pair: the user did not provide both cough and breath
                print('*' * 30)
                print('No Pair!')
                label2 = label
if self.dset == 'train' or self.eval_type != 'maj_vote':
chunks_2 = torch.zeros(chunks.size())
else:
                chunks_2 = [torch.zeros(chunks[0].size()) for _ in range(len(chunks))]
else:
chunks_2 = self.load_process(audio_path_2)
if self.dset == 'train' or self.eval_type != 'maj_vote':
if flag == 'cough':
return torch.cat([chunks, chunks_2], dim=0), label
elif flag == 'breath':
return torch.cat([chunks_2, chunks], dim=0), label
else:
if flag == 'cough':
return [torch.cat([i, j], dim=0) for i, j in zip(chunks, chunks_2)], label
elif flag == 'breath':
return [torch.cat([j, i], dim=0) for i, j in zip(chunks, chunks_2)], label
return chunks, label
def load_process(self, audio_path):
# load the data
signal, sample_rate = librosa.load(audio_path, sr=self.sample_rate)
        # perform pitch shift by a random number of semitones
        if self.pitch_shift:
            step = np.random.uniform(-6, 6)
            signal = librosa.effects.pitch_shift(signal, sr=sample_rate, n_steps=step)
        # for train, sample a random window from the audio file
        if self.dset == 'train' or self.eval_type != 'maj_vote':
            # apply padding if necessary, else sample a random window
if signal.shape[0] <= self.window_size:
sample_signal = self.pad(signal)
else:
if self.eval_type == 'random':
rand_indx = np.random.randint(0, signal.shape[0] - self.window_size)
else:
rand_indx = 0
sample_signal = signal[rand_indx:rand_indx + self.window_size]
# perform transformations
sample_signal = self.custom_transform(sample_signal)
return sample_signal
        # for eval/test, split the audio file into window-sized chunks,
        # process every chunk, and return them all
        else:
chunks = np.array_split(signal, int(np.ceil(signal.shape[0] / self.window_size)))
            def process_chunk(chunk):
                # array_split yields chunks of at most window_size samples;
                # pad shorter chunks so sample_signal is always defined
                sample_signal = self.pad(chunk) if chunk.shape[0] < self.window_size else chunk
                return self.custom_transform(sample_signal)
            chunks = [process_chunk(chunk) for chunk in chunks]
return chunks
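    # Worked example of the eval-time chunking (assuming the default 1 s
    # window at 48 kHz): a 100 000-sample file yields
    # ceil(100000 / 48000) = 3 chunks of roughly 33 334 samples from
    # np.array_split, each zero-padded back to 48 000 samples before the
    # spectrogram transform, so the caller receives a list of 3 arrays.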
    def spec_augment(self,
                     spec: np.ndarray,
                     num_mask=2,
                     freq_masking_max_percentage=0.15,
                     time_masking_max_percentage=0.3):
        # SpecAugment-style masking. librosa spectrograms are laid out as
        # (freq_bins, time_frames), so frequency masks zero out rows and
        # time masks zero out columns.
        spec = spec.copy()
        for _ in range(num_mask):
            all_freqs_num, all_frames_num = spec.shape
            # mask a random band of frequency bins
            freq_percentage = random.uniform(0.0, freq_masking_max_percentage)
            num_freqs_to_mask = int(freq_percentage * all_freqs_num)
            f0 = int(np.random.uniform(low=0.0,
                                       high=all_freqs_num - num_freqs_to_mask))
            spec[f0:f0 + num_freqs_to_mask, :] = 0
            # mask a random run of time frames
            time_percentage = random.uniform(0.0, time_masking_max_percentage)
            num_frames_to_mask = int(time_percentage * all_frames_num)
            t0 = int(np.random.uniform(low=0.0,
                                       high=all_frames_num - num_frames_to_mask))
            spec[:, t0:t0 + num_frames_to_mask] = 0
        return spec
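    # Worked example of the mask sizes for a (1025, 94) spectrogram: each mask
    # zeroes at most int(0.15 * 1025) = 153 frequency bins and at most
    # int(0.3 * 94) = 28 time frames, and num_mask=2 masks of each kind are
    # applied per call.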
    def return_pair(self, audio_path):
        '''
        Given the path to an audio file of a person coughing or breathing,
        return the path of the same person's matching breath or cough sample
        (cough for a breath input and vice versa).
        inputs: audio_path --> str
        output: audio_path_2 --> str, label2 --> str, flag --> str
        '''
if 'web' in audio_path:
if 'breathe' in audio_path:
audio_path_2 = audio_path.replace('breathe', 'cough')
flag = 'cough'
elif 'cough' in audio_path:
num_cough = re.findall('cough', audio_path)
if len(num_cough) == 1:
audio_path_2 = audio_path.replace('cough', 'breathe')
else:
audio_path_2 = self.nth_repl(audio_path, 'cough', 'breathe', 2)
flag = 'breath'
            else:
                raise Exception('This should not be a possibility - the path should contain breathe or cough')
assert self.data_index['path'].isin([audio_path_2]).any(), f'{audio_path_2} not in data'
# getting the label to check that it is the same
label2 = self.data_index.loc[self.data_index['path'] == audio_path_2]['label'].iloc[0]
return audio_path_2, label2, flag
        elif 'android' in audio_path:
            # this is more complicated: breath and cough samples have different
            # unique codes, so we cannot simply swap breathe for cough as in web
            if 'breaths' in audio_path:
                # folder -> breaths
                # file -> breath
audio_path_2 = audio_path.replace('breath', 'cough', 1)
audio_path_2 = audio_path_2.replace('breaths', 'cough', 1)
flag = 'cough'
elif 'cough' in audio_path:
num_cough = re.findall('cough', audio_path)
flag = 'breath'
if len(num_cough) == 2:
audio_path_2 = audio_path.replace('cough', 'breath', 1)
audio_path_2 = audio_path_2.replace('cough', 'breaths', 1)
else:
audio_path_2 = self.nth_repl(audio_path,'cough', 'breath', 2)
audio_path_2 = self.nth_repl(audio_path_2, 'cough', 'breaths', 2)
            else:
                raise Exception(
                    'This should not be a possibility - the path should contain breathe or cough'
                )
            # strip the 13-digit code (the per-recording unique id) and the
            # extension so the pair can be matched by substring below
            audio_path_2 = re.sub("[0-9]{13}", "", audio_path_2)
            audio_path_2 = audio_path_2.replace('.wav', "")
rows_to_swap = self.data_index[
self.data_index['path'].str.contains(
audio_path_2)]
            if len(rows_to_swap["path"].values.tolist()) == 0:  # no pair exists; the caller pads with zeros
                return None, None, flag
audio_path_2 = np.random.choice(rows_to_swap["path"].values.tolist())
assert self.data_index['path'].isin(
[audio_path_2]).any(), f'{audio_path_2} not in data'
label2 = self.data_index.loc[self.data_index['path'] ==
audio_path_2]['label'].iloc[0]
return audio_path_2, label2, flag
        else:
            raise Exception(
                'This should not be a possibility - the path should contain breathe or cough'
            )
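    # Illustrative pairing example (hypothetical path) for the web format:
    # return_pair('.../web/user1/breathe_abc.wav') looks up
    # '.../web/user1/cough_abc.wav' and returns it with flag='cough'.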
    def nth_repl(self, s, sub, repl, n):
        '''
        Replace the n-th occurrence of sub in s with repl; if sub occurs
        fewer than n times, return s unchanged.
        '''
        find = s.find(sub)
        # i counts matches found so far (1 if the first search succeeded)
        i = 1 if find != -1 else 0
        # loop until we reach the n-th match or run out of matches
        while find != -1 and i != n:
            # find + 1 means we start searching from after the last match
            find = s.find(sub, find + 1)
            i += 1
        # only replace if we actually landed on the n-th match
        if i == n and find != -1:
            return s[:find] + repl + s[find + len(sub):]
        return s
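    # Usage example with an illustrative path:
    # nth_repl('data/cough/cough_1.wav', 'cough', 'breathe', 2)
    # returns 'data/cough/breathe_1.wav'; with n larger than the number of
    # occurrences, the string is returned unchanged.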
if __name__ == "__main__":
    # folds must be list-like for df.fold.isin(); [0] is an illustrative fold id
    test_dataset = COVID_dataset('dev', [0])
for i in tqdm(range(len(test_dataset))):
sample, label = test_dataset[i]
print(sample.shape)
break
plt.figure()
librosa.display.specshow(sample,
sr=48000,
hop_length=512)
plt.xlabel("Time")
plt.ylabel("Frequency")
plt.colorbar(format="%+2.0f dB")
plt.title("Spectrogram (dB)")
path_to_save = 'figs/log_spectrogram'+str(i)+'.png'
plt.savefig(path_to_save)
plt.close()
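# Hypothetical usage sketch (not part of the original script): wrapping the
# dataset in a DataLoader for training. The fold ids and batch size are
# illustrative assumptions; the cross-validation CSV must exist for this to run.
# from torch.utils.data import DataLoader
# train_set = COVID_dataset('train', folds=[0, 1, 2, 3], task='all')
# train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
# for batch, labels in train_loader:
#     ...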