#!/usr/bin/env python
#
# author = The ACLEW Team
#
"""
This script extract short snippets of sound (approx. 10s long),
and runs them through a SAD or diarization tool to detect chunks of audio with :
1) a lot of speech.
--> python tools/high_volubility.py file_path.wav --sad noisemes_sad
2) a lot of child speech
--> python tools/high_volubility.py file_path.wav --diar yunitate --mode CHI
3) a lot of parent-child conversations
--> python tools/high_volubility.py file_path.wav --diar yunitate --mode PCCONV
4) a lot of adults conversations with a child minimally aware
--> python tools/high_volubility.py file_path.wav --diar yunitate --mode ACCA
"""
import os
import wave
import math
import numpy
import argparse
import subprocess
from operator import itemgetter
def get_audio_length(wav):
""" Return duration of Wave file.
Input:
wav: path to the wav file.
Output:
dur: float, duration, in seconds, of wav file.
"""
audio = wave.open(wav, 'r')
frames = audio.getnframes()
rate = audio.getframerate()
duration = frames / float(rate)
audio.close()
return duration
def select_onsets(duration, step):
""" Return list of onsets on which this script will extract the chunks of
10s
Input:
duration: float, duration of the daylong recording
Ouput:
onsets: list[floats], list the onsets on which this script will extract
the chunks of 10s to be run through the SAD tools
"""
return numpy.arange(0.0, duration, step)
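
# A minimal worked example (hypothetical values): for a 30-minute recording
# (duration=1800.0) sampled every 10 minutes (step=600.0), select_onsets
# returns the onsets 0.0, 600.0 and 1200.0, i.e. one chunk start every step.
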
def extract_chunks(wav, onset_list, chunk_size, temp):
""" Given a list of onset and a length in seconds, extract a snippet of
audio at each onset of this length. The extraction will be done using
SoX, called by subprocess.
Input:
wav: path to the wav file.
onset_list: list[float], list of the onsets in the 'wav' file at which
we'll extract the segments
chunk_size: float, length in seconds of the chunks of audio this script
will extract.
Output:
'temp': the output of this function is the set of small wav file of
'chunk_size' seconds in the temp folder.
The name of the wav files will be:
id_onset_length.wav
where id is the name of the original wav file, onset is the
onset at which the chunk occurs in the original wav file, and
length is the length of the chunk.
"""
# for each onset, call SoX using subprocess to extract a chunk.
for on in onset_list:
# get "id" basename of wav file
basename = os.path.splitext(os.path.basename(wav))[0]
# create name of output
off = on + chunk_size
        str_on = str(on).zfill(6)
        str_off = str(off).zfill(6)
chunk_name = '_'.join([basename, str_on, str_off])
# call subprocess
cmd = ['sox', wav,
os.path.join(temp, '{}.wav'.format(chunk_name)),
'trim', str(on), str(chunk_size)]
subprocess.call(cmd)
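
# For reference, the SoX command assembled above looks like the following
# (paths and file name are illustrative): for wav='/vagrant/rec.wav',
# on=600.0 and chunk_size=10.0,
#   sox /vagrant/rec.wav tmp/rec_0600.0_0610.0.wav trim 600.0 10.0
# i.e. write the 10 seconds starting at t=600.0s to the temp folder.
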
def run_model(temp_rel, temp_abs, sad, diar=None):
    """ Once all the snippets of sound have been extracted and stored in the
    temp folder, run them through a SAD or diarization tool.
    By default, the function is in SAD mode, which means that it runs a SAD
    model (noisemes_sad by default). However, if the diar parameter is
    provided, this function switches to diarization mode.
    Input:
        temp_rel: relative path to the temp folder in which the snippets of
                  sound are stored. Here we need the relative path, not the
                  absolute path, since the SAD tool prepends the "/vagrant/"
                  part of the path.
        temp_abs: absolute path to the temp folder in which the snippets of
                  sound are stored.
        sad: name of the SAD tool used to analyze the snippets of sound.
        diar: name of the diarization tool used to analyze the snippets of
              sound. Note that it will use the sad parameter as a choice of
              SAD provider.
    Output:
        _: in temp, the analyses are written in RTTM format.
    """
    if diar is not None:  # diarization mode
        if diar == 'yunitate':
            cmd = ['tools/{}.sh'.format(diar), temp_rel]
        elif diar == 'diartk':
            cmd = ['tools/{}.sh'.format(diar), temp_rel, sad]
        else:
            raise ValueError('Unknown diarization tool: {}'.format(diar))
    else:  # SAD mode
        cmd = ['tools/{}.sh'.format(sad), temp_rel]
    subprocess.call(cmd)
# After the model has finished running, remove the wav files
for fin in os.listdir(temp_abs):
if fin.endswith('.wav'):
os.remove(os.path.join(temp_abs, fin))
def extract_base_name(filename, diar):
"""
Giving a filename generated by a model and composed of :
model_name + wav_filename + tbeg + tdur
Extracts the base name composed of the three last elements.
Input:
filename: a filename generated by a model
diar: a diarization model (if provided)
Output:
base : wavfilename_tbeg_tdur
"""
if diar == 'yunitate':
base = filename.split('_')[1:]
else:
base = filename.split('_')[2:]
base = os.path.splitext('_'.join(base))[0]
return base
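
# A worked example with hypothetical filenames: with the default SAD tool,
#   extract_base_name('noisemes_sad_rec_0600.0_0610.0.rttm', None)
# returns 'rec_0600.0_0610.0', and with diar='yunitate',
#   extract_base_name('yunitate_rec_0600.0_0610.0.rttm', 'yunitate')
# returns the same base name.
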
def detect_parent_child_conv(previous_activity, curr_activity, last_silence_dur):
"""
Attempts to try if a turn-taking happened between a child and his/her parents
Input:
previous_activity: the last activity amongst ['CHI', 'MAL', 'FEM']
curr_activity: the current activity amongst ['CHI', 'MAL', 'FEM']
last_silence_dur: the duration of the last silence
Output:
A boolean indicating if a turn-taking happened between a child and his/her parents
True : a turn-taking happened
False : nothing happened
"""
if last_silence_dur <= 2.0:
if (previous_activity == 'MAL' and curr_activity == 'CHI') or \
(previous_activity == 'FEM' and curr_activity == 'CHI') or \
(previous_activity == 'CHI' and curr_activity == 'MAL') or \
(previous_activity == 'CHI' and curr_activity == 'FEM'):
return True
return False
def detect_adults_conv(previous_activity, curr_activity, last_silence_dur):
"""
Attempts to try if a turn-taking happened between two adults
Input:
previous_activity: the last activity amongst ['CHI', 'MAL', 'FEM']
curr_activity: the current activity amongst ['CHI', 'MAL', 'FEM']
last_silence_dur: the duration of the last silence
Output:
A boolean indicating if a turn-taking happened between a child and his/her parents
True : a turn-taking happened
False : nothing happened
"""
if last_silence_dur <= 2.0:
if (previous_activity == 'MAL' and curr_activity == 'FEM') or \
(previous_activity == 'FEM' and curr_activity == 'MAL') or \
(previous_activity == 'FEM' and curr_activity == 'FEM') or \
(previous_activity == 'MAL' and curr_activity == 'MAL'):
return True
return False
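
# Illustrative calls to the two turn-taking detectors above:
#   detect_parent_child_conv('FEM', 'CHI', 1.0)  -> True  (adult then child)
#   detect_parent_child_conv('FEM', 'MAL', 1.0)  -> False (no child involved)
#   detect_adults_conv('MAL', 'FEM', 1.5)        -> True  (adult-adult turn)
#   detect_adults_conv('MAL', 'FEM', 3.0)        -> False (pause longer than 2s)
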
def read_analyses(temp_abs, sad, perc, diar=None, mode='CHI', child_aware=False):
""" When the model has finished producing its outputs, read all the
transcriptions and sort the files by the quantity of their speech
content or the quantity of their speech belonging to the noiseme_class.
Input:
temp: path to the temp folder in which the snippets of sound are
stored. Here we need the relative path, not the absolute
path, since the SAD tool will add the "/vagrant/" part of
the path.
sad: name of the sad tool to be used to analyze the snippets of
sound.
perc: the percentage of speech to keep.
diar: the diarization model (if provided).
mode: the type of speech that needs to be kept (has to be amongst ['CHI']).
child_aware : (only used if mode == ACCA) Indicates, if we should filter the snippets
that don't present any child activity.
Output:
sorted_files: list(str), list of the files, sorted by the quantity
of the speech content (as returned by the SAD tool)
or the quantity of the speech content classified as
belonging to the noiseme class (as returned by the
diarization tool).
only 10 %% of all files, that contain the most speech
"""
# get list of model outputs
all_files = os.listdir(temp_abs)
diar_mode = diar is not None
if diar_mode:
model = diar
else:
model = sad
# we consider only the annotations that have been generated by the model
annotations = [file for file in all_files if file.startswith(model[:-1])]
    # read all annotations and store (base, tot_dur, chi_points) tuples in a list
files_n_dur = []
for file in annotations:
# we extract the base name composed of wav_file_name + t_deg + t_dur
base = extract_base_name(file, diar)
with open(os.path.join(temp_abs, file), 'r') as fin:
speech_activity = fin.readlines()
# total duration of the speech of interest
tot_dur = 0.0
# duration of the last silence
silence_dur = 0.0
# type of the last activity
previous_activity = None
onset_prev = 0.0
dur_prev = 0.0
# variable to detect if the child is aware in ACCA mode.
chi_points = 0.0
for line in speech_activity:
anno_fields = line.split(' ')
dur = float(anno_fields[4])
onset = float(anno_fields[3])
curr_activity = anno_fields[7]
if onset_prev+dur_prev == onset:
silence_dur = 0.0
else:
silence_dur = onset-onset_prev-dur_prev
if not diar_mode: # SAD mode
tot_dur += dur
elif diar_mode and mode == 'CHI' and curr_activity == 'CHI': # Child detection mode
tot_dur += 1
elif diar_mode and mode == 'PCCONV': # Parent-child detection mode
if detect_parent_child_conv(previous_activity, curr_activity, silence_dur):
                    # Here we maximize an objective function (the number of
                    # turn-takings) rather than a total duration: adults speak
                    # for long stretches while children speak for only a few
                    # seconds, and we don't want to put too much weight on the
                    # adult speech.
tot_dur += 1
elif diar_mode and mode == 'ACCA': # Adults conversation child aware detection
if detect_adults_conv(previous_activity, curr_activity, silence_dur):
tot_dur += 1
elif curr_activity == 'CHI':
chi_points += 1
previous_activity = curr_activity
onset_prev = onset
dur_prev = dur
files_n_dur.append((base, tot_dur, chi_points))
# remove annotation when finished reading
os.remove(os.path.join(temp_abs, file))
    if child_aware and mode == 'ACCA':
        files_n_dur = [file for file in files_n_dur if file[2] > 3]
    if len(files_n_dur) == 0:
        raise ValueError("No moments for the {} mode have been found.\n"
                         "Try decreasing the step parameter or increasing "
                         "the size of the chunks.".format(mode))
    files_n_dur = sorted(files_n_dur, key=itemgetter(1), reverse=True)
    # keep the top perc fraction of snippets (at least one)
    n_keep = max(1, int(math.ceil(perc * len(files_n_dur))))
    sorted_files = files_n_dur[:n_keep]
return sorted_files
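
# The annotations parsed above are assumed to follow the standard RTTM layout,
# one space-separated record per line, e.g. (illustrative values):
#   SPEAKER rec_0600.0_0610.0 1 0.50 1.20 <NA> <NA> CHI <NA>
# so that fields[3] is the onset, fields[4] the duration and fields[7] the
# activity label compared against 'CHI', 'MAL' and 'FEM'.
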
def new_onsets_two_minutes(sorted_files):
"""
Given a selection of file with lots of speech,
extract new 2minutes long chunks of audio in the original wav,
centered around the short 10s snippets that were analysed.
Input:
sorted_files: list of the snippets that were selected
because they had lot of speech
temp_abs: absolute path to the temp folder that contains the
snippets
wav: path to the daylong recording
Ouput:
_: in the temp folder, new two minutes long chunks of
audio will be stored.
"""
# loop over selected files and retrieve their onsets from their name
new_onset_list = []
for snippet, speech_dur, chi_points in sorted_files:
onset = os.path.splitext(snippet)[0].split('_')[-2]
offset = os.path.splitext(snippet)[0].split('_')[-1]
length = float(offset) - float(onset)
        # the new, longer chunk starts at the midpoint of the snippet
        new_onset = length / 2 + float(onset)
new_onset_list.append(new_onset)
return new_onset_list
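
# A worked example with a hypothetical snippet name: for a selected snippet
# 'rec_0600.0_0610.0', the returned onset is 600.0 + 10.0 / 2 = 605.0, so the
# next pass extracts a 120s chunk spanning 605.0s to 725.0s.
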
def new_onsets_five_minutes(sorted_files):
"""
Given a selection of file with lots of speech,
extract new 5minutes long chunks of audio in the original wav,
by adding 2 minutes before and 1 minute after the 2minute chunks.
Input:
sorted_files: list of the snippets that were selected
because they had lot of speech
temp_abs: absolute path to the temp folder that contains the
snippets
wav: path to the daylong recording
Ouput:
_: in the temp folder, new two minutes long chunks of
audio will be stored.
"""
# loop over selected files and retrieve their onsets from their name
new_onset_list = []
    for snippet, speech_dur, chi_points in sorted_files:
        onset = os.path.splitext(snippet)[0].split('_')[-2]
        # new segment starts 2 minutes before the 2-minute chunk; clamp at 0.0
        # so that SoX is never asked to trim from a negative onset
        new_onset = max(0.0, float(onset) - 120.0)
        new_onset_list.append(new_onset)
return new_onset_list
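
# A worked example with a hypothetical chunk name: a 120s chunk
# 'rec_0605.0_0725.0' yields a new onset of 605.0 - 120.0 = 485.0, so the
# final 300s chunk spans 485.0s to 785.0s: two minutes before the 2-minute
# chunk and one minute after it.
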
def main():
"""
Get duration of wav file
Given duration of wav file, extract list of onsets
Given list of onsets in wav file, extract chunks of wav
Input:
daylong: path to the daylong recording
--step: (optional) step in seconds between each chunk.
By default 600 seconds.
--chunk_size: (optional) size of the chunks to extract.
--temp: (optional) path to a temporary folder to store the
extracted chunks.
--sad: (optional) name of the SAD tool to call to analyse
the chunks. By default noiseme
--diar: (optional) name of the diarization tool to call to analyse
the chunks. No default option
"""
parser = argparse.ArgumentParser()
parser.add_argument('daylong', metavar='AUDIO_FILE',
                        help='''Give RELATIVE path to the daylong recording '''
                        '''in wav format.''')
parser.add_argument('--step', default=600.0, type=float,
help='''(Optional) Step, in seconds, between each chunk of '''
'''audio that will be extracted to be analysed by the SAD '''
'''tool. By default, step=600 seconds (10 minutes)''')
parser.add_argument('--chunk_sizes', nargs=3,
default=[10.0, 120.0, 300.0], type=float,
                        help='''(Optional) Sizes of the chunks to extract and analyze. '''
                        '''By default 10.0 120.0 300.0: \n'''
                        '''10s chunks are extracted and analysed by the '''
                        '''SAD tool, the 10%% of chunks that contain the most speech '''
                        '''are kept, then 120s chunks are extracted around the 10s '''
                        '''chunks and again analysed by the SAD tool, the 10%% that '''
                        '''contain the most speech are kept, and 300.0s chunks '''
                        '''are finally extracted around these kept chunks.''')
parser.add_argument('--percentage', default=10, type=float,
                        help='''(Optional) Percentage of snippets to keep at each stage. '''
                        '''By default, we keep 10%% of the snippets each time.\n'''
                        '''For a 15h-long recording, we have 90x10s snippets; '''
                        '''we keep the 10%% with the most speech content, which '''
                        '''leads to 9x120s snippets; we again keep the 10%% with the '''
                        '''most speech content, so in the end we have one 300s '''
                        '''segment.''')
parser.add_argument('--temp', default='tmp',
help='''(Optional) Path to a temp folder in which the small wav '''
'''segments will be stored. If it doesn't exist, it will be '''
'''created.''')
parser.add_argument('--sad', default='noisemes_sad',
help='''(Optional) name of the sad tool that will be used to '''
'''analyze the snippets of sound''')
parser.add_argument('--diar',
                        help='''(Optional) name of the diarization tool that will be '''
                        '''used to analyze the snippets of sound. If this argument '''
                        '''is provided, the script runs in diarization mode; '''
                        '''otherwise it runs in SAD mode.''')
parser.add_argument('--mode', default='CHI', choices=['CHI', 'PCCONV', 'ACCA'],
                        help='''(Optional) the type of speech of interest. '''
                        '''Used only when the diar argument is provided.''')
args = parser.parse_args()
    if args.diar == 'yunitate' and args.mode in ('CHI', 'PCCONV', 'ACCA'):
        if args.chunk_sizes[0] == 10.0:
            print("Resetting chunk size to 20.0 seconds (more suitable for CHI/PCCONV/ACCA mode).")
            args.chunk_sizes[0] = 20.0
        if args.step == 600.0:
            print("Resetting step to 300.0 seconds (more suitable for CHI/PCCONV/ACCA mode).")
            args.step = 300.0
# Define Data dir
data_dir = "/vagrant"
# check if temp dir exist and create it if not
temp_abs = os.path.join(data_dir, args.temp)
temp_rel = args.temp # to launch SAD tool we need the relative path to temp
if not os.path.isdir(temp_abs):
os.makedirs(temp_abs)
# Define absolute path to wav file
wav_abs = os.path.join(data_dir, args.daylong)
# get percentage
perc = args.percentage / 100.0
# get duration
duration = get_audio_length(wav_abs)
# get list of onsets
onset_list = select_onsets(duration, args.step)
# call subprocess to extract the chunks
extract_chunks(wav_abs, onset_list, args.chunk_sizes[0], temp_abs)
# analyze using SAD tool
    run_model(temp_rel, temp_abs, args.sad, args.diar)
# sort by speech duration
sorted_files = read_analyses(temp_abs, args.sad, perc, args.diar, args.mode)
# get new onsets for two minutes chunks
new_onset_list = new_onsets_two_minutes(sorted_files)
# extract two minutes chunks
extract_chunks(wav_abs, new_onset_list, args.chunk_sizes[1], temp_abs)
# analyze using SAD tool
    run_model(temp_rel, temp_abs, args.sad, args.diar)
# sort by speech duration again
    child_aware = (args.mode == 'ACCA')
sorted_files = read_analyses(temp_abs, args.sad, perc, args.diar, args.mode, child_aware)
# get new onsets for five minutes chunks
new_onset_list = new_onsets_five_minutes(sorted_files)
# extract final five minutes long chunks
output_dir = os.path.dirname(wav_abs)
extract_chunks(wav_abs, new_onset_list, args.chunk_sizes[2], output_dir)
if __name__ == '__main__':
main()