# vad.py (forked from fortypercnt/stream-translator)

import torch
import warnings

warnings.filterwarnings("ignore")


class VAD:
    """Thin wrapper around a local Silero VAD TorchScript model ("silero_vad.jit")."""

    def __init__(self):
        self.model = init_jit_model("silero_vad.jit")

    def no_speech(self, audio):
        # True when Silero VAD finds no speech segments in the audio buffer.
        speech = get_speech_timestamps(torch.Tensor(audio), self.model, return_seconds=True)
        # print(speech)
        return len(speech) == 0


def init_jit_model(model_path: str,
                   device=torch.device('cpu')):
    torch.set_grad_enabled(False)
    model = torch.jit.load(model_path, map_location=device)
    model.eval()
    return model


def get_speech_timestamps(audio: torch.Tensor,
                          model,
                          threshold: float = 0.5,
                          sampling_rate: int = 16000,
                          min_speech_duration_ms: int = 250,
                          min_silence_duration_ms: int = 100,
                          window_size_samples: int = 1536,
                          speech_pad_ms: int = 30,
                          return_seconds: bool = False):
    """
    From https://github.com/snakers4/silero-vad/blob/master/utils_vad.py

    This method is used for splitting long audios into speech chunks using Silero VAD.

    Parameters
    ----------
    audio: torch.Tensor
        One-dimensional float torch.Tensor; other types are cast to torch if possible
    model: preloaded .jit Silero VAD model
    threshold: float (default - 0.5)
        Speech threshold. Silero VAD outputs speech probabilities for each audio chunk; probabilities ABOVE this value
        are considered SPEECH. It is better to tune this parameter for each dataset separately, but a "lazy" 0.5 is
        pretty good for most datasets.
    sampling_rate: int (default - 16000)
        Currently Silero VAD models support 8000 and 16000 sample rates
    min_speech_duration_ms: int (default - 250 milliseconds)
        Final speech chunks shorter than min_speech_duration_ms are thrown out
    min_silence_duration_ms: int (default - 100 milliseconds)
        At the end of each speech chunk, wait for min_silence_duration_ms before separating it
    window_size_samples: int (default - 1536 samples)
        Audio chunks of window_size_samples size are fed to the Silero VAD model.
        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for the 16000 sample rate and
        256, 512, 768 samples for the 8000 sample rate. Values other than these may affect model performance!
    speech_pad_ms: int (default - 30 milliseconds)
        Final speech chunks are padded by speech_pad_ms on each side
    return_seconds: bool (default - False)
        Whether to return timestamps in seconds (default - samples)

    Returns
    ----------
    speeches: list of dicts
        List containing beginnings and ends of speech chunks (in samples or seconds, depending on return_seconds)
    """
    model.reset_states()
    min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
    speech_pad_samples = sampling_rate * speech_pad_ms / 1000

    audio_length_samples = len(audio)

    # Run the model over consecutive fixed-size windows and collect a speech
    # probability for each window.
    speech_probs = []
    for current_start_sample in range(0, audio_length_samples, window_size_samples):
        chunk = audio[current_start_sample: current_start_sample + window_size_samples]
        if len(chunk) < window_size_samples:
            chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk))))
        speech_prob = model(chunk, sampling_rate).item()
        speech_probs.append(speech_prob)

    # Turn the probability sequence into segments with simple hysteresis:
    # a segment starts when a probability reaches `threshold` and only ends after
    # the probability stays below `neg_threshold` for at least min_silence_samples.
    triggered = False
    speeches = []
    current_speech = {}
    neg_threshold = threshold - 0.15
    temp_end = 0

    for i, speech_prob in enumerate(speech_probs):
        if (speech_prob >= threshold) and temp_end:
            temp_end = 0

        if (speech_prob >= threshold) and not triggered:
            triggered = True
            current_speech['start'] = window_size_samples * i
            continue

        if (speech_prob < neg_threshold) and triggered:
            if not temp_end:
                temp_end = window_size_samples * i
            if (window_size_samples * i) - temp_end < min_silence_samples:
                continue
            else:
                current_speech['end'] = temp_end
                if (current_speech['end'] - current_speech['start']) > min_speech_samples:
                    speeches.append(current_speech)
                temp_end = 0
                current_speech = {}
                triggered = False
                continue

    # Close a segment that is still open when the audio ends.
    if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
        current_speech['end'] = audio_length_samples
        speeches.append(current_speech)

    # Pad each segment by speech_pad_samples on both sides; when two segments are
    # separated by a gap shorter than 2 * speech_pad_samples, split the gap between them.
    for i, speech in enumerate(speeches):
        if i == 0:
            speech['start'] = int(max(0, speech['start'] - speech_pad_samples))
        if i != len(speeches) - 1:
            silence_duration = speeches[i + 1]['start'] - speech['end']
            if silence_duration < 2 * speech_pad_samples:
                speech['end'] += int(silence_duration // 2)
                speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - silence_duration // 2))
            else:
                speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
                speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - speech_pad_samples))
        else:
            speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))

    if return_seconds:
        for speech_dict in speeches:
            speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1)
            speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1)

    return speeches
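

# Illustrative usage sketch: assumes a 16 kHz mono float waveform and a
# "silero_vad.jit" checkpoint in the working directory (which is what
# VAD.__init__ loads). The one-second 440 Hz tone below is synthetic test
# input, so no_speech() would be expected to return True for it.
if __name__ == "__main__":
    import numpy as np

    sampling_rate = 16000
    t = np.linspace(0, 1.0, sampling_rate, endpoint=False)
    tone = 0.1 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

    vad = VAD()
    print("no speech detected:", vad.no_speech(tone))

    # Direct call with a custom threshold, timestamps returned in seconds.
    timestamps = get_speech_timestamps(torch.Tensor(tone), vad.model,
                                       threshold=0.6, return_seconds=True)
    print("speech segments:", timestamps)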