-
Notifications
You must be signed in to change notification settings - Fork 33
/
utils.py
104 lines (73 loc) · 3.08 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#! /usr/bin/env python
# encoding: utf-8
from __future__ import unicode_literals
import numpy as np
import soundfile as sf
class Audio(object):
    """An in-memory audio clip: a sample array plus its sample rate.

    ``data`` is stored two-dimensional with time along axis 0; a 1-D input
    is reshaped into a single column. When ``data`` is None, both ``data``
    and ``sample_rate`` are set to None.
    """

    def __init__(self, data, sample_rate):
        """Store ``data`` (array with a ``shape``) and ``sample_rate``."""
        if data is None:
            self.data, self.sample_rate = None, None
            return
        if len(data.shape) == 1:
            # Promote a flat signal to a single-column 2-D array.
            data = data[:, np.newaxis]
        self.data = data
        self.sample_rate = sample_rate

    @classmethod
    def from_file(cls, filename):
        """Build an Audio from whatever ``soundfile.read(filename)`` returns."""
        data, sample_rate = sf.read(filename)
        return cls(data, sample_rate)

    def trim_silences(self, threshold_db):
        """Drop leading and trailing silence in place.

        ``threshold_db`` is forwarded to ``trim`` as ``top_db``. The trim
        interval is computed on a 1-D signal and then applied along axis 0
        of the stored array, so every column is cut at the same positions
        and the array stays two-dimensional.
        """
        if self.data.shape[1] == 1:
            envelope = self.data[:, 0]
        else:
            # ``trim`` slices the *last* axis of its input; feeding it the
            # 2-D array would slice across columns instead of across time.
            # Detect silence on the column average instead.
            envelope = self.data.mean(axis=1)
        start, end = trim(envelope, top_db=threshold_db)[1]
        self.data = self.data[start:end]

    def write(self, filename):
        """Write the samples to ``filename`` through ``soundfile.write``."""
        sf.write(filename, self.data, self.sample_rate)

    def duration(self):
        """Return the number of rows in ``data`` divided by ``sample_rate``."""
        return float(len(self.data)) / float(self.sample_rate)
def power_to_db(S, amin=1e-10):
    """Convert power values to decibels relative to the largest magnitude.

    Magnitudes are floored at ``amin`` before the logarithm, so the peak
    maps to 0 dB and everything else is non-positive.
    """
    power = np.abs(S)
    peak = np.max(power)
    peak_db = 10.0 * np.log10(np.maximum(amin, peak))
    decibels = 10.0 * np.log10(np.maximum(amin, power))
    decibels -= peak_db
    return decibels
class ViewArray(object):
    """Minimal carrier for an ``__array_interface__`` dict.

    ``base`` is stored on the instance so the object that owns the
    underlying buffer stays referenced for as long as this one exists.
    """

    def __init__(self, interface, base):
        self.base = base
        self.__array_interface__ = interface
def as_strided(x, shape=None, strides=None):
    """Return a view of ``x`` with the given ``shape`` and byte ``strides``.

    Either argument left as None keeps the corresponding value of ``x``.
    No bounds checking is done: the caller must make sure the requested
    shape and strides stay inside the memory owned by ``x``.
    """
    # Imported locally so this function does not depend on extra
    # module-level imports.
    from numpy.lib.stride_tricks import as_strided as _np_as_strided

    # np.asarray copies only when it has to; ``np.array(x, copy=False)``
    # raises on NumPy 2 whenever a copy is required (e.g. list input).
    return _np_as_strided(np.asarray(x), shape=shape, strides=strides)
def frame(y, frame_length=2048, hop_length=512):
    """Slice a 1-D signal into overlapping frames without copying.

    Returns a strided view of shape ``(frame_length, n_frames)`` in which
    column ``t`` holds ``y[t * hop_length : t * hop_length + frame_length]``.

    Raises ValueError if ``y`` is not one-dimensional, if ``frame_length``
    or ``hop_length`` is smaller than 1, or if ``y`` is shorter than one
    frame.
    """
    y = np.asarray(y)
    if y.ndim != 1:
        raise ValueError(
            'frame() expects a 1-D signal, got ndim={}'.format(y.ndim))
    if frame_length < 1 or hop_length < 1:
        raise ValueError('frame_length and hop_length must be >= 1')
    if len(y) < frame_length:
        # A strided view over a too-short buffer would read memory past
        # the end of ``y``.
        raise ValueError(
            'signal length {} is shorter than frame_length {}'.format(
                len(y), frame_length))
    n_frames = 1 + (len(y) - frame_length) // hop_length
    # Use the array's real element stride so non-contiguous views (for
    # example ``y[::2]``) are framed correctly.
    step = y.strides[0]
    return as_strided(y, shape=(frame_length, n_frames),
                      strides=(step, hop_length * step))
def rmse(y, frame_length=2048, hop_length=512):
    """Root-mean-square value of each frame of ``y``.

    The signal is reflect-padded by ``frame_length // 2`` on both sides
    before framing. The result keeps a leading axis of length 1.
    """
    half_window = int(frame_length // 2)
    padded = np.pad(y, half_window, mode='reflect')
    frames = frame(padded, frame_length=frame_length, hop_length=hop_length)
    mean_power = np.mean(np.abs(frames) ** 2, axis=0, keepdims=True)
    return np.sqrt(mean_power)
def _signal_to_frame_nonsilent(y, frame_length=2048, hop_length=512, top_db=60):
    """Flag the frames of ``y`` whose power is within ``top_db`` dB of the peak.

    Returns the boolean result of comparing each frame's power (in dB
    relative to the loudest frame) against ``-top_db``.
    """
    frame_rms = rmse(y=y, frame_length=frame_length, hop_length=hop_length)
    frame_power = frame_rms ** 2
    frame_db = power_to_db(frame_power.squeeze())
    return frame_db > - top_db
def frames_to_samples(frames, hop_length=512, n_fft=None):
    """Convert frame indices to sample indices.

    ``frames`` may be a scalar or array-like; the result is always an
    integer array with at least one dimension. Each frame index is
    multiplied by ``hop_length``. When ``n_fft`` is given, ``n_fft // 2``
    is added to account for frame centring; with the default ``None`` no
    offset is applied.
    """
    samples = np.atleast_1d(frames) * hop_length
    if n_fft is not None:
        samples = samples + int(n_fft // 2)
    return samples.astype(int)
def trim(y, top_db=25, frame_length=2048, hop_length=512):
    """Cut leading and trailing silence from ``y``.

    A frame counts as non-silent when its power is within ``top_db`` dB of
    the loudest frame. Returns ``(trimmed, index)`` where ``trimmed`` is
    ``y`` sliced along its last axis and ``index`` is
    ``np.asarray([start, end])``, the sample interval that was kept. If no
    frame is non-silent the result is empty and ``index`` is ``[0, 0]``.
    """
    non_silent = _signal_to_frame_nonsilent(y,
                                            frame_length=frame_length,
                                            hop_length=hop_length,
                                            top_db=top_db)
    nonzero = np.flatnonzero(non_silent)
    if nonzero.size > 0:
        # frames_to_samples returns a 1-element array; take the element
        # explicitly rather than calling int() on the array.
        start = int(frames_to_samples(nonzero[0], hop_length)[0])
        end = min(y.shape[-1],
                  int(frames_to_samples(nonzero[-1] + 1, hop_length)[0]))
    else:
        # Nothing above the threshold: keep an empty interval.
        start, end = 0, 0
    full_index = [slice(None)] * y.ndim
    full_index[-1] = slice(start, end)
    # Multi-dimensional indexing needs a tuple; recent NumPy rejects a
    # list of slices.
    return y[tuple(full_index)], np.asarray([start, end])