forked from pabdzadeh/voice-spoof-detection-system
-
Notifications
You must be signed in to change notification settings - Fork 1
/
feature_layers.py
171 lines (142 loc) · 5.92 KB
/
feature_layers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import torch.nn as nn
import torch
import tools.audio_utils
import librosa
import numpy as np
class CQT(nn.Module):
    """Constant-Q transform front end computed with librosa, in decibels.

    Every waveform in the batch is converted to a dB-scaled CQT magnitude
    (``librosa.cqt`` followed by ``librosa.amplitude_to_db`` with
    ``ref=np.max``) and the per-item results are stacked into one tensor.

    :param sampling_rate: sampling rate passed to ``librosa.cqt`` as ``sr``
    """

    def __init__(self, sampling_rate):
        super(CQT, self).__init__()
        self.sampling_rate = sampling_rate

    def forward(self, x, device):
        """Compute the dB-scaled CQT of every item in ``x``.

        :param x: batch of waveforms; iterated along its first dimension
        :param device: device the stacked result is moved to
        :return: float32 tensor of shape (batch, bins, frames) on ``device``
        """
        features = []
        for item in x:
            # librosa works on CPU NumPy arrays; detach()/cpu() make this safe
            # for CUDA tensors and for tensors that track gradients.
            waveform = item.detach().cpu().numpy()
            item_cqt = librosa.cqt(waveform, sr=self.sampling_rate)
            item_db = librosa.amplitude_to_db(np.abs(item_cqt), ref=np.max)
            features.append(torch.from_numpy(item_db).float())

        if not features:
            # Nothing to stack: keep the shape historically returned for an
            # empty batch.
            return torch.zeros(0, 84, 126).to(device)

        # Stack on the CPU and transfer once, using the real feature shape
        # instead of a hard-coded (84, 126) buffer.
        return torch.stack(features).to(device)
class Spectrogram(nn.Module):
    """STFT magnitude front end computed with librosa, in decibels.

    Every waveform in the batch is converted to a dB-scaled STFT magnitude
    (``librosa.stft`` followed by ``librosa.amplitude_to_db`` with
    ``ref=np.max``) and the per-item results are stacked into one tensor.

    :param n_fft: FFT size passed to ``librosa.stft``
    """

    def __init__(self, n_fft):
        super(Spectrogram, self).__init__()
        self.n_fft = n_fft

    def forward(self, x, device):
        """Compute the dB-scaled spectrogram of every item in ``x``.

        :param x: batch of waveforms; iterated along its first dimension
        :param device: device the stacked result is moved to
        :return: float32 tensor of shape (batch, bins, frames) on ``device``
        """
        features = []
        for item in x:
            # librosa works on CPU NumPy arrays; detach()/cpu() make this safe
            # for CUDA tensors and for tensors that track gradients.
            waveform = item.detach().cpu().numpy()
            item_stft = librosa.stft(waveform, n_fft=self.n_fft)
            item_db = librosa.amplitude_to_db(np.abs(item_stft), ref=np.max)
            features.append(torch.from_numpy(item_db).float())

        if not features:
            # Nothing to stack: keep the shape historically returned for an
            # empty batch.
            return torch.zeros(0, 1025, 126).to(device)

        # Stack on the CPU and transfer once, using the real feature shape
        # instead of a hard-coded (1025, 126) buffer that only fits one
        # particular n_fft / clip-length combination.
        return torch.stack(features).to(device)
class LinearDCT(nn.Linear):
    """Implement any DCT as a linear layer; in practice this executes around
    50x faster on GPU. Unfortunately, the DCT matrix is stored, which will
    increase memory usage.

    :param in_features: size of expected input
    :param type: which dct function to use: 'dct1', 'idct1', 'dct' or 'idct'
    :param norm: normalisation passed to the 'dct' / 'idct' functions
    :param bias: whether the layer carries a bias term (initialised to zero)
    :raises ValueError: if ``type`` is not one of the supported names
    """

    def __init__(self, in_features, type, norm=None, bias=False):
        # These attributes must exist before nn.Linear.__init__ runs, because
        # it calls reset_parameters(), which reads them.
        self.type = type
        self.N = in_features
        self.norm = norm
        super(LinearDCT, self).__init__(in_features, in_features, bias=bias)

    def reset_parameters(self):
        """Fill the weight with the selected transform of the identity."""
        # initialise using dct function
        identity = torch.eye(self.N)
        if self.type == 'dct1':
            self.weight.data = tools.audio_utils.dct1(identity).data.t()
        elif self.type == 'idct1':
            self.weight.data = tools.audio_utils.idct1(identity).data.t()
        elif self.type == 'dct':
            self.weight.data = tools.audio_utils.dct(
                identity, norm=self.norm).data.t()
        elif self.type == 'idct':
            self.weight.data = tools.audio_utils.idct(
                identity, norm=self.norm).data.t()
        else:
            # Falling through would leave the weight as uninitialised memory.
            raise ValueError(
                "unsupported DCT type {!r}; expected one of "
                "'dct1', 'idct1', 'dct', 'idct'".format(self.type))
        self.weight.requires_grad = False  # don't learn this!

        # nn.Linear allocates the bias uninitialised and relies on
        # reset_parameters() to fill it; zero keeps the layer an exact
        # transform at initialisation.
        if self.bias is not None:
            nn.init.zeros_(self.bias)
class LFCC(nn.Module):
    """Linear-frequency cepstral coefficient (LFCC) front end.

    Based on asvspoof.org baseline Matlab code.
    Difference: with_energy is added to set the first dimension as energy
    """

    def __init__(self, fl, fs, fn, sr, filter_num,
                 with_energy=False, with_emphasis=True,
                 with_delta=True, flag_for_LFB=False):
        """ Initialize LFCC

        Para:
        -----
          fl: int, frame length, (number of waveform points)
          fs: int, frame shift, (number of waveform points)
          fn: int, FFT points
          sr: int, sampling rate (Hz)
          filter_num: int, number of filters in filter-bank
          with_energy: bool, (default False), whether replace 1st dim to energy
          with_emphasis: bool, (default True), whether pre-emphasize input wav
          with_delta: bool, (default True), whether use delta and delta-delta
          flag_for_LFB: bool (default False), if True the DCT is skipped and
              the log filter-bank output is used directly
        """
        super(LFCC, self).__init__()
        self.fl = fl
        self.fs = fs
        self.fn = fn
        self.sr = sr
        self.filter_num = filter_num

        # build the triangle filter bank:
        # f holds fn // 2 + 1 linearly spaced frequencies from 0 to sr / 2,
        # filter_bands holds the filter_num + 2 edges shared by the triangles.
        f = (sr / 2) * torch.linspace(0, 1, fn // 2 + 1)
        filter_bands = torch.linspace(f.min(), f.max(), filter_num + 2)

        filter_bank = torch.zeros([fn // 2 + 1, filter_num])
        for idx in range(filter_num):
            filter_bank[:, idx] = tools.audio_utils.trimf(
                f, [filter_bands[idx],
                    filter_bands[idx + 1],
                    filter_bands[idx + 2]])
        # Stored as a non-trainable parameter so it follows the module across
        # devices and into the state dict.
        self.lfcc_fb = nn.Parameter(filter_bank, requires_grad=False)

        # DCT as a linear transformation layer
        self.l_dct = LinearDCT(filter_num, 'dct', norm='ortho')

        # opts
        self.with_energy = with_energy
        self.with_emphasis = with_emphasis
        self.with_delta = with_delta
        self.flag_for_LFB = flag_for_LFB

    def forward(self, x):
        """
        input:
        ------
          x: tensor(batch, length), where length is waveform length
             (left unmodified)

        output:
        -------
          lfcc_output: tensor(batch, frame_num, dim_num)
        """
        # pre-emphasis, built out of place so the caller's waveform is not
        # overwritten (an in-place update would compound on repeated calls)
        if self.with_emphasis:
            x = torch.cat((x[:, :1], x[:, 1:] - 0.97 * x[:, :-1]), dim=1)

        # STFT; return_complex is given explicitly because current PyTorch
        # requires it for real-valued input
        x_stft = torch.stft(x, self.fn, self.fs, self.fl,
                            window=torch.hamming_window(
                                self.fl, device=x.device),
                            onesided=True, pad_mode="constant",
                            return_complex=True)

        # power spectrum, rearranged to (batch, frame, frequency bin)
        sp_amp = x_stft.abs().pow(2).permute(0, 2, 1).contiguous()

        # filter bank; eps keeps log10 finite for silent frames
        fb_feature = torch.log10(torch.matmul(sp_amp, self.lfcc_fb) +
                                 torch.finfo(torch.float32).eps)

        # DCT (if necessary, remove DCT)
        lfcc = self.l_dct(fb_feature) if not self.flag_for_LFB else fb_feature

        # Add energy: overwrite coefficient 0 with the log frame energy
        if self.with_energy:
            power_spec = sp_amp / self.fn
            energy = torch.log10(power_spec.sum(dim=2) +
                                 torch.finfo(torch.float32).eps)
            lfcc[:, :, 0] = energy

        # Add delta coefficients along the last dimension
        if self.with_delta:
            lfcc_delta = tools.audio_utils.delta(lfcc)
            lfcc_delta_delta = tools.audio_utils.delta(lfcc_delta)
            lfcc_output = torch.cat((lfcc, lfcc_delta, lfcc_delta_delta), 2)
        else:
            lfcc_output = lfcc

        # done
        return lfcc_output