# preprocess_audio.py
# Forked from Rayhane-mamah/Tacotron-2.
from scipy.io.wavfile import write
import librosa
import numpy as np
import argparse
import os
# Audio settings shared by the preprocessing code in this script.
sr = 22050  # sample rate used both when loading and when writing wavs
max_wav_value = 32768.0  # scales the normalized float audio before the int16 cast

# Parameters handed to librosa.effects.trim.
trim_top_db = 40  # passed as top_db
trim_fft_size = 1024  # passed as frame_length
trim_hop_size = 256  # passed as hop_length; also multiplied by the -s padding count
def _rewrite_wav(wav_file, silence_audio_size):
    """Normalize, trim, pad and overwrite a single wav file.

    Returns True when the file was rewritten and False when it was left
    untouched because the audio is empty or completely silent.
    """
    # `sr` is passed by keyword: newer librosa releases reject it positionally.
    data, _ = librosa.load(wav_file, sr=sr)

    peak = np.abs(data).max() if data.size else 0.0
    if not peak > 0:
        # Dividing by a zero peak would produce NaNs and the original
        # recording would then be overwritten with garbage samples.
        print('skipping silent or empty audio: ' + wav_file)
        return False

    # Peak-normalize slightly below full scale so the int16 cast cannot wrap.
    data = data / peak * 0.999
    trimmed = librosa.effects.trim(
        data,
        top_db=trim_top_db,
        frame_length=trim_fft_size,
        hop_length=trim_hop_size,
    )[0]

    tail_silence = np.zeros(max(int(silence_audio_size), 0))
    samples = np.concatenate([trimmed * max_wav_value, tail_silence])
    write(wav_file, sr, samples.astype(np.int16))
    return True


def preprocess_audio(file_list, dataset_prefix, silence_audio_size):
    """Normalize, silence-trim and pad every wav listed in the metadata files.

    Only the first ``|``-separated field of each metadata line is used; it is
    joined onto ``dataset_prefix`` to form the wav path.  Each wav is loaded
    at ``sr``, peak-normalized to 0.999, trimmed with ``librosa.effects.trim``,
    scaled by ``max_wav_value``, padded with trailing zeros, cast to int16 and
    written back over the ORIGINAL file (the operation is destructive).

    Args:
        file_list: iterable of metadata file paths.
        dataset_prefix: directory joined in front of every wav name.
        silence_audio_size: number of zero samples appended after trimming.
    """
    for metadata_path in file_list:
        with open(metadata_path) as metadata_file:
            rows = metadata_file.readlines()
        print('=' * 5 + metadata_path + '=' * 5)

        for i, row in enumerate(rows):
            # Drop the line terminator so a row without '|' still yields a
            # clean name, and skip blank rows instead of loading the
            # dataset directory itself.
            wav_name = row.rstrip('\r\n').split('|')[0]
            if wav_name.strip():
                wav_file = os.path.join(dataset_prefix, wav_name)
                _rewrite_wav(wav_file, silence_audio_size)
            if i % 100 == 0:
                print(i)
if __name__ == "__main__":
"""
usage
python preprocess_audio.py -f=namgung-highquality/metadata.csv -p=namgung-highquality/wavs -s=3
python preprocess_audio.py -f=namgung-old/metadata.csv -p=namgung-old/wavs -s=0
"""
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--file_list', type=str,
help='file list to preprocess')
parser.add_argument('-s', '--silence_mel_padding', type=int, default=0,
help='silence audio size is hop_length * silence mel padding')
parser.add_argument('-p', '--dataset_prefix', type=str,
help='audio path = dataset_prefix + file_prefix')
args = parser.parse_args()
file_list = args.file_list.split(',')
dataset_prefix = args.dataset_prefix
silence_audio_size = trim_hop_size * args.silence_mel_padding
preprocess_audio(file_list, dataset_prefix, silence_audio_size)