extract_features.sh
#!/bin/bash
# activate conda environment (site-specific paths; adjust for your setup)
conda=/share/mini1/sw/std/python/anaconda3-2019.07/v3.7
conda_env=torch_1.9
source $conda/bin/activate $conda_env
# stage
stage=1
stop_stage=1
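# steps: 1) spectrogram extraction   2) linguistic representations
#        3) prosodic representations 4) speaker representations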
# set up
dataset=libritts # vctk or libritts
linguistic_encoder=vqwav2vec
speaker_encoder=utt_dvec
prosodic_encoder=ppgvc_f0
decoder=vits
vocoder=none
. bin/parse_options.sh || exit 1;
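# the variables above can be overridden from the command line, e.g.
# (assuming the usual Kaldi-style parse_options.sh):
#   ./extract_features.sh --stage 1 --stop_stage 4 --dataset vctk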
# decide feature_type (mel, vits_spec, ppgvc_mel or bigvgan_mel) based on vocoder and decoder
if [ "$vocoder" == "ppgvc_hifigan" ]; then
    feature_type=ppgvc_mel
elif [ "$vocoder" == "bigvgan" ]; then
    feature_type=bigvgan_mel
elif [ "$decoder" == "vits" ]; then
    feature_type=vits_spec
else
    feature_type=mel
fi
if [ "$dataset" == "vctk" ]; then
train_split=train_nodev_all
dev_split=dev_all
eval_split=eval_all
splits="train_nodev_all dev_all eval_all"
elif [ "$dataset" == "libritts" ]; then
train_split=train_nodev_clean
dev_split=dev_clean
eval_split=eval_clean
splits="train_nodev_clean dev_clean eval_clean"
else
exit 1;
fi
# step 1: spectrogram extraction
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "extract $feature_type for $dataset"
    ./bin/feature_extraction.sh $dataset $feature_type "$splits"
    echo "done!"
    if [ "$feature_type" == "mel" ] || [ "$feature_type" == "bigvgan_mel" ]; then
        # normalize as in parallel_wavegan
        echo "compute_statistics $feature_type for $dataset"
        stats_path=dump/$dataset/$train_split/$feature_type/${train_split}.npy
        ./bin/compute_statistics.sh $dataset $train_split $feature_type
        echo "done!"
        echo "normalize $feature_type for $dataset"
        ./bin/normalize.sh $dataset "$splits" $feature_type $stats_path
        echo "done!"
    fi
fi
# step 2: linguistic representation extraction
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "start extracting $linguistic_encoder representations"
    ./bin/${linguistic_encoder}_feature_extraction.sh "$splits" $dataset
    echo "done!"
fi
# step 3: prosodic representation extraction
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "extract $prosodic_encoder for $dataset"
    ./bin/feature_extraction.sh $dataset $prosodic_encoder "$splits"
    echo "done!"
    if [ "$prosodic_encoder" == "fastspeech2_pitch_energy" ]; then
        # normalize pitch & energy
        echo "compute_statistics $prosodic_encoder for $dataset"
        stats_path=dump/$dataset/$train_split/$prosodic_encoder/${train_split}.npy
        ./bin/compute_statistics.sh $dataset $train_split $prosodic_encoder
        echo "done!"
        echo "normalize $prosodic_encoder for $dataset"
        ./bin/normalize.sh $dataset "$splits" $prosodic_encoder $stats_path
        echo "done!"
    fi
fi
# step 4: speaker representation extraction
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
    echo "extract $speaker_encoder for $dataset"
    ./bin/${speaker_encoder}_feature_extraction.sh "$splits" $dataset
    echo "done!"
fi