From 0e0ed5b4cfb9f6928534a4c12ce5604d4f396ffc Mon Sep 17 00:00:00 2001 From: mimbres <1ronyar@gmail.com> Date: Fri, 25 May 2018 18:45:22 +0900 Subject: [PATCH] live loader added --- FFTNet_dilconv.py | 6 +- FFTNet_split.py | 6 +- __pycache__/FFTNet_dilconv.cpython-36.pyc | Bin 3050 -> 3068 bytes data/processed_slt_arctic/scale_factors.npy | Bin 31767 -> 31724 bytes train_with_liveloader.py | 232 ++++++++++++++++++ .../live_loading_dataset.cpython-36.pyc | Bin 0 -> 4787 bytes util/live_loading_dataset.py | 145 +++++++++++ util/loading_dataset.py | 106 ++++++-- util/preprocessing_cmu_arctic.py | 13 +- 9 files changed, 477 insertions(+), 31 deletions(-) create mode 100644 train_with_liveloader.py create mode 100644 util/__pycache__/live_loading_dataset.cpython-36.pyc create mode 100644 util/live_loading_dataset.py diff --git a/FFTNet_dilconv.py b/FFTNet_dilconv.py index 4975d8b..460e2d1 100644 --- a/FFTNet_dilconv.py +++ b/FFTNet_dilconv.py @@ -40,7 +40,7 @@ def fftnet_residual(input_dim=256, num_layer=11, io_ch=256, skip_ch=256): ''' class FFT_Block(nn.Module): - def __init__(self, cond_dim=25, io_ch=int, recep_sz=int, bias=True): + def __init__(self, cond_dim=26, io_ch=int, recep_sz=int, bias=True): super(FFT_Block, self).__init__() self.cond_dim=cond_dim # Number of dimensions of condition input @@ -72,11 +72,11 @@ def forward(self, x, cond): - [11 FFT_blocks] --> [FC_layer] --> [softmax] ''' class FFTNet(nn.Module): - def __init__(self, input_dim=256, cond_dim=25, num_layer=11, io_ch=256, skip_ch=0, bias=True): + def __init__(self, input_dim=256, cond_dim=26, num_layer=11, io_ch=256, skip_ch=0, bias=True): super(FFTNet, self).__init__() self.input_dim = input_dim # 256 (=num_classes) - self.cond_dim = cond_dim # 25 + self.cond_dim = cond_dim # 26 self.num_layer = num_layer # 11 self.io_ch = io_ch # 256 ch. in the paper self.skip_ch = skip_ch # Not implemented yet (no skip channel in the paper) diff --git a/FFTNet_split.py b/FFTNet_split.py index 789bae7..da93bf4 100644 --- a/FFTNet_split.py +++ b/FFTNet_split.py @@ -38,7 +38,7 @@ def fftnet_residual(input_dim=256, num_layer=11, io_ch=256, skip_ch=256): ''' class FFT_Block(nn.Module): - def __init__(self, is_first_block=True, initial_input_dim=256, cond_dim=25, io_ch=int, recep_sz=int, bias=True): + def __init__(self, is_first_block=True, initial_input_dim=256, cond_dim=26, io_ch=int, recep_sz=int, bias=True): super(FFT_Block, self).__init__() self.is_first_block = is_first_block # If True, an input_embedding_layer will be created (this is not clear in the paper). 
@@ -111,11 +111,11 @@ def forward(self, x, cond):
 - [11 FFT_blocks] --> [FC_layer] --> [softmax]
 '''
 class FFTNet(nn.Module):
-    def __init__(self, input_dim=256, cond_dim=25, num_layer=11, io_ch=256, skip_ch=0, bias=True ):
+    def __init__(self, input_dim=256, cond_dim=26, num_layer=11, io_ch=256, skip_ch=0, bias=True ):
         super(FFTNet, self).__init__()
 
         self.input_dim = input_dim    # 256 (=num_classes)
-        self.cond_dim = cond_dim      # 25
+        self.cond_dim = cond_dim      # 26
         self.num_layer = num_layer    # 11
         self.io_ch = io_ch
         self.skip_ch = skip_ch
diff --git a/__pycache__/FFTNet_dilconv.cpython-36.pyc b/__pycache__/FFTNet_dilconv.cpython-36.pyc
index 026f251dda56d8beecee8e56ed9a92e61bab563b..86a3941d9d28233f1120c43015d0034bb1d3046e 100644
GIT binary patch
[binary delta omitted]

diff --git a/data/processed_slt_arctic/scale_factors.npy b/data/processed_slt_arctic/scale_factors.npy
GIT binary patch
[binary delta omitted]

diff --git a/train_with_liveloader.py b/train_with_liveloader.py
new file mode 100644
index 0000000..2cd3a59
--- /dev/null
+++ b/train_with_liveloader.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed May 2 21:11:57 2018
+@author: sungkyun
+
+FFTNet: A REAL-TIME SPEAKER-DEPENDENT NEURAL VOCODER (Zeyu Jin et al., 2018, ICASSP)
+
+*CMU Arctic dataset:
+    - 1032 utterances for train
+    - 100 utterances for test
+    - cond. input = F0 and 25-dim MCC
+
+*Preprocessing:
+    - 16 kHz sampling, mono audio
+    - 8-bit mu-law encoded wav
+
+*Architecture:
+    - receptive field = 2048 (2^11)
+    - 11 FFT-layers (256 ch.)
+    - Total 1M parameters
+    - final 2 layers = FC() -> softmax(256) : see details in paper 2.3.2
+
+*Training:
+    - minibatch of 5 x 5000 samples
+    - 100,000 steps = 500 iters
+
+*Requirements:
+    pip install soundfile
+    pip install librosa
+
+"""
+import os
+import argparse
+import numpy as np
+import torch
+#import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+from torch.utils.data import DataLoader
+#from util.loading_dataset import YesNoDataset  # required for loading the YesNo dataset
+from util.live_loading_dataset import CmuArcticLiveDataset
+from util.utils import mu_law_decode  # required for auditioning generated samples
+
+
+# Parsing arguments
+parser = argparse.ArgumentParser(description='FFTNet implementation')
+parser.add_argument('-exp', '--exp_name', type=str, default='00', metavar='STR',
+                    help='Generated samples will be located in the checkpoints/exp directory. Default="00"')
+parser.add_argument('-btr', '--batch_train', type=int, default=1, metavar='N',
+                    help='Batch size for training. e.g. -btr 5')
+parser.add_argument('-bts', '--batch_test', type=int, default=1, metavar='N',
+                    help='Batch size for test. e.g. -bts 5')
+parser.add_argument('-load', '--load', type=str, default=None, metavar='STR',
+                    help='e.g. --load checkpoints/exp00/checkpoint_00')
+args = parser.parse_args()
+
+USE_GPU = torch.cuda.is_available()
+RAND_SEED = 0
+
+#%% Loading data
+dset_train = CmuArcticLiveDataset(train_flag=True, cond_sel='mfcc', cache_size=1000)  # wav is cached in memory.
+dset_test = CmuArcticLiveDataset(train_flag=False, cond_sel='mfcc', cache_size=1)
+
+
+train_loader = DataLoader(dset_train,
+                          batch_size=args.batch_train,
+                          shuffle=True,
+                          num_workers=6,
+                          pin_memory=True
+                          )  # num_workers = number of CPU threads; in practice, num_workers = 4 * num_GPU
+
+test_loader = DataLoader(dset_test,
+                         batch_size=args.batch_test,
+                         shuffle=False,
+                         num_workers=8,
+                         pin_memory=True,
+                         )
+
+
+
+#%% Train & Test Functions
+def train(epoch):
+    model.train()
+    train_loss = 0.
+    train_acc = []
+    total_data_sz = len(train_loader.dataset)  # number of examples (len(train_loader) is the number of batches)
+
+    for batch_idx, (_, X_mulaw, X_mfcc) in enumerate(train_loader):
+        if USE_GPU:
+            #X_mulaw, X_mfcc = X_mulaw.cuda(), X_mfcc.cuda()
+            X_mulaw, X_mfcc = Variable(X_mulaw.cuda()), Variable(X_mfcc.cuda().float())
+            #X_mulaw, X_mfcc = Variable(X_mulaw.long()), Variable(X_mfcc.half())  # only for fp16-supported GPUs!
+        else:
+            X_mulaw, X_mfcc = Variable(X_mulaw), Variable(X_mfcc.float())
+
+        optimizer.zero_grad()
+        y = model(X_mulaw, X_mfcc)
+        loss = F.cross_entropy(input=y.view(-1, 256), target=X_mulaw.view(-1), size_average=True)
+        # input=y.view(-1, num_classes), where num_classes=256 for 8-bit mu-law
+        # target=X_mulaw.view(-1)
+        loss.backward()
+        optimizer.step()
+
+        # Accuracy
+        pred = y.view(-1, 256).data.max(1, keepdim=True)[1]  # Get the index of the max log-probability from y
+        acc = pred.eq(X_mulaw.view(-1).data.view_as(pred)).cpu().sum().numpy() / len(pred)  # Compare pred with X_mulaw
+        print('Train Epoch: {} [{}/{}], Loss = {:.6f}, Acc = {:.6f}'.format(
+            epoch, batch_idx * train_loader.batch_size, total_data_sz, loss.item(), acc))
+
+        train_loss += loss.item()
+        train_acc.append(acc)
+
+    return train_loss, np.mean(train_acc)
+
+
+
+
+def generator(test_file_id, out_filename, recep_sz=2048, verbose=1000):
+    '''
+    Annotation
+    - X_mfcc : Original condition input
+    - total_samples : The number of samples to generate (= size(X_mfcc))
+    - x_mulaw_slice : A temporary input replacing X_mulaw.
+                      At the start, all zeros (encoded as 128).
+    - x_mfcc_slice : A temporary condition input.
+    - y : One-sample output of the model. It is fed back into the right-most column of x_mulaw_slice.
+    - pred : Prediction of one newly generated sample
+    - out : Collection of mu_law_decode(pred)s.
+    '''
+    _, _, X_mfcc = test_loader.dataset.__getitem__(test_file_id)  # X_mfcc: (CxL) np.ndarray
+    feat_dim = X_mfcc.shape[0]
+    total_samples = X_mfcc.shape[1]
+    X_mfcc = X_mfcc.reshape(1, feat_dim, total_samples)  # BxCxL
+
+    # Initial input slices, filled with zeros (mu-law code 128).
+    x_mulaw_slice = Variable(torch.zeros(1, recep_sz).long() + 128, requires_grad=False).cuda()  # all "zeros" (128).
+    x_mfcc_slice = Variable(torch.zeros(1, feat_dim, recep_sz), requires_grad=False).cuda()
+    out = []
+
+    model.eval()  # eval mode; gradients are already avoided via requires_grad=False above.
+
+    for i in range(total_samples):
+        # New x_mfcc_slice: shift-left, then fill one 'cond' sample into the right-most column
+        x_mfcc_slice[:, :, 0:-1] = x_mfcc_slice[:, :, 1:]
+        x_mfcc_slice[:, :, -1] = torch.FloatTensor(X_mfcc[:, :, i])
+
+        y = model(x_mulaw_slice, x_mfcc_slice, gen_mod=True)  # 1x1x256 (BxLxC)
+        pred = y.view(-1, 256).data.max(1, keepdim=True)[1]  # Predict: get the index of the max log-probability from y
+
+        # New x_mulaw_slice: shift-left, then fill 'pred' into the right-most column
+        x_mulaw_slice[0, 0:-1] = x_mulaw_slice[0, 1:]  # Shift-left
+        x_mulaw_slice[0, -1] = pred                    # Push 'pred'
+
+        # Collect the generated sample
+        out.append(float(mu_law_decode(pred.cpu().numpy())))
+
+        # Print progress
+        if i % verbose == 0:
+            print('Generator: {}/{} samples ({:.2f}%)'.format(i, total_samples,
+                  100 * i / total_samples))
+
+    # Save audio
+    import librosa
+    librosa.output.write_wav(out_filename, np.asarray(out), sr=16000)
+
+
+
+def validate(epoch):
+    model.eval()
+    val_loss = 0.
+    # Not implemented yet
+    return val_loss
+
+
+def load_checkpoint(filepath):
+    '''
+    Load a pre-trained model.
+    '''
+    dt = torch.load(filepath)
+    model.load_state_dict(dt['state_dict'])
+    optimizer.load_state_dict(dt['optimizer'])
+    return 0
+
+def save_checkpoint(state, accuracy, exp_name):
+    checkpoint_dir = 'checkpoints/' + exp_name
+    os.makedirs(checkpoint_dir, exist_ok=True)
+    filepath = checkpoint_dir + '/checkpoint.pth.tar'
+    torch.save(state, filepath)
+
+def history_recorder():
+    '''
+    history_recorder():
+        - save training history as .csv files.
+        - save learning curves as .png files.
+    '''
+    return 0
+
+def print_model_sz(model):
+    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
+    print('Number of trainable parameters = {}'.format(sum([np.prod(p.size()) for p in model_parameters])))
+
+#%% Experiment: train
+from FFTNet_dilconv import FFTNet   # <-- implemented using 2x1 dilated conv.
+#from FFTNet_split import FFTNet   # <-- same as the paper
+
+model = FFTNet(cond_dim=26).cuda() if USE_GPU else FFTNet(cond_dim=26).cpu()
+optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-5)
+print_model_sz(model)
+
+
+#load_checkpoint('checkpoints/00/checkpoint.pth.tar')
+
+
+for epoch in range(100):
+    torch.manual_seed(RAND_SEED + epoch)
+    tr_loss, tr_acc = train(epoch)
+
+
+save_checkpoint({'epoch': epoch,
+                 'state_dict': model.state_dict(),
+                 'optimizer': optimizer.state_dict(),},
+                tr_acc, args.exp_name)
+
+#%% Experiment: generation
+test_file_id = 0  # Select 0~5 for different condition inputs
+generator(test_file_id=test_file_id, out_filename='aaa.wav')
+
diff --git a/util/__pycache__/live_loading_dataset.cpython-36.pyc b/util/__pycache__/live_loading_dataset.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff4ea428b155d168f1c57a4b62a656a10fba83cb
GIT binary patch
[binary literal omitted]

diff --git a/util/live_loading_dataset.py b/util/live_loading_dataset.py
new file mode 100644
--- /dev/null
+++ b/util/live_loading_dataset.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Live-loading dataset for CMU Arctic: raw wavs are cached in memory and
+conditional features are extracted on the fly.
+@author: sungkyun
+"""
+import glob
+import numpy as np
+import torch
+import librosa
+import pyworld
+from torch.utils.data.dataset import Dataset
+from nnmnkwii.datasets import FileDataSource, FileSourceDataset, MemoryCacheFramewiseDataset
+from nnmnkwii.preprocessing import scale
+from util.utils import mu_law_encode
+
+DATA_ROOT = 'data/cmu_arctic/'  # raw wav root; its parent directory holds processed_slt_arctic/
+
+
+class WavSource(FileDataSource):
+    def __init__(self, data_root=DATA_ROOT, file_sel_range=None, target_sr=16000):
+        self.data_root = data_root
+        self.file_sel_range = file_sel_range  # e.g. [0, 1000] selects the first 1000 files
+        self.target_sr = target_sr
+
+    def collect_files(self):
+        files = sorted(glob.glob(self.data_root + '**/*.wav', recursive=True))
+        if self.file_sel_range is not None:
+            files = files[self.file_sel_range[0]:self.file_sel_range[1]]
+        return files
+
+    def collect_features(self, path):
+        # Preprocessing: 1. resampling --> 2. pre-emphasis --> 3. 8bit mu-law
+        x, fs = librosa.load(path, sr=self.target_sr, mono=True, dtype=np.float16)
+        return x * 1.3  # gain only; pre-emphasis is not applied yet
+
+
+
+
+class CmuArcticLiveDataset(Dataset):
+    def __init__(self, data_root_dir=DATA_ROOT, train_flag=True, cond_sel='mfcc', cache_size=1000, transform=None):
+
+        self.train_flag = train_flag
+        self.cond_sel = cond_sel   # 'mfcc' or 'pyspec'
+        self.cache_size = cache_size
+        self.data_root_dir = data_root_dir
+
+        if self.train_flag:
+            self.X = FileSourceDataset(WavSource(data_root=data_root_dir, file_sel_range=[0, 1000]))
+        else:
+            self.X = FileSourceDataset(WavSource(data_root=data_root_dir, file_sel_range=[1000, 1132]))
+            self.cache_size = 1
+
+        self.utt_lengths = [len(utt) for utt in self.X]
+        self.X_raw = MemoryCacheFramewiseDataset(self.X, self.utt_lengths, self.cache_size)
+        self.utt_total_length = len(self.X_raw)
+
+        self.sample_start, self.sample_end = list(), list()
+
+        # This initializes self.sample_start and self.sample_end
+        if self.train_flag:
+            self.rand_flush()
+        else:
+            self.init_for_test()
+
+        # Feature scaling factors
+        scf = np.load(self.data_root_dir + '../processed_slt_arctic/scale_factors.npy').item()
+        self.pyspec_max = np.max(scf['pyworld_max'][64:64+513])   # 11.159795
+        self.mfcc_mean = scf['melmfcc_mean'][128:128+25]
+        self.mfcc_std = scf['melmfcc_std'][128:128+25]
+        return None
+
+
+    def rand_flush(self):
+        print('Flush: Randomize sample selection in dataset...')
+        self.sample_start = [0]
+        while self.sample_start[-1] < self.utt_total_length - 5000:
+            self.sample_start.append(self.sample_start[-1] + np.random.randint(low=2500, high=5000))
+
+        self.sample_end = self.sample_start[1:]
+        self.sample_end.append(self.utt_total_length)
+        return None
+
+    def init_for_test(self):
+        self.sample_start = [0]
+        for utt_length in self.utt_lengths[:-1]:   # exclude the last length: one (start, end) pair per utterance
+            self.sample_start.append(self.sample_start[-1] + utt_length)
+
+        self.sample_end = self.sample_start[1:]
+        self.sample_end.append(self.utt_total_length)
+        assert(len(self.sample_end) == len(self.sample_start))
+
+
+
+    def __getitem__(self, index):
+
+        if self.train_flag:
+            # x: zero-padded raw audio
+            x = np.zeros(5000, dtype=np.float64)
+            zpad_end = 5000 - (self.sample_end[index] - self.sample_start[index])
+            x[zpad_end:] = self.X_raw[self.sample_start[index]:self.sample_end[index]].astype(np.float64)
+            # assert(len(x) == 5000)
+        else:
+            x = self.X_raw[self.sample_start[index]:self.sample_end[index]].astype(np.float64)
+
+        # x_mulaw: 8-bit mu-law encoded
+        x_mulaw = mu_law_encode(x).astype(np.uint8)
+
+        # cond: features to be used as conditional input ('mfcc' or 'pyspec')
+        f0, timeaxis = pyworld.dio(x, 16000, frame_period=5)   # (T,)
+        f0 = pyworld.stonemask(x, f0, timeaxis, 16000)
+        lf0 = f0.copy()
+        lf0[np.nonzero(f0)] = np.log(f0[np.nonzero(f0)])   # (T,)
+        #vuv = (lf0 != 0).astype(np.uint8)   # (T,)
+
+        if self.cond_sel == 'pyspec':
+            cond = pyworld.cheaptrick(x, f0, timeaxis, 16000)   # (T, d=513)
+            cond = cond / self.pyspec_max
+        elif self.cond_sel == 'mfcc':
+            melspec = librosa.feature.melspectrogram(y=x, sr=16000, power=2.0, n_fft=400, hop_length=80, n_mels=128)
+            cond = librosa.feature.mfcc(S=librosa.power_to_db(melspec), sr=16000, n_mfcc=25)   # (d=25, T)
+            cond = np.transpose(cond)   # (T, d)
+            cond = scale(cond, self.mfcc_mean, self.mfcc_std)
+
+
+        # Stack lf0 and cond --> (T, 1+d)
+        cond = np.hstack((lf0[:, None], cond))
+
+        # cond: transpose to (d, T), then resample from the 200 Hz frame rate to 16 kHz
+        cond = librosa.core.resample(np.transpose(cond), 200, 16000, res_type='kaiser_fast', fix=True, scale=False)
+
+        # Resize (pad or trim) to match the sample length
+        cond = librosa.util.fix_length(cond, len(x_mulaw), mode='edge')   # (d, T)
+
+        return index, torch.LongTensor(x_mulaw), cond.astype(np.float32)
+
+
+    def __len__(self):
+        return len(self.sample_start)   # return the number of examples that we have
+
+
+#%%
\ No newline at end of file
diff --git a/util/loading_dataset.py b/util/loading_dataset.py
index 6629499..f038d64 100644
--- a/util/loading_dataset.py
+++ b/util/loading_dataset.py
@@ -8,40 +8,108 @@
 """
 import torch
 from torch.utils.data.dataset import Dataset
-from torch import from_numpy
+#from torch import from_numpy
 import numpy as np
 import pandas as pd
-from sklearn import preprocessing
-from sklearn.preprocessing import StandardScaler
-from sklearn.externals import joblib
+#from sklearn import preprocessing
+#from sklearn.preprocessing import StandardScaler
+#from sklearn.externals import joblib
 import glob
+from nnmnkwii.preprocessing import minmax_scale, scale
+
+
+
+DIM_INDEX = dict()
+DIM_INDEX['linguistic'] = np.arange(0, 420)    # source: /linguistic
+DIM_INDEX['f0']         = [0]                  # source: /pyworld
+DIM_INDEX['log-f0']     = [1]                  # source: /pyworld
+DIM_INDEX['vuv']        = [2]                  # source: /pyworld
+DIM_INDEX['bap']        = [3]                  # source: /pyworld
+DIM_INDEX['melcep']     = np.arange(4, 64)     # source: /pyworld
+DIM_INDEX['pyspec']     = np.arange(64, 577)   # source: /pyworld
+DIM_INDEX['melspec']    = np.arange(0, 128)    # source: /melmfcc
+DIM_INDEX['mfcc']       = np.arange(128, 153)  # source: /melmfcc
+
 
 class CmuArcticDataset(Dataset):
-    def __init__(self, x_root_dir=None, cond_root_dir=None, scale_factor_path=None, transform=None):
-        x_root_dir = 'data/processed_slt_arctic/TRAIN/mulaw/'
-        #x_root_dir = 'data/processed_slt_arctic/TEST/'
-        cond_root_dir = 'data/processed_slt_arctic/TRAIN/melmfcc/'
-        #cond_root_dir = 'data/processed_slt_arctic/TEST/melmfcc/'
-        scale_factor_path = 'data/processed_slt_arctic/scale_factors.npy'
-        self.mulaw_filepaths = sorted(glob.glob(x_root_dir + '*.npy'))
-        self.cond_filepaths = sorted(glob.glob(cond_root_dir + '*.npy'))
-        self.file_ids = [path.split('/')[-1][:-4] for path in mulaw_filepaths]
+    def __init__(self, data_root_dir=None, random_zpad=False, cond_feature_select=None, transform=None):
+        #data_root_dir = 'data/processed_slt_arctic/TRAIN/'
+        #data_root_dir = 'data/processed_slt_arctic/TEST/'
+
+        self.mulaw_filepaths = sorted(glob.glob(data_root_dir + 'mulaw/*.npy'))
+        self.linguistic_filepaths = sorted(glob.glob(data_root_dir + 'linguistic/*.npy'))
+        self.melmfcc_filepaths = sorted(glob.glob(data_root_dir + 'melmfcc/*.npy'))
+        self.pyworld_filepaths = sorted(glob.glob(data_root_dir + 'pyworld/*.npy'))
+        self.file_ids = [path.split('/')[-1][:-4] for path in self.mulaw_filepaths]
+        self.random_zpad = random_zpad
+        self.cond_feature_select = cond_feature_select  # ['linguistic', 'f0', 'log-f0', 'vuv', 'bap', 'melcep', 'pyspec', 'melspec', 'mfcc']
         self.transform = transform
+        self.scale_factor = np.load(data_root_dir + '../scale_factors.npy').item()
+
+        # Construct conditional feature selection info
+        global DIM_INDEX
+        self.cond_info = dict()
+        self.cond_dim = 0   # total dimension of condition features
+        for sel in self.cond_feature_select:
+            self.cond_info[sel] = np.arange(self.cond_dim, self.cond_dim + len(DIM_INDEX[sel]))
+            self.cond_dim += len(DIM_INDEX[sel])
+
 
     def __getitem__(self, index):
         # Get 3 items: (file_id, mulaw, cond)
         file_id = self.file_ids[index]
-        mulaw_filepath = self.mulaw_filepaths[index]
-        cond_filepath = self.cond_filepaths[index]
-        x = np.load(mulaw_filepath) # size(x) = (T,)
-        cond = np.transpose(np.load(cond_filepath)) # size(cond) = (T,d) --> (d, T)
+        x = np.load(self.mulaw_filepaths[index])   # size(x) = (T,)
+        cond = np.empty((len(x), 0), np.float16)   # size(cond) = (T,d)
+
+
+        cond_linguistic, cond_pyworld, cond_melmfcc = [], [], []
+        if any(sel in self.cond_feature_select for sel in ['linguistic']):
+            cond_linguistic = np.load(self.linguistic_filepaths[index])
+        if any(sel in self.cond_feature_select for sel in ['f0', 'log-f0', 'vuv', 'bap', 'melcep', 'pyspec']):
+            cond_pyworld = np.load(self.pyworld_filepaths[index])
+        if any(sel in self.cond_feature_select for sel in ['melspec', 'mfcc']):
+            cond_melmfcc = np.load(self.melmfcc_filepaths[index])
+
+        global DIM_INDEX
+        for sel in self.cond_feature_select:
+            if sel == 'linguistic':
+                cond = np.hstack((cond, cond_linguistic))
+            elif sel in ['f0', 'log-f0', 'vuv', 'bap', 'melcep', 'pyspec']:
+                cond = np.hstack((cond, cond_pyworld[:, DIM_INDEX[sel]]))
+            elif sel in ['melspec', 'mfcc']:
+                cond = np.hstack((cond, cond_melmfcc[:, DIM_INDEX[sel]]))
+
+        assert(cond.shape[1] == self.cond_dim)   # sanity check: stacked cond feature size
 
-        return file_id, x, cond
+        # Feature scaling
+        cond = self.featScaler(cond)
+
+        # Transpose
+        cond = np.transpose(cond)   # size(cond) = (T,d) --> (d,T): required for pytorch dataloading
+
+        # Random zero-padding of the first 20~50% of samples
+        if self.random_zpad:
+            zpad_sz = int(len(x) * np.random.uniform(0.2, 0.5))
+            x[0:zpad_sz] = 128   # fill first samples with zeros (in mu-law encoding, 128)
+            cond[:, 0:zpad_sz] = 0.
+
+        return file_id, torch.LongTensor(x), cond
+
+
+
+    def featScaler(self, feat):
+
+        for sel in self.cond_feature_select:
+            if sel == 'linguistic':
+                feat[:, self.cond_info[sel]] = minmax_scale(feat[:, self.cond_info[sel]],
+                    self.scale_factor['linguistic_min'], self.scale_factor['linguistic_max'], feature_range=(0.01, 0.99))
+
+        return feat
+
 
     def __len__(self):
-        return len(self.mulaw_filepaths) # return the number of examples that we have
+        return len(self.file_ids)   # return the number of examples that we have
diff --git a/util/preprocessing_cmu_arctic.py b/util/preprocessing_cmu_arctic.py
index 9a42fbc..cc04fdd 100644
--- a/util/preprocessing_cmu_arctic.py
+++ b/util/preprocessing_cmu_arctic.py
@@ -305,9 +305,11 @@ def collect_features(self, wav_path):
 X_linguistic = FileSourceDataset(LinguisticSource(data_root=DATA_ROOT, question_path=QUESTION_PATH))
 X_pyworld = FileSourceDataset(PyworldSource(data_root=DATA_ROOT))
 X_melmfcc = FileSourceDataset(MelspecMfccSource(data_root=DATA_ROOT))
-
+print('X_linguistic with dimension = {}'.format(X_linguistic[0].shape[1]))
+print('X_pyworld with dimension = {}'.format(X_pyworld[0].shape[1]))
+print('X_melmfcc with dimension = {}'.format(X_melmfcc[0].shape[1]))
 # Calculate Scale factors:
-'''
+
 print('Calculating scale factors: This process will take longer than 10 minutes...')
 #wav_len = [len(y) for y in Y]
 #y_min, y_max = minmax(Y, wav_len)
@@ -324,9 +326,8 @@ def collect_features(self, wav_path):
 scale_factors['melmfcc_mean'], scale_factors['melmfcc_var'] = meanvar(X_melmfcc, scale_factors['pyworld_len'])
 scale_factors['melmfcc_std'] = np.sqrt(scale_factors['melmfcc_var'])
 scale_factors['melmfcc_min'], scale_factors['melmfcc_max'] = minmax(X_melmfcc, scale_factors['pyworld_len'])
-
 np.save(DST_ROOT + 'scale_factors.npy', scale_factors)
-'''
+
 ''' To load scale_factors: scale_factors = np.load(DST_ROOT + 'scale_factors.npy').item() '''
 scale_factors = np.load(DST_ROOT + 'scale_factors.npy').item()
@@ -363,8 +364,8 @@ def collect_features(self, wav_path):
 
 # Reduce unlabeled index
 x_linguistic = x_linguistic[:sil_sample_idx.max()+1]
-x_pyworld = x_linguistic[:sil_sample_idx.max()+1]
-x_melmfcc = x_linguistic[:sil_sample_idx.max()+1]
+x_pyworld = x_pyworld[:sil_sample_idx.max()+1]
+x_melmfcc = x_melmfcc[:sil_sample_idx.max()+1]
 
 # Apply 0 to silence samples
 y_mulaw[sil_sample_idx] = 128
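
Note: util/utils.py (mu_law_encode / mu_law_decode) is used by the loaders above but is not part of this patch. For reference, a minimal sketch of standard 8-bit mu-law companding consistent with how the code uses it (a zero sample maps to code 128, giving the 256 softmax classes); the actual implementation in util/utils.py may differ:

import numpy as np

def mu_law_encode(x, mu=255):
    # waveform in [-1, 1] --> integer codes in [0, 255]; silence (0.0) maps to 128
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    return np.floor((y + 1) / 2 * mu + 0.5).astype(np.int64)

def mu_law_decode(c, mu=255):
    # integer codes in [0, 255] --> waveform in [-1, 1]
    y = 2 * (np.asarray(c, dtype=np.float64) / mu) - 1
    return np.sign(y) * ((1 + mu) ** np.abs(y) - 1) / mu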
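The live loader's alignment trick is worth spelling out: pyworld.dio with frame_period=5 ms and the melspectrogram with hop_length=80 at 16 kHz both produce 200 frames per second, and librosa.core.resample then upsamples the (d, T) feature matrix from 200 Hz to 16 kHz so that every audio sample receives one condition vector. A small shape check of that step, using toy values (26 = lf0 + 25 MFCCs) and relying, as the loader does, on librosa's resampy backend accepting multi-channel input along the last axis:

import numpy as np
import librosa

sr, fps = 16000, 200                 # sample rate; frame rate (5 ms frames)
cond = np.random.randn(26, 200)      # 1 second of features: (d=26, T=200 frames)
cond_up = librosa.core.resample(cond, fps, sr, res_type='kaiser_fast')
cond_up = librosa.util.fix_length(cond_up, sr, mode='edge')  # pad/trim to 16000
print(cond_up.shape)                 # (26, 16000): one condition vector per sample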