Commit

live loader added
mimbres committed May 25, 2018
1 parent f9c0783 commit 0e0ed5b
Showing 9 changed files with 477 additions and 31 deletions.
6 changes: 3 additions & 3 deletions FFTNet_dilconv.py
@@ -40,7 +40,7 @@ def fftnet_residual(input_dim=256, num_layer=11, io_ch=256, skip_ch=256):
 '''
 class FFT_Block(nn.Module):
 
-    def __init__(self, cond_dim=25, io_ch=int, recep_sz=int, bias=True):
+    def __init__(self, cond_dim=26, io_ch=int, recep_sz=int, bias=True):
 
         super(FFT_Block, self).__init__()
         self.cond_dim=cond_dim  # Number of dimensions of condition input
@@ -72,11 +72,11 @@ def forward(self, x, cond):
 - [11 FFT_blocks] --> [FC_layer] --> [softmax]
 '''
 class FFTNet(nn.Module):
-    def __init__(self, input_dim=256, cond_dim=25, num_layer=11, io_ch=256, skip_ch=0, bias=True):
+    def __init__(self, input_dim=256, cond_dim=26, num_layer=11, io_ch=256, skip_ch=0, bias=True):
 
         super(FFTNet, self).__init__()
         self.input_dim = input_dim  # 256 (=num_classes)
-        self.cond_dim = cond_dim  # 25
+        self.cond_dim = cond_dim  # 26
         self.num_layer = num_layer  # 11
         self.io_ch = io_ch  # 256 ch. in the paper
         self.skip_ch = skip_ch  # Not implemented yet (no skip channel in the paper)
6 changes: 3 additions & 3 deletions FFTNet_split.py
@@ -38,7 +38,7 @@ def fftnet_residual(input_dim=256, num_layer=11, io_ch=256, skip_ch=256):
 '''
 class FFT_Block(nn.Module):
 
-    def __init__(self, is_first_block=True, initial_input_dim=256, cond_dim=25, io_ch=int, recep_sz=int, bias=True):
+    def __init__(self, is_first_block=True, initial_input_dim=256, cond_dim=26, io_ch=int, recep_sz=int, bias=True):
 
         super(FFT_Block, self).__init__()
         self.is_first_block = is_first_block  # If True, an input_embedding_layer will be created (this is not clear in the paper).
@@ -111,11 +111,11 @@ def forward(self, x, cond):
 - [11 FFT_blocks] --> [FC_layer] --> [softmax]
 '''
 class FFTNet(nn.Module):
-    def __init__(self, input_dim=256, cond_dim=25, num_layer=11, io_ch=256, skip_ch=0, bias=True):
+    def __init__(self, input_dim=256, cond_dim=26, num_layer=11, io_ch=256, skip_ch=0, bias=True):
 
         super(FFTNet, self).__init__()
         self.input_dim = input_dim  # 256 (=num_classes)
-        self.cond_dim = cond_dim  # 25
+        self.cond_dim = cond_dim  # 26
         self.num_layer = num_layer  # 11
         self.io_ch = io_ch
         self.skip_ch = skip_ch
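In both files the default cond_dim moves from 25 to 26, matching the conditioning input described in train_with_liveloader.py below (a 1-dim F0 track stacked on 25-dim MCC). A minimal sketch of how such a 26-dim condition matrix could be assembled (stack_condition is a hypothetical helper for illustration; the repo's live loader builds this internally):

import numpy as np

def stack_condition(f0, mcc):
    '''f0: (L,) F0 track; mcc: (25, L) MCC features -> (26, L) condition input.'''
    return np.concatenate([f0[np.newaxis, :], mcc], axis=0)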
Binary file modified __pycache__/FFTNet_dilconv.cpython-36.pyc
Binary file not shown.
Binary file modified data/processed_slt_arctic/scale_factors.npy
Binary file not shown.
232 changes: 232 additions & 0 deletions train_with_liveloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 2 21:11:57 2018
@author: sungkyun
FFTNet: A REAL-TIME SPEAKER-DEPENDENT NEURAL VOCODER (Zeyu Jin et al., 2018, ICASSP)
*CMU Arctic dataset:
- 1032 utterances for train
- 100 utterances for test
- cond. input = F0 and 25-dim MCC
*Preprocessing:
- 16 kHz mono audio
- 8-bit mu-law encoded wav
*Architecture:
- receptive field = 2048 (2^11)
- 11 FFT-layers (256 ch.)
- Total 1M parameters
- final 2 layers = FC() -> softmax(256) : see details in Sec. 2.3.2 of the paper
*Training:
- minibatch of 5 x 5000 samples
- 100,000 steps = 500 iters
*Requirements:
pip install soundfile
pip install librosa
"""
import os
import argparse
import numpy as np
import torch
#import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
#from util.loading_dataset import YesNoDataset # required for loading YesNo dataset
from util.live_loading_dataset import CmuArcticLiveDataset
from util.utils import mu_law_decode # required for auditioning generated samples
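
# For reference, a minimal sketch of the 8-bit mu-law codec assumed by the notes
# above (mu = 255, i.e. 256 classes; silence maps to code 128). The script uses
# the repo's util.utils.mu_law_decode; this pair is illustrative only and is not
# called below.
def mu_law_encode_sketch(x, mu=255):
    '''Float audio in [-1, 1] -> integer classes in [0, mu].'''
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)    # compress to [-1, 1]
    return np.floor((y + 1.) / 2. * mu + 0.5).astype(np.int64)  # quantize to 0..255

def mu_law_decode_sketch(c, mu=255):
    '''Integer classes in [0, mu] -> float audio in [-1, 1].'''
    y = 2. * np.asarray(c, dtype=np.float64) / mu - 1.          # de-quantize to [-1, 1]
    return np.sign(y) * ((1. + mu) ** np.abs(y) - 1.) / mu      # expand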


# Parsing arguments
parser = argparse.ArgumentParser(description='FFTNet implementation')
parser.add_argument('-exp', '--exp_name', type=str, default='00', metavar='STR',
                    help='Generated samples will be located in the checkpoints/exp<exp_name> directory. Default="00"')
parser.add_argument('-btr', '--batch_train', type=int, default=1, metavar='N',
                    help='Batch size for training. e.g. -btr 5')
parser.add_argument('-bts', '--batch_test', type=int, default=1, metavar='N',
                    help='Batch size for test. e.g. -bts 5')
parser.add_argument('-load', '--load', type=str, default=None, metavar='STR',
                    help='e.g. --load checkpoints/exp00/checkpoint_00')
args = parser.parse_args()

USE_GPU = torch.cuda.is_available()
RAND_SEED = 0

#%% Loading data
dset_train = CmuArcticLiveDataset(train_flag=True, cond_sel='mfcc', cache_size=1000)  # wav files are cached in memory.
dset_test = CmuArcticLiveDataset(train_flag=False, cond_sel='mfcc', cache_size=1)


train_loader = DataLoader(dset_train,
                          batch_size=args.batch_train,
                          shuffle=True,
                          num_workers=6,
                          pin_memory=True
                          )  # num_workers = number of CPU worker processes; a common heuristic is 4 * num_GPUs.

test_loader = DataLoader(dset_test,
                         batch_size=args.batch_test,
                         shuffle=False,
                         num_workers=8,
                         pin_memory=True,
                         )



#%% Train & Test Functions
def train(epoch):
    model.train()
    train_loss = 0.
    train_acc = []
    total_data_sz = len(train_loader.dataset)

    for batch_idx, (_, X_mulaw, X_mfcc) in enumerate(train_loader):
        if USE_GPU:
            #X_mulaw, X_mfcc = X_mulaw.cuda(), X_mfcc.cuda()
            X_mulaw, X_mfcc = Variable(X_mulaw.cuda()), Variable(X_mfcc.cuda().float())
            #X_mulaw, X_mfcc = Variable(X_mulaw.long()), Variable(X_mfcc.half())  # only for fp16-capable GPUs!
        else:
            X_mulaw, X_mfcc = Variable(X_mulaw), Variable(X_mfcc.float())

        optimizer.zero_grad()
        y = model(X_mulaw, X_mfcc)
        loss = F.cross_entropy(input=y.view(-1, 256), target=X_mulaw.view(-1), size_average=True)
        # input=y.view(-1, num_classes), where num_classes=256 for 8-bit mu-law
        # target=X_mulaw.view(-1)
        loss.backward()
        optimizer.step()

        # Accuracy
        pred = y.view(-1, 256).data.max(1, keepdim=True)[1]  # Get the index of the max log-probability from y
        acc = pred.eq(X_mulaw.view(-1).data.view_as(pred)).cpu().sum().numpy() / len(pred)  # Compare pred with X_mulaw
        print('Train Epoch: {} [{}/{}], Loss = {:.6f}, Acc = {:.6f}'.format(
            epoch, batch_idx * train_loader.batch_size, total_data_sz, loss.item(), acc))

        train_loss += loss.item()
        train_acc.append(acc)

    return train_loss, np.mean(train_acc)




def generator(test_file_id, out_filename, recep_sz=2048, verbose=1000):
    '''
    Annotation:
    - X_mfcc        : Original condition input.
    - total_samples : The number of samples to generate (= size(X_mfcc)).
    - x_mulaw_slice : A temporary input replacing X_mulaw. At the start, all zeros (encoded as 128s).
    - x_mfcc_slice  : A temporary condition input.
    - y             : One-sample output of the model, fed into the right-most column of x_mulaw_slice.
    - pred          : Prediction of one newly generated sample.
    - out           : Collection of mu_law_decode(pred)s.
    '''
    _, _, X_mfcc = test_loader.dataset[test_file_id]     # X_mfcc: (CxL) np.ndarray
    feat_dim = X_mfcc.shape[0]
    total_samples = X_mfcc.shape[1]
    X_mfcc = X_mfcc.reshape(1, feat_dim, total_samples)  # BxCxL

    # Initial input slices, filled with mu-law zeros (code 128).
    x_mulaw_slice = Variable(torch.LongTensor(1, recep_sz) * 0 + 128, requires_grad=False).cuda()
    x_mfcc_slice = Variable(torch.FloatTensor(1, feat_dim, recep_sz) * 0., requires_grad=False).cuda()
    out = []

    model.eval()  # Evaluation mode (note: .eval() alone does not disable gradient tracking).

    for i in range(total_samples):
        # New x_mfcc_slice: shift left, then fill one 'cond' sample into the right-most column
        x_mfcc_slice[:, :, 0:-1] = x_mfcc_slice[:, :, 1:]
        x_mfcc_slice[:, :, -1] = torch.FloatTensor(X_mfcc[:, :, i])

        y = model(x_mulaw_slice, x_mfcc_slice, gen_mod=True)  # 1x1x256 (BxLxC)
        pred = y.view(-1, 256).data.max(1, keepdim=True)[1]   # Predict: get the index of the max log-probability from y

        # New x_mulaw_slice: shift left, then fill 'pred' into the right-most column
        x_mulaw_slice[0, 0:-1] = x_mulaw_slice[0, 1:]  # Shift left
        x_mulaw_slice[0, -1] = pred                    # Push 'pred'

        # Collect the generated sample
        out.append(float(mu_law_decode(pred.cpu().numpy())))

        # Print progress
        if i % verbose == 0:
            print('Generator: {}/{} samples ({:.2f}%)'.format(i, total_samples,
                                                              100 * i / total_samples))

    # Save audio
    import librosa
    librosa.output.write_wav(out_filename, np.asarray(out), sr=16000)



def validate(epoch):
    model.eval()
    val_loss = 0.
    # Not implemented yet
    return val_loss


def load_checkpoint(filepath):
    '''
    Load a pre-trained model.
    '''
    dt = torch.load(filepath)
    model.load_state_dict(dt['state_dict'])
    optimizer.load_state_dict(dt['optimizer'])
    return 0

def save_checkpoint(state, accuracy, exp_name):
    checkpoint_dir = 'checkpoints/' + exp_name
    os.makedirs(checkpoint_dir, exist_ok=True)
    filepath = checkpoint_dir + '/checkpoint.pth.tar'
    torch.save(state, filepath)

def history_recorder():
    '''
    history_recorder():
    - save training history as .csv files.
    - save learning curves as .png files.
    '''
    return 0

def print_model_sz(model):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    print('Number of trainable parameters = {}'.format(sum([np.prod(p.size()) for p in model_parameters])))

#%% Experiment: train
from FFTNet_dilconv import FFTNet   # <-- implemented using 2x1 dilated conv.
#from FFTNet_split import FFTNet   # <-- same as in the paper

model = FFTNet(cond_dim=26).cuda() if USE_GPU else FFTNet(cond_dim=26).cpu()  # cond_dim = 26: F0 + 25-dim MCC
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-5)
print_model_sz(model)


#load_checkpoint('checkpoints/00/checkpoint.pth.tar')


for epoch in range(100):
    torch.manual_seed(RAND_SEED + epoch)
    tr_loss, tr_acc = train(epoch)


    save_checkpoint({'epoch': epoch,
                     'state_dict': model.state_dict(),
                     'optimizer': optimizer.state_dict(),},
                    tr_acc, args.exp_name)

#%% Experiment: generation
test_file_id = 0 # Select 0~5 for different condition input
generator(test_file_id=test_file_id, out_filename='aaa.wav')






Binary file not shown.