Commit

generation text2mel
mimbres committed Jun 6, 2018
1 parent 79d1be5 commit 15e9416
Showing 20 changed files with 437 additions and 305 deletions.
Binary file modified __pycache__/live_dataloader.cpython-36.pyc
Binary file not shown.
12 changes: 12 additions & 0 deletions checkpoints/00/config.json
@@ -0,0 +1,12 @@
{
"exp_name": "00",
"max_epoch": 1000,
"batch_train": 16,
"batch_test": 1,
"load": null,
"save_interval": 50,
"sel_display": 9,
"batch_norm": false,
"slience_state_guide": false,
"generate": null
}
12 changes: 12 additions & 0 deletions checkpoints/01_right_zpad/config.json
@@ -0,0 +1,12 @@
{
"exp_name": "01_right_zpad",
"max_epoch": 1000,
"batch_train": 32,
"batch_test": 5,
"load": null,
"save_interval": 50,
"sel_display": 9,
"batch_norm": false,
"silence_state_guide": -1,
"generate": null
}
12 changes: 12 additions & 0 deletions checkpoints/02_right_zpad_sil_state/config.json
@@ -0,0 +1,12 @@
{
"exp_name": "02_right_zpad_sil_state",
"max_epoch": 1000,
"batch_train": 32,
"batch_test": 5,
"load": null,
"save_interval": 50,
"sel_display": 9,
"batch_norm": false,
"silence_state_guide": 0.5,
"generate": null
}
(Three additional changed files could not be displayed by the diff viewer.)
12 changes: 12 additions & 0 deletions config_template.json
@@ -0,0 +1,12 @@
{
"exp_name": "00",
"max_epoch": 1000,
"batch_train": 32,
"batch_test": 5,
"load": null,
"save_interval": 50,
"sel_display": 9,
"batch_norm": false,
"silence_state_guide": -1,
"generate": null
}
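
For illustration, a minimal sketch of how a template like this could be read and merged with command-line overrides; the load_config helper and the flags below are assumptions for this sketch, not code from this repository:

import argparse
import json

def load_config(path='config_template.json'):
    # Read the JSON template, then let a few fields be overridden from the command line.
    with open(path) as f:
        cfg = json.load(f)
    parser = argparse.ArgumentParser(description='Text2Mel experiment config (sketch)')
    parser.add_argument('--exp_name', default=cfg['exp_name'])
    parser.add_argument('--max_epoch', type=int, default=cfg['max_epoch'])
    parser.add_argument('--batch_train', type=int, default=cfg['batch_train'])
    parser.add_argument('--silence_state_guide', type=float, default=cfg['silence_state_guide'])
    cfg.update(vars(parser.parse_args()))
    return cfg
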
12 changes: 7 additions & 5 deletions live_dataloader.py
@@ -22,9 +22,9 @@
OMIT_DATA_ROWS =[945, 1119, 1149, 1371, 1853, 1929, 1939, 2965, 3405, 4246, 4670,
4684, 4699, 4706, 4713, 5163, 5164, 5165, 5166, 5167, 5169, 5172,
5174, 5182, 5185, 5532, 5935, 5964, 5974, 6039] # Exclude sentences containing foreign characters.
MAX_LEN_TEXT = 250+1 # Max-lengths are required for z-padding
MAX_LEN_MELSPEC = 217+1
MAX_LEN_SPEC = 868+4
MAX_LEN_TEXT = 250+2 # Max-lengths are required for z-padding
MAX_LEN_MELSPEC = 217+2
MAX_LEN_SPEC = 868+8
#MAX_LEN_PAIRED_TEXT = 360
#MAX_LEN_PAIRED_MELSPEC = 291# Max-lengths are required for z-padding
#MAX_LEN_PAIRED_SPEC = 1164
@@ -190,12 +190,14 @@ def zeropad(self, x, target_length):
if len(x.shape) == 1:
# 1D input:
n_zeros = target_length - len(x)
x = np.pad(x, (n_zeros,0), 'constant', constant_values=(0,0))
#x = np.pad(x, (n_zeros,0), 'constant', constant_values=(0,0))
x = np.pad(x, (1,n_zeros-1), 'constant', constant_values=(0,0)) # one zero on the left, the remaining (n_zeros-1) zeros on the right
else:
# 2D input: D x T
n_zeros = target_length - x.shape[1]
xz = np.zeros((x.shape[0], x.shape[1]+n_zeros))
xz[:, n_zeros:] = x
#xz[:, n_zeros:] = x
xz[:, 1:(xz.shape[1] - n_zeros + 1)] = x # place x at offset 1: one zero on the left, the remaining zeros on the right
x = xz
return x, n_zeros

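For illustration, a small standalone check of the padding scheme introduced above (one zero on the left, the remaining zeros on the right); the zeropad_1d helper is hypothetical and only mirrors the 1-D branch of zeropad:

import numpy as np

def zeropad_1d(x, target_length):
    # One zero on the left, the remaining (n_zeros - 1) zeros on the right.
    n_zeros = target_length - len(x)
    return np.pad(x, (1, n_zeros - 1), 'constant', constant_values=(0, 0)), n_zeros

x, n = zeropad_1d(np.array([7, 8, 9]), 6)
print(x)  # [0 7 8 9 0 0]
print(n)  # 3
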
217 changes: 217 additions & 0 deletions model/FastTacotron.bak
@@ -0,0 +1,217 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 31 06:54:33 2018
@author: sungkyun
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
#from util.AttentionGuide import AttentionGuideGen


# Highway Layer & Block:
class HighwayLayer(nn.Module):

def __init__(self, in_ch=int, out_ch=int, k_sz=3, dil=int, causality=False):
super(HighwayLayer, self).__init__()

self.out_ch = out_ch
self.k_sz = k_sz
self.dil = dil
self.causality = causality

self.L = nn.Conv1d(in_ch, out_ch*2, kernel_size=k_sz, dilation=dil)
return None

def forward(self, x):

if self.k_sz != 1:
if self.causality:
pad = (self.dil*2, 0) # left-padding
else:
pad = (self.dil, self.dil) # padding to both sides
else:
pad = (0, 0) # in this case, just 1x1 conv..

h = self.L(F.pad(x, pad))
h1, h2 = h[:, :self.out_ch,:], h[:, self.out_ch:, :]
return F.sigmoid(h1) * h2 + (1-F.sigmoid(h1)) * x



class HighwayBlock(nn.Module):

def __init__(self, io_chs=list, k_szs=list, dils=list, causality=False):
super(HighwayBlock, self).__init__()

#assert(len(io_chs)==len(k_szs) & len(io_chs)==len(dils))
self.causality = causality
self.hlayers = nn.Sequential()
self.construct_hlayers(io_chs, k_szs, dils)
return None

def construct_hlayers(self, io_chs, k_szs, dils):

total_layers = len(io_chs) # = len(k_szs)

for l in range(total_layers):
self.hlayers.add_module(str(l),
HighwayLayer(in_ch=io_chs[l],
out_ch=io_chs[l],
k_sz=k_szs[l],
dil=dils[l],
causality=self.causality
))
return

def forward(self, x):
return self.hlayers(x)



# Text Encoder:
class TextEnc(nn.Module):
def __init__(self, input_dim=31, e_dim=128, d_dim=256,
h_io_chs=[512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512],
h_k_szs=[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1],
h_dils=[1, 3, 9, 27, 1, 3, 9, 27, 1, 1, 1, 1]):
super(TextEnc, self).__init__()
self.d_dim = d_dim

# Layers:
self.embed_layer = nn.Embedding(input_dim, e_dim)
self.conv1x1_0 = nn.Conv1d(e_dim, 2*d_dim, kernel_size=1)
self.conv1x1_1 = nn.Conv1d(2*d_dim, 2*d_dim, kernel_size=1)
self.h_block = HighwayBlock(h_io_chs, h_k_szs, h_dils, causality=False)
return None

def forward(self, x):
x = self.embed_layer(x).permute(0,2,1) # BxT -> BxCxT with C=e
x = F.relu(self.conv1x1_0(x))
x = self.conv1x1_1(x) # BxCxT = Bx512xT
x = self.h_block(x) # BxCxT with C=2d
return x[:, :self.d_dim, :], x[:, self.d_dim:, :] # Split C={d,d}: first half is K, second half is V.



# Audio Encoder:
class AudioEnc(nn.Module):
def __init__(self, input_dim=80, d_dim=256,
h_io_chs=[256, 256, 256, 256, 256, 256, 256, 256, 256, 256],
h_k_szs=[3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
h_dils=[1, 3, 9, 27, 1, 3, 9, 27, 3, 3]):
super(AudioEnc, self).__init__()

# Layers:
self.conv1x1_0 = nn.Conv1d(input_dim, d_dim, kernel_size=1)
self.conv1x1_1 = nn.Conv1d(d_dim, d_dim, kernel_size=1)
self.conv1x1_2 = nn.Conv1d(d_dim, d_dim, kernel_size=1)
self.h_block = HighwayBlock(h_io_chs, h_k_szs, h_dils, causality=True)
return None

def forward(self, x):
x = F.relu(self.conv1x1_0(x))
x = F.relu(self.conv1x1_1(x))
x = F.relu(self.conv1x1_2(x))
return self.h_block(x) # output BxCxT with C=d



# Audio Decoder:
class AudioDec(nn.Module):
def __init__(self, input_dim=512, output_dim=80, d_dim=256,
h_io_chs=[256, 256, 256, 256, 256, 256],
h_k_szs=[3, 3, 3, 3, 3, 3],
h_dils=[1, 3, 9, 27, 1, 1]):
super(AudioDec, self).__init__()

# Layers:
self.conv1x1_0 = nn.Conv1d(input_dim, d_dim, kernel_size=1)
self.h_block = HighwayBlock(h_io_chs, h_k_szs, h_dils, causality=True)
self.conv1x1_1 = nn.Conv1d(d_dim, d_dim, kernel_size=1)
self.conv1x1_2 = nn.Conv1d(d_dim, d_dim, kernel_size=1)
self.conv1x1_3 = nn.Conv1d(d_dim, d_dim, kernel_size=1)
self.conv1x1_4 = nn.Conv1d(d_dim, output_dim, kernel_size=1)
return None

def forward(self, x):
x = self.conv1x1_0(x)
x = self.h_block(x)
x = F.relu(self.conv1x1_1(x))
x = F.relu(self.conv1x1_2(x))
x = F.relu(self.conv1x1_3(x))
return F.sigmoid(self.conv1x1_4(x))



# SSRN:
class SSRN(nn.Module):
def __init__(self, input_dim=80, output_dim=513, c_dim=512):
super(SSRN, self).__init__()

# Layers:
self.ssrn_layers = nn.Sequential()
self.ssrn_layers.add_module('conv1x1_0', nn.Conv1d(input_dim, c_dim, kernel_size=1))
self.ssrn_layers.add_module('h_block_0', HighwayBlock([c_dim, c_dim], [3,3], [1,3])) # By default, causality=False

self.ssrn_layers.add_module('deconv2x1_0', nn.ConvTranspose1d(c_dim, c_dim, kernel_size=2))
self.ssrn_layers.add_module('h_block_1', HighwayBlock([c_dim, c_dim], [3,3], [1,3]))
self.ssrn_layers.add_module('deconv2x1_1', nn.ConvTranspose1d(c_dim, c_dim, kernel_size=2))
self.ssrn_layers.add_module('h_block_2', HighwayBlock([c_dim, c_dim], [3,3], [1,3]))

self.ssrn_layers.add_module('conv1x1_1', nn.Conv1d(c_dim, 2*c_dim, kernel_size=1))
self.ssrn_layers.add_module('h_block_3', HighwayBlock([2*c_dim, 2*c_dim], [3,3], [1,1]))
self.ssrn_layers.add_module('conv1x1_2', nn.ConvTranspose1d(2*c_dim, output_dim, kernel_size=1))

self.ssrn_layers.add_module('conv1x1_3', nn.Conv1d(output_dim, output_dim, kernel_size=1))
self.ssrn_layers.add_module('relu_0', nn.ReLU())
self.ssrn_layers.add_module('conv1x1_4', nn.Conv1d(output_dim, output_dim, kernel_size=1))
self.ssrn_layers.add_module('relu_1', nn.ReLU())

self.ssrn_layers.add_module('conv1x1_5', nn.Conv1d(output_dim, output_dim, kernel_size=1))
return None

def forward(self, x):
return F.sigmoid(self.ssrn_layers(x))



#%% Text2Mel:
class Text2Mel(nn.Module):
def __init__(self, text_dim=31, melspec_dim=80, e_dim=128, d_dim=256):
super(Text2Mel, self).__init__()

self.e_dim = e_dim; self.d_dim = d_dim
self.text_enc = TextEnc(input_dim=text_dim)
self.audio_enc = AudioEnc(input_dim=melspec_dim)
self.audio_dec = AudioDec(input_dim=2*d_dim, output_dim=melspec_dim)

self.optional_output=False

return None

def forward(self, x_text, x_melspec):

# K,V: encoded text. Q: encoded audio
K, V = self.text_enc(x_text) # Key, Value: Bx256xN with N = text length
Q = self.audio_enc(x_melspec) # Query : Bx256xT with T=T_audio

# Attention:
K_T = K.permute(0,2,1) # K, transposed as BxTx256
A = F.softmax(torch.matmul(K_T, Q) / np.sqrt(self.d_dim), dim=1) # softmax along the text-length dimension, resulting in BxNxT

# Attentional seed to audio_decoder, RQ = Att(K,V,Q)
R = torch.matmul(V, A) # Bx256xT with T=T_audio
RQ = torch.cat([R,Q], dim=1) # Bx512xT

# Decoding Mel-spectrogram, Y
Y = self.audio_dec(RQ) # Bx80xT with T=T_audio

if self.optional_output is True:
return Y, A, K, V, Q
else:
return Y, A
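
The HighwayLayer above implements a gated (highway) 1-D convolution: a dilated convolution produces two banks h1 and h2, and the output is sigmoid(h1) * h2 + (1 - sigmoid(h1)) * x. A minimal standalone sketch with assumed shapes (B x C x T) and one layer's defaults:

import torch
import torch.nn as nn
import torch.nn.functional as F

B, C, T = 2, 256, 50
x = torch.randn(B, C, T)

conv = nn.Conv1d(C, 2 * C, kernel_size=3, dilation=3)   # one highway layer's convolution
pad = (2 * 3, 0)                                         # causal: left-pad by (k_sz - 1) * dilation
h = conv(F.pad(x, pad))
h1, h2 = h[:, :C, :], h[:, C:, :]
gate = torch.sigmoid(h1)
y = gate * h2 + (1 - gate) * x                           # highway mix; output stays B x C x T
print(y.shape)                                           # torch.Size([2, 256, 50])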
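
The attention in Text2Mel.forward above is scaled dot-product attention over the text axis. A rough sketch of the shapes involved, with B (batch), d = 256, N (text length), and T (mel frames) assumed for illustration:

import numpy as np
import torch
import torch.nn.functional as F

B, d, N, T = 2, 256, 60, 120
K = torch.randn(B, d, N)     # keys from the text encoder
V = torch.randn(B, d, N)     # values from the text encoder
Q = torch.randn(B, d, T)     # queries from the audio encoder

A = F.softmax(torch.matmul(K.permute(0, 2, 1), Q) / np.sqrt(d), dim=1)  # B x N x T, softmax over N
R = torch.matmul(V, A)                                                  # B x d x T, attended text representation
RQ = torch.cat([R, Q], dim=1)                                           # B x 2d x T, input to AudioDec
print(A.shape, RQ.shape)     # torch.Size([2, 60, 120]) torch.Size([2, 512, 120])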

20 changes: 18 additions & 2 deletions FastTacotron.py → model/FastTacotron.py
@@ -9,7 +9,7 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from utils.AttentionGuide import AttentionGuideGen
#from util.AttentionGuide import AttentionGuideGen


# Highway Layer & Block:
@@ -193,7 +193,7 @@ def __init__(self, text_dim=31, melspec_dim=80, e_dim=128, d_dim=256):

return None

def forward(self, x_text, x_melspec):
def forward(self, x_text, x_melspec, forced_att=None):

# K,V: encoded text. Q: encoded audio
K, V = self.text_enc(x_text) # Key, Value: Bx256xN with N = text length
@@ -203,6 +203,22 @@ def forward(self, x_text, x_melspec):
K_T = K.permute(0,2,1) # K, transposed as BxTx256
A = F.softmax(torch.matmul(K_T, Q) / np.sqrt(self.d_dim), dim=1) # softmax along the text-length dimension, resulting in BxNxT

# Forced attention (for generation stage):
if forced_att is True:
A[0,:,0] = 0. ; A[0,0,0] = 1.
if A.shape[2] > 1:
current_pos = int(torch.argmax(A[0,:,-1]))
prev_pos = int(torch.argmax(A[0,:,-2]))
print(prev_pos, current_pos)
if not (-1 <= (current_pos - prev_pos) <= 3):
print(current_pos-prev_pos)
A[0,:,-1] = 0.
is_same_char = (float(x_text[0, current_pos]) == float(x_text[0, prev_pos]))
if is_same_char:
A[0, prev_pos + 2, -1] = 1.
else:
A[0, prev_pos + 1, -1] = 1.

# Attentional seed to audio_decoder, RQ = Att(K,V,Q)
R = torch.matmul(V, A) # Bx256xT with T=T_audio
RQ = torch.cat([R,Q], dim=1) # Bx512xT
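
The forced-attention branch added above nudges the attention of the newest frame to stay near-monotonic during generation. A hedged standalone sketch of that rule; the force_monotonic name and the numpy interface are assumptions that only mirror the logic in the diff:

import numpy as np

def force_monotonic(A, text_ids, min_jump=-1, max_jump=3):
    # A: attention matrix of shape N x T for one utterance; text_ids: encoded characters, length N.
    A[:, 0] = 0.0
    A[0, 0] = 1.0                                   # the first frame always attends to the first character
    if A.shape[1] > 1:
        current_pos = int(np.argmax(A[:, -1]))
        prev_pos = int(np.argmax(A[:, -2]))
        if not (min_jump <= current_pos - prev_pos <= max_jump):
            A[:, -1] = 0.0
            # Jump two positions when the same character repeats, otherwise advance by one
            # (assumes prev_pos + step stays within the text length).
            step = 2 if text_ids[current_pos] == text_ids[prev_pos] else 1
            A[prev_pos + step, -1] = 1.0
    return A

At generation time this would be applied to the single utterance in the batch (A[0] in the diff) after every newly synthesized frame, before the attended values R are computed.
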
Binary file added model/__pycache__/FastTacotron.cpython-36.pyc
Binary file not shown.
7 changes: 0 additions & 7 deletions run_scripts.sh

This file was deleted.
