Rayhane-mamah · alex73 · Nov 29, 2018
diff --git a/datasets/preprocessor.py b/datasets/preprocessor.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 from datasets import audio
+from tacotron.utils.text import get_unused_symbols
 from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize
 
 
@@ -29,16 +30,20 @@ def build_from_path(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12
 	executor = ProcessPoolExecutor(max_workers=n_jobs)
 	futures = []
 	index = 1
+	unusedSymbols = set()
 	for input_dir in input_dirs:
 		with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f:
 			for line in f:
 				parts = line.strip().split('|')
 				basename = parts[0]
 				wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(basename))
 				text = parts[2]
+				unusedSymbols |= get_unused_symbols(text)
 				futures.append(executor.submit(partial(_process_utterance, mel_dir, linear_dir, wav_dir, basename, wav_path, text, hparams)))
 				index += 1
 
+	print('Unused symbols in input data: {}'.format(str(unusedSymbols)))
+
 	return [future.result() for future in tqdm(futures) if future.result() is not None]
 
 

diff --git a/tacotron/feeder.py b/tacotron/feeder.py
@@ -7,7 +7,7 @@
 import tensorflow as tf
 from infolog import log
 from sklearn.model_selection import train_test_split
-from tacotron.utils.text import text_to_sequence
+from tacotron.utils.text import text_to_sequence, get_unused_symbols
 
 _batches_per_group = 64
 
@@ -33,6 +33,12 @@ def __init__(self, coordinator, metadata_filename, hparams):
 			hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600)
 			log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours))
 
+		# Check unused chars
+		unusedSymbols = set()
+		for m in self._metadata:
+			unusedSymbols |= get_unused_symbols(m[5])
+		log('Unused symbols in training data: {}'.format(str(unusedSymbols)))
+
 		#Train test split
 		if hparams.tacotron_test_size is None:
 			assert hparams.tacotron_test_batches is not None

diff --git a/tacotron/utils/text.py b/tacotron/utils/text.py
@@ -53,6 +53,13 @@ def sequence_to_text(sequence):
       result += s
   return result.replace('}{', ' ')
 
+def get_unused_symbols(text):
+  '''Returns the set of characters in `text` for which
+  `_should_keep_symbol` is false (duplicates collapse into one entry).
+  '''
+  # Each element of `text` is tested individually, one character at a time.
+  return {ch for ch in text if not _should_keep_symbol(ch)}
+
 
 def _clean_text(text, cleaner_names):
   for name in cleaner_names: