-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
text/: adding Keith Ito's text pre-processing
- Loading branch information
Rafael Valle
committed
May 3, 2018
1 parent
09bbec0
commit d04f38c
Showing
6 changed files
with
338 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
Copyright (c) 2017 Keith Ito | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
""" from https://github.com/keithito/tacotron """ | ||
import re | ||
from text import cleaners | ||
from text.symbols import symbols | ||
|
||
|
||
# Bidirectional lookup tables between symbols and their integer IDs.
_symbol_to_id = {symbol: index for index, symbol in enumerate(symbols)}
_id_to_symbol = dict(enumerate(symbols))

# Matches text of the form "before{ARPABET CODES}after" (see text_to_sequence).
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
|
||
|
||
def text_to_sequence(text, cleaner_names):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integers corresponding to the symbols in the text
    '''
    sequence = []

    # Alternate between plain-text segments (cleaned, then mapped symbol by
    # symbol) and {ARPAbet} segments (mapped phoneme by phoneme).
    remaining = text
    while remaining:
        match = _curly_re.match(remaining)
        if match is None:
            sequence += _symbols_to_sequence(_clean_text(remaining, cleaner_names))
            break
        plain, arpabet, remaining = match.groups()
        sequence += _symbols_to_sequence(_clean_text(plain, cleaner_names))
        sequence += _arpabet_to_sequence(arpabet)

    # Every sequence is terminated with the EOS symbol.
    sequence.append(_symbol_to_id['~'])
    return sequence
|
||
|
||
def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string'''
    pieces = []
    for symbol_id in sequence:
        # IDs with no known symbol are silently skipped.
        if symbol_id not in _id_to_symbol:
            continue
        symbol = _id_to_symbol[symbol_id]
        # ARPAbet symbols were stored with a leading '@'; re-wrap them in braces.
        if len(symbol) > 1 and symbol[0] == '@':
            symbol = '{' + symbol[1:] + '}'
        pieces.append(symbol)
    # Adjacent ARPAbet groups are separated by a space rather than '}{'.
    return ''.join(pieces).replace('}{', ' ')
|
||
|
||
def _clean_text(text, cleaner_names):
    """Run *text* through each named cleaner function from the cleaners module, in order.

    Raises:
      Exception: if a name does not correspond to a function in `cleaners`.
    """
    for name in cleaner_names:
        # Use a default of None: a bare getattr() raises AttributeError before
        # the guard below could ever fire, making the intended
        # "Unknown cleaner" error unreachable in the original code.
        cleaner = getattr(cleaners, name, None)
        if cleaner is None:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
|
||
|
||
def _symbols_to_sequence(syms):
    """Map each symbol to its integer ID, dropping padding/EOS and unknown symbols."""
    return [_symbol_to_id[sym] for sym in syms if _should_keep_symbol(sym)]
|
||
|
||
def _arpabet_to_sequence(text):
    """Convert a space-delimited ARPAbet string to IDs via the '@'-prefixed symbols."""
    prefixed = ['@' + code for code in text.split()]
    return _symbols_to_sequence(prefixed)
|
||
|
||
def _should_keep_symbol(s):
    """True for symbols that map to an ID, excluding the padding ('_') and EOS ('~') markers."""
    # Use != (value equality), not `is not` (identity): comparing strings with
    # `is` only works by accident of CPython literal interning and emits a
    # SyntaxWarning on Python 3.8+.
    return s in _symbol_to_id and s != '_' and s != '~'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
""" from https://github.com/keithito/tacotron """ | ||
|
||
''' | ||
Cleaners are transformations that run over the input text at both training and eval time. | ||
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" | ||
hyperparameter. Some cleaners are English-specific. You'll typically want to use: | ||
1. "english_cleaners" for English text | ||
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using | ||
the Unidecode library (https://pypi.python.org/pypi/Unidecode) | ||
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update | ||
the symbols in symbols.py to match your data). | ||
''' | ||
|
||
import re | ||
from unidecode import unidecode | ||
from .numbers import normalize_numbers | ||
|
||
|
||
# Regular expression matching whitespace: | ||
_whitespace_re = re.compile(r'\s+') | ||
|
||
# List of (regular expression, replacement) pairs for abbreviations: | ||
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ | ||
('mrs', 'misess'), | ||
('mr', 'mister'), | ||
('dr', 'doctor'), | ||
('st', 'saint'), | ||
('co', 'company'), | ||
('jr', 'junior'), | ||
('maj', 'major'), | ||
('gen', 'general'), | ||
('drs', 'doctors'), | ||
('rev', 'reverend'), | ||
('lt', 'lieutenant'), | ||
('hon', 'honorable'), | ||
('sgt', 'sergeant'), | ||
('capt', 'captain'), | ||
('esq', 'esquire'), | ||
('ltd', 'limited'), | ||
('col', 'colonel'), | ||
('ft', 'fort'), | ||
]] | ||
|
||
|
||
def expand_abbreviations(text):
    """Replace known abbreviations (e.g. 'Dr.') with their spoken forms."""
    for pattern, spoken in _abbreviations:
        text = pattern.sub(spoken, text)
    return text
|
||
|
||
def expand_numbers(text):
    """Expand digits, currency amounts, and ordinals into words (see numbers.py)."""
    return normalize_numbers(text)
|
||
|
||
def lowercase(text):
    """Return *text* with every cased character lowercased."""
    return text.lower()
|
||
|
||
def collapse_whitespace(text):
    """Collapse each run of whitespace (spaces, tabs, newlines) into a single space."""
    return _whitespace_re.sub(' ', text)
|
||
|
||
def convert_to_ascii(text):
    """Transliterate *text* to plain ASCII using the Unidecode library."""
    return unidecode(text)
|
||
|
||
def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    return collapse_whitespace(lowercase(text))
|
||
|
||
def transliteration_cleaners(text):
    '''Pipeline for non-English text that transliterates to ASCII.'''
    return collapse_whitespace(lowercase(convert_to_ascii(text)))
|
||
|
||
def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    # Order matters: numbers must be expanded before abbreviations so digit
    # runs are gone before the final whitespace collapse.
    for step in (convert_to_ascii, lowercase, expand_numbers,
                 expand_abbreviations, collapse_whitespace):
        text = step(text)
    return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
""" from https://github.com/keithito/tacotron """ | ||
|
||
import re | ||
|
||
|
||
valid_symbols = [ | ||
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', | ||
'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', | ||
'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', | ||
'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', | ||
'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', | ||
'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', | ||
'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' | ||
] | ||
|
||
_valid_symbol_set = set(valid_symbols) | ||
|
||
|
||
class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

    def __init__(self, file_or_path, keep_ambiguous=True):
        # Accept either a filesystem path (opened latin-1, matching the
        # dictionary's encoding) or an already-open file object.
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding='latin-1') as dict_file:
                parsed = _parse_cmudict(dict_file)
        else:
            parsed = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            # Keep only words with exactly one pronunciation.
            parsed = {w: prons for w, prons in parsed.items() if len(prons) == 1}
        self._entries = parsed

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        '''Returns list of ARPAbet pronunciations of the given word.'''
        return self._entries.get(word.upper())
|
||
|
||
|
||
# Matches alternate-pronunciation markers such as "(2)" in "WORD(2)".
_alt_re = re.compile(r'\([0-9]+\)')
|
||
|
||
def _parse_cmudict(file):
    """Parse an open CMUDict file into a dict mapping WORD -> list of pronunciation strings.

    Only lines starting with an uppercase ASCII letter or an apostrophe are
    treated as word entries; comment and punctuation lines are skipped.
    Alternate-entry markers like "WORD(2)" are folded into the base word,
    so ambiguous words accumulate multiple pronunciations.
    """
    cmudict = {}
    for line in file:
        # Keep only real word entries: leading 'A'..'Z' or apostrophe.
        if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
            # NOTE(review): the upstream CMUDict format separates the word from
            # its pronunciation with TWO spaces; confirm this delimiter — a
            # single-space split makes parts[1] only the first phoneme.
            parts = line.split(' ')
            word = re.sub(_alt_re, '', parts[0])
            pronunciation = _get_pronunciation(parts[1])
            # _get_pronunciation returns None when any token is not valid ARPAbet.
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict
|
||
|
||
def _get_pronunciation(s):
    """Return the normalized pronunciation string, or None if any token is not valid ARPAbet."""
    phonemes = s.strip().split(' ')
    if all(phoneme in _valid_symbol_set for phoneme in phonemes):
        return ' '.join(phonemes)
    return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
""" from https://github.com/keithito/tacotron """ | ||
|
||
import inflect | ||
import re | ||
|
||
|
||
# Shared inflect engine used by all the expansion helpers below.
_inflect = inflect.engine()
# Numbers with comma thousands separators, e.g. "15,000,000".
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
# Decimal numbers, e.g. "3.14".
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
# Pound amounts, e.g. "£100" — captures the amount without the sign.
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
# Dollar amounts, e.g. "$2.50" — captures the amount without the sign.
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
# Ordinals such as "1st" or "22nd".
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Any remaining run of digits.
_number_re = re.compile(r'[0-9]+')
|
||
|
||
def _remove_commas(m): | ||
return m.group(1).replace(',', '') | ||
|
||
|
||
def _expand_decimal_point(m): | ||
return m.group(1).replace('.', ' point ') | ||
|
||
|
||
def _expand_dollars(m): | ||
match = m.group(1) | ||
parts = match.split('.') | ||
if len(parts) > 2: | ||
return match + ' dollars' # Unexpected format | ||
dollars = int(parts[0]) if parts[0] else 0 | ||
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 | ||
if dollars and cents: | ||
dollar_unit = 'dollar' if dollars == 1 else 'dollars' | ||
cent_unit = 'cent' if cents == 1 else 'cents' | ||
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) | ||
elif dollars: | ||
dollar_unit = 'dollar' if dollars == 1 else 'dollars' | ||
return '%s %s' % (dollars, dollar_unit) | ||
elif cents: | ||
cent_unit = 'cent' if cents == 1 else 'cents' | ||
return '%s %s' % (cents, cent_unit) | ||
else: | ||
return 'zero dollars' | ||
|
||
|
||
def _expand_ordinal(m):
    """Spell out an ordinal, e.g. '2nd' -> 'second' (inflect parses the suffix)."""
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
|
||
|
||
def _expand_number(m):
    """Spell out an integer, with year-style reading for 1001-2999."""
    num = int(m.group(0))
    # Outside the year range, plain cardinal words (no "and").
    if not (1000 < num < 3000):
        return _inflect.number_to_words(num, andword='')
    if num == 2000:
        return 'two thousand'
    if 2000 < num < 2010:
        # e.g. 2007 -> "two thousand seven".
        return 'two thousand ' + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        # e.g. 1900 -> "nineteen hundred".
        return _inflect.number_to_words(num // 100) + ' hundred'
    # e.g. 1984 -> "nineteen eighty-four", 2015 -> "twenty fifteen".
    return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
|
||
|
||
def normalize_numbers(text):
    """Expand commas, currency, decimals, ordinals, and plain digits into words."""
    # Substitution order matters: currency and decimals must be handled
    # before the catch-all digit expansion.
    substitutions = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
""" from https://github.com/keithito/tacotron """ | ||
|
||
''' | ||
Defines the set of symbols used in text input to the model. | ||
The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' | ||
from text import cmudict | ||
|
||
# Special symbols: the `symbols` list below fixes '_' (padding) at index 0
# and '~' (end-of-sequence) at index 1.
_pad = '_'
_eos = '~'
# Plain-text characters accepted after cleaning: ASCII letters plus punctuation.
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in cmudict.valid_symbols]

# Export all symbols:
symbols = [_pad, _eos] + list(_characters) + _arpabet