-
Notifications
You must be signed in to change notification settings - Fork 9
/
mondegreen.py
144 lines (126 loc) · 4.05 KB
/
mondegreen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
Try to produce manglings like "Tweeze denied beef worker isthmus".
"""
import re, sys, textwrap
from math import log10
from memo import memo
from pdist import Pw
from simpleverse import find_rime
# Tuning knobs read by compute_best().
longest = 20         # most phones a single candidate word may span
match_cost = 25      # added when a candidate is the original word at that spot
fit_cost = 5         # added when a candidate starts where an original word started
rarity_cost = 5      # multiplies -log10(Pw(word))
roughened_cost = 15  # added when a word matches only after roughening

# Two ways of deciding which phones count as "roughly the same". The
# constant condition selects the first; the second is kept as an
# alternative that is currently switched off.
if True:
    roughened_cost = 5
    # Map each listed phone to the representative it is replaced by.
    roughener = {phone: target
                 for target, sources in (('t', 'd dh th'),
                                         ('r', 'l'),
                                         ('s', 'sh z zh'))
                 for phone in sources.split()}
else:
    # Each class collapses onto its first member.
    rough_classes = [group.split() for group in (
        'b d g k p t',
        'ch jh',
        'f dh s sh th v z zh',
        'm n ng',
        'l r',
        'hh w y',
    )]
    roughener = {phone: rc[0]
                 for rc in rough_classes
                 for phone in rc}
def roughen(phones):
    """Return a tuple of coarsened phones, one per entry of `phones`.

    A phone whose last character is a digit (e.g. 'ey1' in this file's
    examples) is reduced to just that digit. Any other phone is replaced
    by its `roughener` entry, or kept as-is when it has none.
    """
    def coarsen(phone):
        last = phone[-1]
        if last.isdigit():
            return last
        return roughener.get(phone, phone)

    return tuple(map(coarsen, phones))
# Pronunciation tables, filled from the dictionary file below.
phones_of_word = {}    # word -> phones from its entry without a "(n)" suffix
words_of_phones = {}   # phones -> every word listed with exactly those phones
rough_words = {}       # roughen(phones) -> every word listed with those phones

# Use a context manager so the dictionary file is closed once it has been
# read, instead of leaving the handle open for the life of the process.
with open('cmudict.0.7a') as dictionary_file:
    for line in dictionary_file:
        if line.startswith(';'): continue    # lines starting with ';' are skipped
        s = line.lower().split()
        if not s: continue                   # blank line
        word, phones = s[0], tuple(s[1:])
        if word.endswith(')'):
            # An entry like "word(2)": index it under the bare word, but
            # leave the word's phones_of_word entry untouched.
            word = word.rstrip('(0123456789)')
        else:
            phones_of_word[word] = phones
        words_of_phones.setdefault(phones, []).append(word)
        rough_words.setdefault(roughen(phones), []).append(word)
def pronounce(word):
    """Return the tuple of phones recorded for `word`.

    A word missing from `phones_of_word` is registered on the spot with
    itself as its only "phone", in both `phones_of_word` and
    `words_of_phones`, so later lookups find it.
    """
    try:
        return phones_of_word[word]
    except KeyError:
        pseudo_phones = (word,)
        phones_of_word[word] = pseudo_phones
        words_of_phones[pseudo_phones] = [word]
        return pseudo_phones
def pronounce_all(words):
    """Return the phones of every word in `words`, joined into one tuple."""
    all_phones = ()
    for word in words:
        all_phones = all_phones + pronounce(word)
    return all_phones
def pronounce_line(words):
    """Return (phones, bounds) for a sequence of words.

    `phones` is the concatenated pronunciation of all the words. `bounds`
    runs parallel to it: the slot for a word's first phone holds the word
    itself, and the slots for its remaining phones hold None.
    """
    phone_list = []
    bound_list = []
    for word in words:
        word_phones = pronounce(word)
        phone_list.extend(word_phones)
        bound_list.append(word)
        bound_list.extend([None] * (len(word_phones) - 1))
    return tuple(phone_list), tuple(bound_list)
def roughen_line(phones, rhyming):
    """Roughen a line's phones, optionally leaving its ending untouched.

    When `rhyming` is true, phones from index find_rime(phones) onward
    are returned unchanged (presumably the line's rime -- confirm against
    simpleverse.find_rime). Otherwise the whole line is roughened.
    """
    if rhyming:
        keep_from = find_rime(phones)
    else:
        keep_from = len(phones)
    head = phones[:keep_from]
    preserved_tail = phones[keep_from:]
    return roughen(head) + preserved_tail
## pronounce('yay')
#. ('y', 'ey1')
## pronounce_all(('darius',))
#. ('d', 'er0', 'ay1', 'ah0', 's')
## pronounce_all(tuple("the light".split()))
#. ('dh', 'ah0', 'l', 'ay1', 't')
def compute_best(phones, rough_phones, bounds, costs, seqs, i):
    """Return the cheapest (cost, words) found for the first `i` phones.

    `costs[k]` and `seqs[k]` must already hold the best cost and word
    tuple for the first k phones, for every k < i. Each candidate extends
    one of those with a single word spanning the last 1..`longest`
    phones, matched either exactly or after roughening. When no word
    fits at all, the placeholder (1e6, ('XXX',)) is returned.
    """
    candidates = []
    for length in range(1, min(i, longest) + 1):
        start = i - length
        assert len(phones[:start]) < len(phones)
        # bounds[start] is the original word beginning at this phone, or
        # None when `start` falls inside an original word.
        original_word = bounds[start]
        prefix_words = seqs[start]
        base_cost = costs[start] + fit_cost*(original_word is not None)

        exact_words = words_of_phones.get(phones[start:i], ())
        priced = [(word, base_cost) for word in exact_words]
        # Words that fit only after roughening pay an extra charge.
        priced.extend((word, base_cost + roughened_cost)
                      for word in rough_words.get(rough_phones[start:i], ())
                      if word not in exact_words)

        for word, common_cost in priced:
            # Lower Pw(word) gives a larger -log10 term, so rarer words
            # cost more; reusing the original word costs extra as well.
            total = (common_cost
                     - rarity_cost*log10(Pw(word))
                     + match_cost*(word == original_word))
            candidates.append((total, prefix_words + (word,)))

    if not candidates:
        return (1e6, ('XXX',))
    return min(candidates)
def transcribe(phones, rough_phones, bounds):
    """Search for a low-cost word sequence that sounds like `phones`.

    Returns (cost, words), where a lower cost means a better match. The
    three arguments are parallel sequences as produced by
    pronounce_lines(). Empty input yields (0, ()).
    """
    assert len(bounds) == len(phones)
    # best_costs[k] / best_seqs[k] describe the best result for the first
    # k phones; each step extends the tables by one phone.
    best_costs = [0]
    best_seqs = [()]
    for end in range(1, len(phones) + 1):
        best = compute_best(phones, rough_phones, bounds,
                            best_costs, best_seqs, end)
        best_costs.append(best[0])
        best_seqs.append(best[1])
    return best_costs[-1], best_seqs[-1]
## -log10(Pw('the'))
#. 1.6506163745527287
## -log10(Pw('felonious'))
#. 7.04004028985589
## -log10(Pw('dar'))
#. 5.644601987828583
def pronounce_lines(lines, rhyming):
    """Return (phones, rough_phones, bounds) for an iterable of text lines.

    Each line is lowercased and split into runs of word characters and
    apostrophes, then pronounced. The three results are parallel tuples
    covering all the lines in order; `rhyming` is passed on to
    roughen_line() for every line.
    """
    all_phones = []
    all_rough = []
    all_bounds = []
    for text in lines:
        tokens = re.findall(r"['\w]+", text.lower())
        line_phones, line_bounds = pronounce_line(tokens)
        all_phones.extend(line_phones)
        all_rough.extend(roughen_line(line_phones, rhyming))
        all_bounds.extend(line_bounds)
    return tuple(all_phones), tuple(all_rough), tuple(all_bounds)
def main(argv):
    """Read text from standard input and print a mangled transcription.

    `argv` is the full argument list including the program name. Rhyme
    preservation is enabled only when the arguments after the program
    name are exactly ['--rhyme'].

    Prints the total cost on one line, then the chosen words lowercased
    and wrapped at 60 columns.
    """
    rhyming = (argv[1:] == ['--rhyme'])
    phones, rough_phones, bounds = pronounce_lines(sys.stdin, rhyming)
    cost, words = transcribe(phones, rough_phones, bounds)
    # Single-argument print(...) produces the same output as the Python 2
    # print statement and is also valid syntax on Python 3.
    print(cost)
    print(textwrap.fill(' '.join(words).lower(), 60))

if __name__ == '__main__':
    main(sys.argv)