LanguageModel.py (forked from nl8590687/ASRT_SpeechRecognition)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: nl8590687
语音识别的语言模型
基于马尔可夫模型的语言模型
"""
import platform as plat

class ModelLanguage():  # language model class for speech recognition
    def __init__(self, modelpath):
        self.modelpath = modelpath
        system_type = plat.system()  # path separators differ between operating systems, so check which one we are on
        self.slash = ''
        if(system_type == 'Windows'):
            self.slash = '\\'
        elif(system_type == 'Linux'):
            self.slash = '/'
        else:
            print('*[Message] Unknown System\n')
            self.slash = '/'
        if(self.slash != self.modelpath[-1]):  # append a trailing slash to the directory path if it is missing
            self.modelpath = self.modelpath + self.slash
        pass

    def LoadModel(self):
        self.dict_pinyin = self.GetSymbolDict('dict.txt')
        self.model1 = self.GetLanguageModel(self.modelpath + 'language_model1.txt')
        self.model2 = self.GetLanguageModel(self.modelpath + 'language_model2.txt')
        self.pinyin = self.GetPinyin(self.modelpath + 'dic_pinyin.txt')
        model = (self.dict_pinyin, self.model1, self.model2)
        return model
        pass
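
    # Note on the data files loaded above, inferred from how the loaders below
    # parse them (treat the exact layout as an assumption, not a specification):
    #   - dict.txt           : one entry per line, "pinyin<TAB>characters";
    #                          read from the current working directory, not from modelpath.
    #   - language_model1.txt: unigram counts, "character<TAB>count".
    #   - language_model2.txt: bigram counts, "character pair<TAB>count".
    #   - dic_pinyin.txt     : pinyin transitions, "syllable pair (space-separated)<TAB>count".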

    def SpeechToText(self, list_syllable):
        '''
        Processing function dedicated to speech recognition:
        converts a sequence of pinyin syllables into the final text.
        '''
        r = ''
        length = len(list_syllable)
        if(length == 0):  # the input contains no pinyin syllables at all
            return ''
        # start with the first syllable in the pinyin list
        str_tmp = [list_syllable[0]]
        for i in range(0, length - 1):
            # take two consecutive syllables at a time, starting from the first one
            str_split = list_syllable[i] + ' ' + list_syllable[i+1]
            #print(str_split, str_tmp, r)
            # if this syllable pair is in the pinyin transition dictionary
            if(str_split in self.pinyin):
                # append the second syllable to the current segment
                str_tmp.append(list_syllable[i+1])
            else:
                # otherwise do not append it; decode the pinyin segment collected so far
                str_decode = self.decode(str_tmp, 0.0000)
                #print('decode ', str_tmp, str_decode)
                if(str_decode != []):
                    r += str_decode[0][0]
                # restart with syllable i+1 as the first syllable of the next segment
                str_tmp = [list_syllable[i+1]]
        #print('final segment:', str_tmp)
        str_decode = self.decode(str_tmp, 0.0000)
        #print('remaining decode:', str_decode)
        if(str_decode != []):
            r += str_decode[0][0]
        return r
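
    # Illustrative sketch of the segmentation above (hypothetical dictionary
    # contents, for explanation only): given syllables ['ni3', 'hao3', 'ma5'],
    # if 'ni3 hao3' is in self.pinyin but 'hao3 ma5' is not, the method decodes
    # ['ni3', 'hao3'] as one segment, appends its best candidate to r, and then
    # starts a new segment from ['ma5'], which is decoded after the loop ends.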

    def decode(self, list_syllable, yuzhi=0.0001):
        '''
        Converts pinyin to text,
        based on a Markov chain.
        '''
        #assert self.dic_pinyin == null or self.model1 == null or self.model2 == null
        list_words = []
        num_pinyin = len(list_syllable)
        #print('======')
        #print('decode function: list_syllable\n', list_syllable)
        #print(num_pinyin)
        # start decoding
        for i in range(num_pinyin):
            #print(i)
            ls = ''
            if(list_syllable[i] in self.dict_pinyin):  # if this syllable is in the pinyin dictionary
                # get the list of characters for this syllable; ls contains every character this pinyin can map to
                ls = self.dict_pinyin[list_syllable[i]]
            else:
                break
            if(i == 0):
                # initial handling for the first character
                num_ls = len(ls)
                for j in range(num_ls):
                    tuple_word = ['', 0.0]
                    # set the initial state of the Markov model:
                    # the initial probability is set to 1.0
                    tuple_word = [ls[j], 1.0]
                    #print(tuple_word)
                    # add it to the list of candidate sentences
                    list_words.append(tuple_word)
                #print(list_words)
                continue
            else:
                # process the characters that follow the first one
                list_words_2 = []
                num_ls_word = len(list_words)
                #print('ls_wd: ', list_words)
                for j in range(0, num_ls_word):
                    num_ls = len(ls)
                    for k in range(0, num_ls):
                        tuple_word = ['', 0.0]
                        tuple_word = list(list_words[j])  # take each existing candidate phrase
                        #print('tw1: ', tuple_word)
                        tuple_word[0] = tuple_word[0] + ls[k]  # try every character the next syllable can map to
                        #print('ls[k] ', ls[k])
                        tmp_words = tuple_word[0][-2:]  # take the last two characters for the probability computation
                        #print('tmp_words: ', tmp_words, tmp_words in self.model2)
                        if(tmp_words in self.model2):  # check whether this bigram is in the transition table
                            #print(tmp_words, tmp_words in self.model2)
                            # core step: multiply the current probability by the transition probability;
                            # after simplification the formula is count(char n-1, char n) / count(char n-1)
                            tuple_word[1] = tuple_word[1] * float(self.model2[tmp_words]) / float(self.model1[tmp_words[-2]])
                            #print(self.model2[tmp_words], self.model1[tmp_words[-2]])
                        else:
                            tuple_word[1] = 0.0
                            continue
                        #print('tw2: ', tuple_word)
                        #print(tuple_word[1] >= pow(yuzhi, i))
                        if(tuple_word[1] >= pow(yuzhi, i)):
                            # keep candidates above the threshold, drop the rest
                            list_words_2.append(tuple_word)
                list_words = list_words_2
            #print(list_words, '\n')
        #print(list_words)
        # sort candidates by probability in descending order (simple bubble sort)
        for i in range(0, len(list_words)):
            for j in range(i + 1, len(list_words)):
                if(list_words[i][1] < list_words[j][1]):
                    tmp = list_words[i]
                    list_words[i] = list_words[j]
                    list_words[j] = tmp
        return list_words
        pass
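
    # Worked example of the bigram update in decode(), with made-up counts used
    # purely for illustration: if a candidate text ends in "你好", model2['你好']
    # is 20 and model1['你'] is 1000, the candidate's probability is multiplied
    # by 20 / 1000 = 0.02, i.e. an estimate of P(好 | 你) = count(你好) / count(你).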

    def GetSymbolDict(self, dictfilename):
        '''
        Reads the pinyin-to-Chinese-character dictionary file
        and returns the resulting dict.
        '''
        txt_obj = open(dictfilename, 'r', encoding='UTF-8')  # open the file and read it in
        txt_text = txt_obj.read()
        txt_obj.close()
        txt_lines = txt_text.split('\n')  # split the text into lines
        dic_symbol = {}  # initialize the symbol dictionary
        for i in txt_lines:
            list_symbol = []  # initialize the symbol list
            if(i != ''):
                txt_l = i.split('\t')
                pinyin = txt_l[0]
                for word in txt_l[1]:
                    list_symbol.append(word)
                dic_symbol[pinyin] = list_symbol
        return dic_symbol

    def GetLanguageModel(self, modelLanFilename):
        '''
        Reads a language model file
        and returns the resulting model dict.
        '''
        txt_obj = open(modelLanFilename, 'r', encoding='UTF-8')  # open the file and read it in
        txt_text = txt_obj.read()
        txt_obj.close()
        txt_lines = txt_text.split('\n')  # split the text into lines
        dic_model = {}  # initialize the model dictionary
        for i in txt_lines:
            if(i != ''):
                txt_l = i.split('\t')
                if(len(txt_l) == 1):
                    continue
                #print(txt_l)
                dic_model[txt_l[0]] = txt_l[1]
        return dic_model

    def GetPinyin(self, filename):
        '''
        Reads the pinyin transition file and returns a dict of
        syllable pairs that occur more than once, mapped to their counts.
        '''
        file_obj = open(filename, 'r', encoding='UTF-8')
        txt_all = file_obj.read()
        file_obj.close()
        txt_lines = txt_all.split('\n')
        dic = {}
        for line in txt_lines:
            if(line == ''):
                continue
            pinyin_split = line.split('\t')
            list_pinyin = pinyin_split[0]
            if(list_pinyin not in dic and int(pinyin_split[1]) > 1):
                dic[list_pinyin] = pinyin_split[1]
        return dic

if(__name__ == '__main__'):
    ml = ModelLanguage('model_language')
    ml.LoadModel()

    #str_pinyin = ['zhe4','zhen1','shi4','ji2', 'hao3','de5']
    #str_pinyin = ['jin1', 'tian1', 'shi4', 'xing1', 'qi1', 'san1']
    #str_pinyin = ['ni3', 'hao3','a1']
    #str_pinyin = ['wo3','dui4','shi4','mei2','cuo4','ni3','hao3']
    #str_pinyin = ['wo3','dui4','shi4','tian1','mei2','na5','li3','hai4']
    #str_pinyin = ['ba3','zhe4','xie1','zuo4','wan2','wo3','jiu4','qu4','shui4','jiao4']
    #str_pinyin = ['wo3','qu4','a4','mei2','shi4','er2','la1']
    #str_pinyin = ['wo3', 'men5', 'qun2', 'li3', 'xiong1', 'di4', 'jian4', 'mei4', 'dou1', 'zai4', 'shuo1']
    #str_pinyin = ['su1', 'an1', 'ni3', 'sui4', 'li4', 'yun4', 'sui2', 'cong2', 'jiao4', 'ming2', 'tao2', 'qi3', 'yu2', 'peng2', 'ya4', 'yang4', 'chao1', 'dao3', 'jiang1', 'li3', 'yuan2', 'kang1', 'zhua1', 'zou3']
    #str_pinyin = ['da4', 'jia1', 'hao3']
    str_pinyin = ['kao3', 'yan2', 'yan1', 'yu3', 'ci2', 'hui4']

    #r = ml.decode(str_pinyin)
    r = ml.SpeechToText(str_pinyin)
    print('Speech-to-text result:\n', r)
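
    # A minimal extra sketch (not part of the original script): calling the
    # decoder directly; the 0.0 threshold here is an arbitrary illustrative choice.
    candidates = ml.decode(str_pinyin, 0.0)
    if(candidates != []):
        # decode() returns [text, probability] pairs sorted by probability, best first
        print('Best decode candidate:', candidates[0][0], candidates[0][1])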