-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtime_stamp.py
62 lines (53 loc) · 2.81 KB
/
time_stamp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from util import FunASRModel,generate_results
from sentences_method import generate_new_sentences
from automodel_rec_to_sentences import convert_format,remove_chinese_punctuation #把automodel 返回的rec转换成以前Pipline的格式。
from automodel_rec_to_sentences import calculate_length,segment_text,split_into_words #用来debug,检测两句是否一样长,是否有标点符号没去掉。
# Long-text writer: transcribe a recording and save its timestamped sentences
def write_long_txt_with_timestamp(wav_name, cut_line, hot_word, debug=True):
    """Transcribe a recording and save one ``start|end|text`` line per sentence.

    The recognition result is converted to sentences, re-split with
    ``generate_new_sentences`` and written to ``./tmp/{wav_name}.txt``. Each
    output line is also printed to stdout.

    Args:
        wav_name: Forwarded to ``generate_results``; also used as the stem of
            the output file name.
        cut_line: Forwarded to ``generate_new_sentences`` as ``cutline``. One
            sentence is split in two when the gap between its parts exceeds
            this value (the original author notes a default of 1000 ms).
        hot_word: Forwarded to ``generate_results``.
        debug: When truthy, print the debug-mode sentences, their summed
            length and the number of timestamps so that the two lengths can
            be compared by eye.

    Returns:
        The unmodified response returned by ``generate_results``.
    """
    import os  # function-scope import: only needed to prepare the output dir

    asr = FunASRModel()
    model = asr.full_version()
    response = generate_results(model=model, wav_name=wav_name, hot_word=hot_word)

    if debug:
        debug_sentences = convert_format(response, debug=True)
        # An English word is counted as the length of one Chinese character.
        total_length = sum(calculate_length(sentence) for sentence in debug_sentences)
        # Compare the two lengths: if they differ, some symbol was left over
        # and not accounted for. English text is timestamped per word, not per
        # letter, whether or not the word is valid -- e.g. "ablilly" and
        # "koliyaal" are two words and take two [start, end] pairs. English is
        # sometimes recognised, so the lengths have to be aligned this way.
        # NOTE(review): printed once after the loop; the original indentation
        # was not available -- confirm this was not meant to run per sentence.
        print(debug_sentences, total_length,
              len(response[0]["timestamp"]))

    sentences = convert_format(response)
    print("=====")
    print(sentences)
    print("=====")

    # Split a sentence in two where the pause between its parts exceeds cut_line.
    sentences = generate_new_sentences(sentences=sentences, cutline=cut_line)

    lines = []
    for sentence in sentences:
        ts_list = sentence["ts_list"]
        # First start value and last end value of the sentence's timestamp pairs.
        start = ts_list[0][0]
        end = ts_list[-1][-1]
        line = str(start) + "|" + str(end) + "|" + sentence["text"]
        lines.append(line)
        print(line)

    output_path = f'./tmp/{wav_name}.txt'
    # Create the target directory up front so the write does not fail with
    # FileNotFoundError on a fresh checkout where ./tmp does not exist yet.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    write_lines_to_file(output_path, lines)
    return response
# Write the given lines to a file, one per line
def write_lines_to_file(file_path, lines):
    """Write every entry of ``lines`` to ``file_path``, each followed by a newline.

    The file is opened in text write mode with UTF-8 encoding, so any existing
    content is replaced.
    """
    with open(file_path, mode='w', encoding='utf-8') as out:
        out.writelines(entry + '\n' for entry in lines)
if __name__ == "__main__":
    # Script entry point: run the export for the input named "example" with a
    # cut line of 1000, no hot word, and debug output enabled.
    write_long_txt_with_timestamp(
        wav_name="example",
        cut_line=1000,
        hot_word="",
        debug=True,
    )