-
Notifications
You must be signed in to change notification settings - Fork 0
/
eng-elan.py
203 lines (180 loc) · 8.21 KB
/
eng-elan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import os
import re, sys
from pathlib import Path
import aws, gcs, wh
import xml.etree.ElementTree as ET
import tempfile
from pydub import AudioSegment
def utterance_level(output, transcriptions, threshold):
    """Group word-level tokens into utterances and write them as <span> lines.

    Helper for utterance-level extraction. Consecutive tokens are joined with
    single spaces. An utterance is closed when the gap between one token's
    ``end`` and the next token's ``start`` is larger than ``threshold``, and
    once more after the final token.

    Args:
        output: Writable text stream; one ``<span>`` line is written per
            utterance.
        transcriptions: Sequence of dicts with ``'start'``, ``'end'`` and
            ``'token'`` keys, in time order.
        threshold: Largest gap tolerated inside one utterance. May be a
            number or a numeric string; it is truncated to an integer and
            compared directly against the token times, so it has to be
            expressed in the same unit as they are.
    """
    # Function-scope import keeps this block usable on its own.
    from xml.sax.saxutils import escape

    threshold = int(float(threshold))
    last_index = len(transcriptions) - 1
    words = []
    # NOTE(review): the first utterance is stamped as starting at 0 rather
    # than at the first token's start; kept unchanged - confirm it is intended.
    start_utterance = 0

    for index, word in enumerate(transcriptions):
        end = word['end']
        words.append(word['token'])

        if index == last_index:
            # There is no following token to compare with: flush what is left
            # and stop, instead of re-testing a stale (or unset) next start.
            text = escape(" ".join(words))
            output.write(f'<span start="{start_utterance}" end="{end}"><v>{text}</v></span>\n')
            break

        next_start = transcriptions[index + 1]['start']
        if next_start > end + threshold:
            # The pause is long enough: close this utterance, open a new one.
            text = escape(" ".join(words))
            output.write(f'<span start="{start_utterance}" end="{end}"><v>{text}</v></span>\n')
            start_utterance = next_start
            words = []
def create_tier(output, transcriptions, level, threshold, tier, service):
    """Write the transcriptions to ``output`` as a ``<TIERS>`` XML document.

    Args:
        output: Path of the file to create (written as UTF-8).
        transcriptions: Transcription results. When ``tier`` is empty this is
            the flat result for the whole recording: dicts with ``'start'``,
            ``'end'`` and ``'token'`` keys, or ``'text'`` instead of
            ``'token'`` for Whisper. When ``tier`` is non-empty it is one
            dict per pre-annotated segment with ``'start'``, ``'end'`` and a
            ``'transcription'`` list holding that segment's results.
        level: ``'Utterance'``, ``'Word'`` or ``'Both'``; selects which tiers
            are written.
        threshold: Pause threshold forwarded to ``utterance_level`` when
            utterances have to be derived from word timings.
        tier: Empty string when no pre-annotated tier was supplied; any other
            value means every input segment is treated as one utterance.
        service: Name of the transcription service. ``'Whisper'`` results
            carry no word-level timestamps, so no word tier is written.
    """
    # Function-scope import keeps this block usable on its own.
    from xml.sax.saxutils import escape

    wants_utterances = level in ('Utterance', 'Both')
    wants_words = level in ('Word', 'Both')
    is_whisper = service == 'Whisper'
    has_tier = tier != ""

    with open(output, 'w', encoding='utf-8') as output_segs:

        def write_span(start, end, text):
            # Escape the text so '&' or '<' in a transcript cannot produce
            # malformed XML.
            output_segs.write(
                f'<span start="{start}" end="{end}"><v>{escape(str(text))}</v></span>\n')

        # Document header and root element.
        output_segs.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        output_segs.write(
            '<TIERS xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="file:avatech-tiers.xsd">\n')

        if wants_utterances:
            output_segs.write(
                '<TIER columns="English-utterances">\n')
            if has_tier:
                # A pre-annotated tier already defines the utterance spans.
                for segment in transcriptions:
                    results = segment['transcription']
                    if is_whisper:
                        # Only the first returned segment is used; an empty
                        # result gives an empty annotation instead of a crash.
                        text = results[0]['text'] if results else ""
                    else:
                        text = " ".join(token['token'] for token in results)
                    write_span(segment['start'], segment['end'], text)
            elif is_whisper:
                for segment in transcriptions:
                    write_span(segment['start'], segment['end'], segment['text'])
            else:
                utterance_level(output_segs, transcriptions, threshold)
            output_segs.write('</TIER>\n')

        # Whisper does not return timestamps at the word level.
        if wants_words and not is_whisper:
            output_segs.write('<TIER columns="English-words">\n')
            if not has_tier:
                for word in transcriptions:
                    write_span(word['start'], word['end'], word['token'])
            else:
                # Place the words on the timeline of their pre-annotated
                # segment: the first word starts at the segment start, the
                # last word ends at the segment end, and everything else is
                # offset from the segment start.
                # NOTE(review): assumes token offsets use the same time unit
                # as the segment boundaries - confirm against the services.
                for segment in transcriptions:
                    seg_start = segment['start']
                    seg_end = segment['end']
                    tokens = segment['transcription']
                    last_index = len(tokens) - 1
                    for index, token in enumerate(tokens):
                        if index == 0:
                            word_start = seg_start
                        else:
                            word_start = seg_start + token['start']
                        if index == last_index:
                            word_end = seg_end
                        else:
                            word_end = seg_start + token['end']
                        write_span(word_start, word_end, token['token'])
            output_segs.write('</TIER>\n')

        output_segs.write('</TIERS>\n')
def transcribe(service, source, bucket, project, filename, ou_path):
    """Transcribe ``source`` with the selected service and return its result.

    ``'AWS'`` and ``'GCloud'`` first upload the file to ``bucket`` under the
    name ``filename`` and then transcribe it from the bucket URI. Any other
    ``service`` value is handled by the local ``wh`` module, which reads
    ``source`` directly. The return value is whatever the chosen backend
    helper returns.
    """
    if service not in ('AWS', 'GCloud'):
        # Everything that is not a cloud service goes to Whisper.
        return wh.transcribe_speech(source)

    if service == 'GCloud':
        # Upload the wav file, then transcribe it from its gs:// location.
        gcs.upload_file(bucket, source, filename, project)
        gcloud_uri = "".join(("gs://", bucket, "/", filename))
        return gcs.transcribe_speech(gcloud_uri)

    # AWS: upload, then transcribe from the s3:// location.
    aws.upload_file(source, bucket, filename)
    s3_uri = "".join(("s3://", bucket, "/", filename))
    return aws.transcribe_file(filename, s3_uri, bucket, ou_path)
def split_audio(audio, tier, output):
    """Cut ``audio`` into one wav file per empty annotation found in ``tier``.

    ``tier`` is parsed as XML. Every child element without text inside a
    ``span`` element contributes one chunk, bounded by that span's ``start``
    and ``end`` attributes multiplied by 1000 and truncated to integers.
    Chunks are exported to ``output + "<n>.wav"`` with ``n`` counting from 1.

    Returns:
        A list of dicts with ``"start"``, ``"end"`` and ``"path"`` keys, one
        per exported chunk, in document order.
    """
    root = ET.parse(tier).getroot()
    recording = AudioSegment.from_wav(audio)

    # Only annotations that are still empty are selected for splitting.
    timestamps = [
        {
            "start": int(float(span.attrib['start']) * 1000),
            "end": int(float(span.attrib['end']) * 1000),
        }
        for span in root.iter("span")
        for child in span
        if child.text is None
    ]

    # Export each selected range and remember where it was written.
    for number, chunk in enumerate(timestamps, start=1):
        piece = recording[chunk["start"]:chunk["end"]]
        chunk_path = output + str(number) + ".wav"
        piece.export(chunk_path, format="wav")
        chunk["path"] = chunk_path

    return timestamps
def main():
    """Run the recognizer: read ELAN's parameters, transcribe, write the tier.

    Parameters arrive on standard input as ``<param name="...">value</param>``
    lines. When a ``tier`` parameter is given, the source audio is split into
    chunks that are transcribed one by one; otherwise the whole source file is
    transcribed in a single call. The result is written to the file named by
    ``output_segments`` and ``RESULT: DONE.`` is printed so ELAN knows the run
    has finished.
    """
    # Read in all of the parameters that ELAN passes to this local recognizer
    # on standard input.
    params = {}
    for line in sys.stdin:
        match = re.search(r'<param name="(.*?)".*?>(.*?)</param>', line)
        if match:
            params[match.group(1)] = match.group(2).strip()
    print(params)

    filename = params["filename"] + ".wav"
    service = params['transcription_service']
    # A missing tier parameter is treated the same as an empty one.
    tier = params.get('tier', "")

    if tier != "":
        # Get the split audio chunks together with their timestamps.
        audios = split_audio(params["source"], tier, params['output_path'])
        try:
            for index, chunk in enumerate(audios, start=1):
                # Job names must be unique for the service.
                jobname = params["filename"] + str(index)
                chunk['transcription'] = transcribe(
                    service, chunk['path'], params["bucket"],
                    params["project"], jobname, params["output_path"])
        finally:
            # Remove every chunk file, even when a transcription failed
            # partway through, so no temporary wav files are left behind.
            for chunk in audios:
                try:
                    os.remove(chunk['path'])
                except FileNotFoundError:
                    pass
        results = audios
    else:
        results = transcribe(service, params["source"], params["bucket"],
                             params["project"], filename, params["output_path"])

    # Write the results to the output tier file.
    create_tier(params['output_segments'], results,
                params['level'], params['threshold'], tier, service)

    # Finally, tell ELAN that we're done.
    print('RESULT: DONE.', flush=True)


if __name__ == "__main__":
    main()