-
Notifications
You must be signed in to change notification settings - Fork 0
/
transcoder.py
411 lines (386 loc) · 12.4 KB
/
transcoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
#!/usr/bin/env python
#-*- coding:utf-8 -*-
""" Python version of transcoder.
Uses built-in library xml.etree.ElementTree,
rather than lxml.
Revised 02-20-2017 Regarding special handling of slp1 to deva;
search for regexCode variable, and fsmentry['regex'] for where this comes into play.
This kind of coding is ugly, and needs to be revised for greater generality.
"""
from __future__ import print_function
__program_name__ = 'transcoder.py'
__author__ = 'Jim Funderburk'
__email__ = '[email protected]'
__copyright__ = 'Copyright 2011, Jim Funderburk'
__license__ = 'GPL http://www.gnu.org/licenses/gpl.txt'
__date__ = '2011-12'
# Python Standard Library
import os
import sys
import codecs
#import locale
import re
#import logging
from unicodedata import normalize
#from operator import itemgetter
#from lxml import etree
import xml.etree.ElementTree as ET
## Jim Funderburk recoding into php of Java code developed by
## Ralph Bunker.
## This software is made available under the
## Creative Commons Attribution Non-Commercial Share Alike license available in full at <ptr target="http://creativecommons.org/licenses/by-nc-sa/3.0/legalcode"/>, and summarized at <ptr target="http://creativecommons.org/licenses/by-nc-sa/3.0/"/>. Permission is granted to build upon this work non-commercially, as long as credit is explicitly acknowledged exactly as described herein and derivative work is distributed under the same license.
## Assume transcoder xml files are in directory ../data/transcoder,
## relative to the directory containing this transcoder.php file
## two global variables
# Directory searched for the transcoder xml files.  The default is
# ../data/transcoder relative to the directory containing this transcoder.py
# file, i.e. <parent of this file's directory>/data/transcoder.
transcoder_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
transcoder_dir += "/data/transcoder"
# Cache of the finite state machines loaded so far.
# Keys have the form from + "_" + to.
transcoder_fsmarr = {}
# Major version of the running interpreter as a string ('2' or '3').  It is
# read from sys.version_info instead of slicing the free-form sys.version text.
python_version = str(sys.version_info[0])
if python_version == '3':
    # Python 3 has no xrange/unichr builtins; alias them to their equivalents
    # so the rest of the module can keep using the Python 2 names.
    xrange = range
    unichr = chr
def _transcoder_fsm_dump(fromto, filein, fsmentries, states):
    """Write a readable dump of a freshly built FSM to dbg_<fromto>.txt.

    The file is created in the current working directory and encoded as
    utf-8.  It lists every entry (selected keys plus the serialized <e>
    element) followed by the first-character index in states.
    """
    print("filein=", filein)
    filedbg = "dbg_%s.txt" % fromto
    print("transcoder.py. Dbg info written to", filedbg)
    keys = ['starts', 'in', 'regex', 'out', 'next', 'inraw', 'outraw']
    # The with-block closes the file even if formatting an entry fails.
    with codecs.open(filedbg, "w", "utf-8") as fdbg:
        fdbg.write("fsmentries=...\n")
        for i, fsmentry in enumerate(fsmentries):
            parts = []
            for key in keys:
                if key not in fsmentry:  # 'regex' is optional
                    continue
                val = fsmentry[key]
                if key == 'starts':
                    val = ' '.join(val)
                parts.append("%s => %s" % (key, val))
            fdbg.write("fsmentry[%s]=%s\n" % (i, ' , '.join(parts)))
            fdbg.write(" e-elt=%s\n" % fsmentry['e-elt'])
        fdbg.write("states=...\n")
        for c in states:
            indexes = ' '.join('%s' % i for i in states[c])
            fdbg.write("c=%s, state=%s\n" % (c, indexes))


def transcoder_fsm(sfrom, to, debug=False):
    """Load the finite state machine for sfrom -> to into transcoder_fsmarr.

    The machine is read from <transcoder_dir>/<sfrom>_<to>.xml and stored
    under the key sfrom + "_" + to.  Nothing happens (and nothing is stored)
    when the machine is already cached or when the xml file does not exist.

    Arguments:
      sfrom -- name of the source coding, e.g. 'slp1'
      to    -- name of the target coding, e.g. 'deva'
      debug -- when true, and the machine is actually built by this call,
               also write a dump to dbg_<sfrom>_<to>.txt (default False)

    Returns None in every case.

    The stored machine is a dict with keys
      'start'  -- value of the root element's required 'start' attribute
      'fsm'    -- list of entry dicts, one per accepted <e> element
      'states' -- dict mapping the first character of an entry's 'in' value
                  (or '' for an empty 'in') to the list of indexes into
                  'fsm' of the entries starting with that character
    """
    fromto = sfrom + "_" + to
    if fromto in transcoder_fsmarr:
        return
    # Look-ahead entries (see below) are honoured only for these pairs.
    # Prefix tests are used so that variant names of from/to also qualify.
    regexCode = None
    for from_prefix, to_prefix in (('slp1', 'deva'),
                                   ('deva', 'slp1'),
                                   ('hkt', 'tamil')):
        if sfrom.startswith(from_prefix) and to.startswith(to_prefix):
            regexCode = from_prefix + '_' + to_prefix
            break
    filein = transcoder_dir + '/' + fromto + ".xml"
    if not os.path.exists(filein):
        return
    root = ET.parse(filein).getroot()
    fsm = {'start': root.attrib['start']}  # 'start' attribute is required
    # One dict per <e> child of the root; any other child is skipped.
    fsmentries = []
    for e in root:
        if e.tag != 'e':
            continue
        inval = e.find("in").text or ''  # <in></in> parses to None
        conlook = False
        lookahead = re.match(r'^([^/]+)/\^', inval)
        if lookahead:
            # In transcoding from slp1 to devanagari, it is necessary to do
            # a 'look-ahead' when deciding how to code a consonant.  The xml
            # codes this in the input value, for example
            #    k/^([^aAiIuUfFxXeEoO^/\\])
            # Only the text before '/^' is kept as the input; the test on
            # the following character is selected by regexCode and is
            # performed when the entry is matched against a line.
            if not regexCode:
                # Look-ahead entries are dropped for all other pairs.
                continue
            inval = lookahead.group(1)
            conlook = True
        # <s> = state name(s) of this entry; may be a comma-delimited list.
        startStates = e.find("s").text.split(",")
        outval = e.find("out").text
        if outval is None:  # the parser returns None for <out></out>
            outval = ''
        # <next> is optional; its absence means "use the first start state".
        nextElt = e.find("next")
        if nextElt is not None:
            nextState = nextElt.text
        else:
            nextState = startStates[0]
        # inval/outval may spell unicode as \uxxxx\uyyyy (four hex digits
        # each); to_unicode turns that notation into real characters.
        fsmentry = {}
        fsmentry['starts'] = startStates
        fsmentry['in'] = to_unicode(inval)
        if conlook:
            # 'regex' is present only on look-ahead entries.
            fsmentry['regex'] = regexCode
        fsmentry['out'] = to_unicode(outval)
        fsmentry['next'] = nextState
        # Raw (unconverted) values and the serialized element are kept
        # alongside; the debug dump prints them.
        fsmentry['inraw'] = inval
        fsmentry['outraw'] = outval
        fsmentry['e-elt'] = ET.tostring(e)
        fsmentries.append(fsmentry)
    fsm['fsm'] = fsmentries
    # Index the entries by the first character of their input.  An entry
    # with an empty input is filed under the empty string.
    states = {}
    for ientry, fsmentry in enumerate(fsmentries):
        states.setdefault(fsmentry['in'][:1], []).append(ientry)
    fsm['states'] = states
    transcoder_fsmarr[fromto] = fsm
    if debug:
        _transcoder_fsm_dump(fromto, filein, fsmentries, states)
def to_unicode(x):
    r"""Convert \uxxxx notation at the start of a string to real characters.

    x is assumed to be a string with one of two forms:
      (a) it starts with \u -- it is split at every \u; the first four
          characters of each piece are read as a hexadecimal code point and
          replaced by that character, and the rest of the piece is copied.
      (b) anything else (including the bare two-character string \u, where
          the notation is confusing) -- returned without change.

    A piece whose leading characters are not a valid hexadecimal number is
    copied through literally, with its \u prefix restored, instead of
    raising ValueError.
    """
    if x == "\\u" or not x.startswith("\\u"):
        return x
    # unichr is the Python 2 builtin (this module aliases it to chr on
    # Python 3); fall back to chr when the name is not available.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    pieces = []
    for chunk in x.split("\\u"):
        if chunk == '':
            continue
        try:
            char = to_char(int(chunk[:4], 16))
        except ValueError:
            # Not hex digits after all: keep the text exactly as written.
            pieces.append("\\u" + chunk)
        else:
            pieces.append(char)
            pieces.append(chunk[4:])
    return ''.join(pieces)
# Virama and vowel signs, written in \uxxxx notation.  These are the strings
# that the 'deva_slp1' look-ahead test checks for after a consonant.
vowel_signs = ['\u094d','\u093e','\u093f','\u0940','\u0941','\u0942','\u0943','\u0944','\u0962','\u0963','\u0947','\u0948','\u094b','\u094c']
# The same signs after passing each one through to_unicode.  Under Python 2
# the literals above are six-character escape strings and get converted here;
# under Python 3 they are already single characters and pass through unchanged.
vowel_signs_unicode = [to_unicode(sign) for sign in vowel_signs]
def transcoder_processString(line, from1, to):
    """Transcode line from coding from1 to coding to.

    Arguments:
      line  -- the text to convert
      from1 -- name of the source coding
      to    -- name of the target coding

    Returns the converted text.  line is returned unchanged when from1 == to,
    or when no machine for the pair is cached in transcoder_fsmarr even after
    asking transcoder_fsm to load it.

    The machine is run left to right.  At each position the entries indexed
    under the current character are tried; only entries listing the current
    state among their start states are eligible, and of those the one with
    the longest match wins (the earliest such entry on ties).  Its 'out'
    value is emitted and its 'next' value becomes the current state.  When
    no entry applies, the character is copied and the state is reset to the
    machine's start state.
    """
    if from1 == to:
        return line
    fromto = from1 + "_" + to
    if fromto not in transcoder_fsmarr:
        transcoder_fsm(from1, to)  # load on demand
    fsm = transcoder_fsmarr.get(fromto)
    if fsm is None:
        return line
    start_state = fsm['start']
    fsmentries = fsm['fsm']
    states = fsm['states']
    current_state = start_state
    pieces = []  # output fragments; joined once at the end
    n = 0        # current character position in line
    m = len(line)
    while n < m:
        c = line[n]
        best_entry = None
        nbest = 0
        for isub in states.get(c, ()):
            fsmentry = fsmentries[isub]
            if current_state not in fsmentry['starts']:
                continue
            nmatch = len(transcoder_processString_match(line, n, m, fsmentry))
            # Strictly greater: a zero-length result means "did not match".
            if nmatch > nbest:
                nbest = nmatch
                best_entry = fsmentry
        if best_entry is None:
            # Default condition: emit the character, change state to start.
            pieces.append(c)
            current_state = start_state
            n += 1
        else:
            pieces.append(best_entry['out'])
            current_state = best_entry['next']
            n += nbest
    return ''.join(pieces)
def transcoder_processString_match(line, n, m, fsmentry):
    r"""Try to apply one FSM entry at position n of line.

    Arguments:
      line     -- the text being transcoded
      n        -- index in line at which the entry's input must start
      m        -- end bound for matching (transcoder_processString passes
                  len(line))
      fsmentry -- entry dict; 'in' is read, and 'regex' when present

    Returns fsmentry['in'] when the entry applies and "" when it does not.
    The two cases cannot be told apart for an entry whose 'in' is empty.

    Entries carrying a 'regex' code (see transcoder_fsm) additionally look
    at what follows the matched text; when the match ends exactly at m there
    is nothing to look at and the entry applies.  Otherwise:
      'slp1_deva' -- fails if the next character is one of
                     aAiIuUfFxXeEoO ^ / \
      'hkt_tamil' -- fails if the next character is one of aAiIuUeEoO
      'deva_slp1' -- fails if the text continues with any of the strings in
                     the module-level list vowel_signs_unicode
    Any other code makes the entry fail.
    """
    edge = fsmentry['in']
    if not line.startswith(edge, n, m):
        return ""
    if 'regex' not in fsmentry:
        return edge
    n1 = n + len(edge)  # position just past the matched text
    if n1 == m:
        return edge
    d = line[n1]
    code = fsmentry['regex']
    if code == 'slp1_deva':
        # The rule is for a consonant NOT followed by a vowel; '^', '/' and
        # '\' are in the list only because of accents.
        return "" if d in "aAiIuUfFxXeEoO^/\\" else edge
    if code == 'hkt_tamil':
        return "" if d in "aAiIuUeEoO" else edge
    if code == 'deva_slp1':
        for sign in vowel_signs_unicode:
            if line.startswith(sign, n1, m):
                # The consonant is followed by a virama or vowel sign:
                # return the empty string to indicate rule failure.
                return ""
        # The consonant is followed by neither a virama nor a vowel sign.
        return edge
    return ""
def transcoder_processElements(line, from1, to, tagname):
    """Transcode only the parts of line marked up as <tagname>...</tagname>.

    For example, if tagname = 'SA' and
      line = 'The word <SA>rAma</SA> refers to a person',
    the result is 'The word XXX refers to a person', where XXX is the
    transformation of the string 'rAma' according to from1, to.  The tags
    themselves are removed from the result.

    Arguments:
      line    -- the text to scan
      from1   -- name of the source coding
      to      -- name of the target coding
      tagname -- element name; it is matched literally, so characters that
                 are special in regular expressions (such as '.') stand only
                 for themselves

    Returns line with every marked span replaced by the result of
    transcoder_processString on its content.  The content is matched
    non-greedily and, since '.' is used without DOTALL, never across a
    newline.
    """
    global transcoder_from, transcoder_to
    # Still published as module globals because
    # transcoder_processElements_callback reads them; the substitution below
    # no longer depends on them.
    transcoder_from = from1
    transcoder_to = to
    tag = re.escape(tagname)
    pattern = '<%s>(.*?)</%s>' % (tag, tag)

    def convert(match):
        # from1/to are captured here, so a later call with other codings
        # cannot change what this substitution uses.
        return transcoder_processString(match.group(1), from1, to)

    return re.sub(pattern, convert, line)
def transcoder_processElements_callback(match):
    """Substitution callback: transcode the first group of a regex match.

    match is a regular-expression match object.  Its group(1) is converted
    with transcoder_processString, using the codings currently stored in the
    module-level names transcoder_from and transcoder_to (these are set by
    transcoder_processElements).  Returns the converted text.
    """
    marked_text = match.group(1)
    return transcoder_processString(marked_text, transcoder_from, transcoder_to)
def transcoder_set_dir(dir):
    """Change the directory in which the transcoder xml files are looked up.

    dir is made absolute with os.path.abspath.  If the result is an existing
    directory it becomes the new value of the module-level transcoder_dir;
    otherwise (missing path, or a path naming a regular file) transcoder_dir
    is left as it was.

    Returns the value of transcoder_dir in effect after the call, so a
    caller can detect a rejected dir by comparing the result with what it
    asked for.

    NOTE: machines already cached in transcoder_fsmarr are not discarded, so
    a pair loaded from the previous directory keeps being used.
    """
    global transcoder_dir
    path = os.path.abspath(dir)
    # isdir rather than exists: xml files are looked up inside this path,
    # so a regular file can never be a usable value.
    if os.path.isdir(path):
        transcoder_dir = path
    return transcoder_dir
def transcoder_get_dir():
    """Return the current value of the module-level transcoder_dir."""
    return transcoder_dir