-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcfr_adj.py
204 lines (195 loc) · 5.73 KB
/
cfr_adj.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
""" cfr_adj.py Sep 29, 2014
Oct 17, 2014: Separate collations by dictionary
[email protected] Oct 18, 2014: Use 'dictionaries' subdirectory.
Put correction forms in dictionaries/X/ directory
Usage: python cfr_adj.py cfr.tsv correctionform.txt
Note: cfr.tsv is created from Google Spreadsheet
'Sanskrit-Lexicon Correction form (Responses)'
by 'File/Download as tab-separated values'
Jul 18, 2015 Sort records by time, since Google doesn't append
new records from Correction form to the end.
Aug 2, 2015 Correct construction of sorttime.
"""
import re,sys,os
import codecs
def oneline(x):
parts = re.split(r'[\r\n]',x)
y = ' '.join(parts)
return y
class CFR(object):
def __init__(self,line,n):
parts = line.split('\t')
self.line = line
self.n = n
if len(parts)!= 8:
print "# of tab-parts should be 8, but is",len(parts)
out = "Error 1 for line %s:\n%s" %(n,line)
print out.encode('utf-8')
exit(1)
self.time = oneline(parts[0])
# Jul 18, 2015 - Generate a sortable timefield
# Assume time is mm/dd/yyyy hh:mm:ss
# Change to yyyymmdd-hh:mm:ss-nnnn (nnnnnn = self.n)
try:
timeparts = re.split(r'[/: ]',self.time)
mm = int(timeparts[0])
dd = int(timeparts[1])
yyyy = int(timeparts[2])
h = int(timeparts[3])
m = int(timeparts[4])
s = int(timeparts[5])
self.sorttime = "%4d%02d%02d-%02d%02d%02d-%06d" %(yyyy,mm,dd,h,m,s,n)
except:
if n != 1:
print "ERROR time='%s'" % self.time
print n,line.encode('utf-8')
print re.split(r'[/: ]',self.time)
exit()
else: # case n=1
(mm,dd,yyyy,h,m,s) = (0,0,0,0,0,0)
self.sorttime = "%4d%02d%02d-%02d%02d%02d-%06d" %(yyyy,mm,dd,h,m,s,n)
self.dict = oneline(parts[1])
if self.dict == "APES":
self.dict = "AE"
elif self.dict == "PWG2013":
self.dict = "PWG"
self.lnum = oneline(parts[2])
self.hw = oneline(parts[3])
self.old = oneline(parts[4])
self.new = oneline(parts[5])
self.comment = oneline(parts[6])
email = oneline(parts[7])
self.email = email
eparts = email.split(r':')
if eparts >= 2:
self.user = eparts[0]
self.status = ':'.join(eparts[1:])
else:
self.user = email
self.status = ''
self.user = self.user.strip()
#if self.user == '':
# self.user='NONE'
self.useradj = re.sub(r'@.*$','',self.user)
def listform(self):
return [self.time,self.dict,self.lnum,self.hw,self.old,self.new,self.comment,self.useradj,self.status]
def outputrec(rec,i):
outar=[]
(date,time) = rec.time.split(' ')
out = "Case %s: %s dict=%s, L=%s, hw=%s, user=%s" %(
rec.n,date,rec.dict,rec.lnum,rec.hw,rec.useradj)
outar.append(out)
outar.append("old = %s" % rec.old)
outar.append("new = %s" % rec.new)
if rec.comment != 'Typo':
outar.append('comment = %s' % rec.comment)
if rec.status == '':
rec.status = 'PENDING'
outar.append('status = %s' % rec.status)
outar.append('-'*72)
outar.append('')
return outar
def generate_output(dcode,filename,recs):
allarr =[] # array of all output lines
if dcode == "ALL":
out = "Sanskrit Lexicon Correction Form History"
else:
out = "Sanskrit Lexicon Correction Form History for %s" % dcode
allarr.append(out)
import datetime
today = datetime.date.today()
date = today.strftime("%B %d, %Y")
out = "As of %s" % date
allarr.append(out)
idxpending = len(allarr) # prepare place-holder
allarr.append("DUMMY")
allarr.append("")
#fout.write("%s\n" % out)
#fout.write("\n")
m = len(recs)
npending=0
nfound = 0
# recs is in ascending order of sorttime. Read array backwards
# so new data at the top.
for i in xrange(m-1,-1,-1):
rec = recs[i]
if not (dcode in ['ALL',rec.dict]):
continue
nfound = nfound + 1
outar = outputrec(rec,i)
for out in outar:
allarr.append(out)
if out == 'status = PENDING':
npending=npending + 1
# Fill allarr[idxpending]
allarr[idxpending]="%s correction records, with %s PENDING" %(nfound,npending)
# generate fileout from dcode and filename
if dcode == "ALL":
fileout = filename
else:
dir = "%s/%s" %("dictionaries",dcode)
fileout = "%s/%s_%s" %(dir,dcode,filename)
if not os.path.isdir(dir):
if os.path.exists(dir):
print "ERROR: %s exists, but is not a directory" % dir
exit(1)
os.mkdir(dir,0755)
fout = codecs.open(fileout,'w','utf-8')
for out in allarr:
fout.write("%s\n" % out)
fout.close()
return npending
def adjust(filein,fileout):
f = codecs.open(filein,'r','utf-8')
n = 0
recsin=[]
dictmap = {}
for line in f:
line = line.rstrip('\r\n')
n = n + 1
rec = CFR(line,n)
if n == 1:
hrec = rec
continue
recsin.append(rec)
d = rec.dict
if d not in dictmap:
dictmap[d] = []
dictmap[d].append(rec)
f.close()
# sort recsin in order of sorttime
recs=sorted(recsin,key = lambda rec:rec.sorttime)
# change 'n' based on sort order
for j in xrange(0,len(recs)):
rec = recs[j]
out = "%s,%s,%s" %(rec.sorttime,rec.n,rec.lnum)
rec.case = rec.n # new
rec.n = j+1
knowndicts = ["AE","AP","AP90","BEN","BHS","BOR","BUR","CAE","CCS",
"GRA","MW","MW72","PUI","PW","PWG",
"SCH","SHS","SKD","STC","VCP","VEI","WIL","GST","PD","MD",
"MCI","YAT","MWE","INM","IEG","PE","ACC","BOP","KRM"]
npending = generate_output("ALL",fileout,recs)
print n,"lines read from",filein
print npending,"cases are pending"
#print "dbg exiting early"
for d in dictmap:
d = d.upper() # Jan 25, 2017
if d not in knowndicts:
out = "UNKNOWN DICTIONARY: %s %s" %(d,len(dictmap[d]))
print out.encode('utf-8')
m = len(recs)
print "DBG: m=",m
for i in xrange(0,m):
rec = recs[i]
if rec.dict == d:
outar=outputrec(rec,i)
for out in outar:
print out.encode('utf-8')
else:
generate_output(d,fileout,recs)
#-----------------------------------------------------
if __name__=="__main__":
filein = sys.argv[1]
fileout = sys.argv[2]
adjust(filein,fileout)