forked from audacity/audacity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsmartmsgmerge.py
executable file
·265 lines (241 loc) · 7.11 KB
/
smartmsgmerge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#!/usr/bin/python
#
# smartmsgmerge.py
#
# Written by Dominic Mazzoni, 2006
# GNU General Public License 2.0
#
# This is a replacement for the GNU gettext "msgmerge" program, which
# is typically used to update a .po file (def) to the latest .pot file
# (ref). This program is not command-line compatible; it takes no
# flags but simply the def, ref, and output file names.
#
# It uses a much faster and also much stricter policy for finding new
# fuzzy matches: the edit-distance must be no more than 4%, or for very
# short strings, no more than 1 character.
#
# This makes it safe for you to enable fuzzy strings in your .mo file
# without worrying that they'll be too terrible.
#
# It also fixes translations where the beginning and ending newlines
# do not match the original string.
#
import sys, os
# Validate the command line: exactly three file names are required.
if len(sys.argv) != 4:
    # A single pre-formatted argument keeps this print valid, with
    # identical output, under both Python 2 and Python 3.
    print("Usage: %s def.po ref.pot out.po" % sys.argv[0])
    # Exit with a non-zero status so calling scripts can detect the
    # usage error; a bare sys.exit() would report success.
    sys.exit(1)

# def: the existing .po file, ref: the latest .pot file, out: the
# merged .po file to write.
def_filename, ref_filename, out_filename = sys.argv[1:4]
# Each object will contain the comments, msgid (untranslated),
# msgstr (translated), and a fuzzy flag. For simplicity this is
# not a typechecked class, just a dummy dynamic container class.
class obj:
    """Bare attribute container describing one .po / .pot entry.

    Instances start out empty; the rest of this script assigns
    attributes such as comments, msgid, msgstr, fuzzy and line_no
    directly on them.
    """
# Compute the edit-distance between str1 and str2, taking a couple
# of shortcuts such that it returns 999 quickly if the edit-distance
# is clearly not going to be less than 10 percent.
def edit_distance(str1, str2):
    """Return a beam-limited edit distance between str1 and str2.

    The value 999 is used as a "too different" sentinel. It is returned
    when the lengths differ by more than 10%, when the early check at
    row 30 finds d[29][29] > 20, or when the final cell was never
    reached inside the beam. Two empty strings have distance 0.
    """
    l1 = len(str1)
    l2 = len(str2)

    # Exit if the string lengths differ by more than 10%. This is the
    # integer form of "l1 / l2 outside [0.9, 1.1]"; it avoids a
    # ZeroDivisionError when str2 is empty.
    if l1 * 10 < l2 * 9 or l1 * 10 > l2 * 11:
        return 999

    # Compute a beam width of 10% of the average length - the path
    # through the matrix cannot go outside the main diagonal +/- the
    # beam. Floor division keeps the result identical on Python 2 and 3.
    beam = int(0.5 + 0.1 * ((l1 + l2) // 2))

    # Create a 2D array; cells outside the beam keep the 999 sentinel.
    d = [[999] * (l2 + 1) for _ in range(l1 + 1)]

    # Initialize the first row and column
    for i in range(l1 + 1):
        d[i][0] = i
    for j in range(l2 + 1):
        d[0][j] = j

    # Dynamic programming
    for i in range(1, l1 + 1):
        # Quick short-circuit after 30 rows; stop if things are
        # looking really bad
        if i == 30 and l2 >= 30 and d[29][29] > 20:
            return 999
        for j in range(max(1, i - beam), min(l2 + 1, i + beam + 1)):
            cost = int(str1[i - 1] != str2[j - 1])
            d[i][j] = min(
                d[i - 1][j] + 1,        # deletion
                d[i][j - 1] + 1,        # insertion
                d[i - 1][j - 1] + cost  # substitution
            )

    return d[l1][l2]
# Take a string and format it on a bunch of separate lines in quotes
def quote(str):
    """Format a string as one or more double-quoted lines.

    The text is broken after every escaped newline (the two characters
    backslash + n); each piece is wrapped in double quotes and ends
    with a real newline. The empty string becomes a single '""' line.
    """
    if str == "":
        return '""\n'

    # Every piece except the last was followed by an escaped newline.
    pieces = str.split("\\n")
    tail = pieces.pop()

    quoted = ['"%s\\n"\n' % piece for piece in pieces]
    if tail:
        quoted.append('"%s"\n' % tail)
    return "".join(quoted)
# Take a bunch of separate lines in quotes and turn them into a single string
def unquote(str):
    """Join a run of double-quoted lines into a single string.

    Each line is stripped of surrounding whitespace; lines shorter than
    two characters are ignored. The text between the outer quotes of
    every remaining line is concatenated and returned.

    If a line is not wrapped in double quotes, the offending line is
    reported on stderr and the program exits with status 1.
    """
    parts = []
    for line in str.split("\n"):
        line = line.strip()
        if len(line) < 2:
            continue
        if line[0] == '"' and line[-1] == '"':
            parts.append(line[1:-1])
        else:
            # Report on stderr and exit non-zero so the failure is
            # visible to callers; a bare sys.exit() reports success.
            sys.stderr.write("Error with:\n")
            sys.stderr.write("**%s**\n" % line)
            sys.exit(1)
    return "".join(parts)
# Parse one file in the .po / .pot format, returning a hash of all
# msgids and a list of all msgids in order.
def parse(fname):
    """Parse one file in the .po / .pot format.

    Returns a tuple (h, l). h maps each unquoted msgid to an entry
    object with the attributes comments, msgid, msgstr, fuzzy and
    line_no; l lists the msgids in the order they appear in the file.

    Entries are separated by blank lines. line_no is the number of the
    blank line that closed the entry. Only "msgid " and "msgstr " lines
    are recognised as keywords; any other non-comment line is treated
    as a continuation of the msgstr (if one has started) or the msgid.

    If the same msgid appears twice, the problem is reported on stderr
    and the program exits with status 1.
    """
    h = {}
    l = []
    msgid = ""
    msgstr = ""
    comments = ""
    fuzzy = False
    first = True
    line_no = 0

    # Read the lines of the file (closing it promptly) and make sure
    # the input always ends in a blank line, so the last entry is
    # flushed by the loop below.
    with open(fname) as fp:
        lines = fp.readlines()
    lines.append("\n")

    for line in lines:
        # Handle DOS line endings
        if line.endswith("\r\n"):
            line = line[:-2] + "\n"
        line_no += 1

        if line == "\n":
            if len(msgid) == 0 and not first:
                # We found a blank line or comments in the middle of
                # nowhere; drop whatever was collected. The very first
                # blank line is exempt and always closes an entry.
                comments = ""
                fuzzy = False
                msgstr = ""
                continue

            # Otherwise, a blank line in the middle of the file
            # signifies the end of a translation
            msgid = unquote(msgid)
            msgstr = unquote(msgstr)
            if msgid in h:
                sys.stderr.write("Duplicate msgid in %s:\n" % (fname))
                sys.stderr.write(quote(msgid) + "\n")
                sys.stderr.write(
                    "Found on line %d, previously defined on line %d\n" %
                    (line_no, h[msgid].line_no))
                # Non-zero status: a bare sys.exit() reports success.
                sys.exit(1)

            o = obj()
            o.comments = comments
            o.msgid = msgid
            o.msgstr = msgstr
            o.fuzzy = fuzzy
            o.line_no = line_no
            h[msgid] = o
            l.append(msgid)

            # Reset the accumulators for the next entry
            comments = ""
            msgstr = ""
            msgid = ""
            fuzzy = False
            first = False
        elif line.startswith("#, fuzzy"):
            fuzzy = True
            comments += line
        elif line.startswith("#"):
            comments += line
        elif len(line) > 6 and line.startswith("msgid "):
            msgid += line[6:]
        elif len(line) > 7 and line.startswith("msgstr "):
            msgstr += line[7:]
        else:
            # Continuation line: it belongs to the msgstr once one has
            # been started, otherwise to the msgid.
            if len(msgstr):
                msgstr += line
            else:
                msgid += line

    return (h, l)
# Load the existing translations (def) and the latest template (ref).
(def_h, def_l) = parse(def_filename)
(ref_h, ref_l) = parse(ref_filename)

# Handle the exact matches: a ref msgid that already exists in def
# keeps its def entry unchanged.
final_h = {}
for msgid in ref_l:
    if msgid in def_h:
        final_h[msgid] = def_h[msgid]

# Candidate def entries for fuzzy matching. The list does not depend on
# the ref string being matched (only ref-only msgids are added to
# final_h below), so it is built once rather than per ref string.
# Entries with an empty msgid or with a translation shorter than 3
# characters are skipped.
fuzzy_candidates = [x for x in def_l
                    if x and x not in final_h
                    and len(def_h[x].msgstr) >= 3]

# Try for fuzzy matches
for ref_msgid in [x for x in ref_l if x not in final_h]:
    # Find the def msgid closest to this ref msgid; 999 means
    # "no usable candidate".
    min_ed = 999
    min_msgid = None
    for def_msgid in fuzzy_candidates:
        ed = edit_distance(ref_msgid, def_msgid)
        if ed < min_ed:
            min_ed = ed
            min_msgid = def_msgid
    if min_msgid is None:
        continue

    # Accept only a one-character difference, or a difference of at
    # most 4% of the shorter string.
    pct = min_ed * 100.0 / min(len(ref_msgid), len(min_msgid))
    if min_ed == 1 or pct <= 4.0:
        # Abbreviate long strings for the progress report
        refstr = ref_msgid
        if len(refstr) > 40:
            refstr = refstr[:37] + "..."
        minstr = min_msgid
        if len(minstr) > 40:
            minstr = minstr[:37] + "..."
        print("Found fuzzy match:")
        print(" %s" % refstr)
        print(" %s" % minstr)
        print(" def_len=%d, ref_len=%d, edit_distance=%d" %
              (len(min_msgid), len(ref_msgid), min_ed))

        # New entry: msgid and comments from ref, translation from the
        # matched def entry, marked fuzzy.
        o = obj()
        o.msgid = ref_msgid
        o.comments = ref_h[ref_msgid].comments
        o.msgstr = def_h[min_msgid].msgstr
        o.fuzzy = True
        final_h[ref_msgid] = o
# Generate output file
translated = 0
fuzzy = 0
empty = 0

# The with-block guarantees the output is flushed and closed before
# the "Wrote output" message is printed, even if writing fails.
with open(out_filename, "w") as out_fp:
    for msgid in ref_l:
        if msgid in final_h:
            o = final_h[msgid]
            if o.fuzzy:
                fuzzy += 1
            elif msgid != "":
                # NOTE(review): an exact match whose msgstr is empty is
                # still counted as translated here - confirm intent.
                translated += 1
        else:
            # No exact or fuzzy match: emit the ref entry as-is
            o = ref_h[msgid]
            empty += 1
        msgstr = o.msgstr

        # Fix leading and trailing newlines (the escaped two-character
        # sequence backslash + n) so msgstr agrees with msgid.
        if len(msgid) > 4 and len(msgstr) > 4:
            # Add newline if missing
            if msgid[:2] == "\\n" and msgstr[:2] != "\\n":
                msgstr = "\\n" + msgstr
            if msgid[-2:] == "\\n" and msgstr[-2:] != "\\n":
                msgstr = msgstr + "\\n"
            # Remove newline if extraneous
            if msgid[:2] != "\\n" and msgstr[:2] == "\\n":
                msgstr = msgstr[2:]
            if msgid[-2:] != "\\n" and msgstr[-2:] == "\\n":
                msgstr = msgstr[:-2]

        # Write the entry
        out_fp.write(o.comments)
        # Add a fuzzy flag line only if the comments do not already
        # mention "fuzzy".
        if o.fuzzy and "fuzzy" not in o.comments:
            out_fp.write("#, fuzzy\n")
        out_fp.write("msgid " + quote(msgid))
        out_fp.write("msgstr " + quote(msgstr))
        out_fp.write("\n")

# Print stats
print("Translated: %d Fuzzy: %d Empty: %d" % (translated, fuzzy, empty))
print("Wrote output to %s" % out_filename)