locale/smartmsgmerge.py

#!/usr/bin/python
#
# smartmsgmerge.py
#
# Written by Dominic Mazzoni, 2006
# GNU General Public License 2.0
#
# This is a replacement for the GNU gettext "msgmerge" program, which
# is typically used to update a .po file (def) to the latest .pot file
# (ref).  This program is not command-line compatible; it takes no
# flags but simply the def, ref, and output file names.
#
# It uses a much faster and also much stricter policy for finding new
# fuzzy matches: the edit-distance must be no more than 4%, or for very
# short strings, no more than 1 character.
#
# This makes it safe for you to enable fuzzy strings in your .mo file
# without worrying that they'll be too terrible.
#
# It also fixes translations where the beginning and ending newlines
# do not match the original string.
#

import sys, os

# Exactly three arguments are required; anything else is a usage error.
if len(sys.argv) != 4:
  sys.stderr.write("Usage: %s def.po ref.pot out.po\n" % sys.argv[0])
  # Exit with a nonzero status so calling scripts and makefiles notice
  # that nothing was merged.
  sys.exit(1)

# def = the .po file being updated, ref = the latest .pot file,
# out = where the merged result is written.
def_filename = sys.argv[1]
ref_filename = sys.argv[2]
out_filename = sys.argv[3]

# Record type for a single catalog entry.  Instances start out with no
# fields at all; the code that creates one attaches the comments, msgid
# (untranslated), msgstr (translated), and a fuzzy flag as plain
# attributes, so nothing here is typechecked.
class obj:
  """Dummy dynamic container; attributes are assigned by its users."""

# Compute the edit-distance between str1 and str2, taking a couple
# of shortcuts such that it returns 999 quickly if the edit-distance
# is clearly not going to be less than 10 percent.
#
# str1, str2: the two strings to compare.
# Returns the number of single-character insertions, deletions and
# substitutions found inside the beam, or 999 as a "too different"
# sentinel.  Two empty strings compare as distance 0.
def edit_distance(str1, str2):
  l1 = len(str1)
  l2 = len(str2)

  # Guard the ratio test below against dividing by zero: two empty
  # strings are identical, and no non-empty string is close to "".
  if l2 == 0:
    if l1 == 0:
      return 0
    return 999

  # Exit if the difference in the string lengths is 10% or more
  ratio = l1 * 1.0 / l2
  if ratio < 0.9 or ratio > 1.1:
    return 999

  # Compute a beam width of 10% of the mean length (rounded) - the path
  # through the matrix cannot go outside the main diagonal +/- the beam.
  # Floor division keeps the mean an integer on every interpreter.
  beam = int(0.5 + 0.1 * ((l1 + l2) // 2))

  # Create a 2D array; cells that are never visited keep the value 999
  d = [[999] * (l2 + 1) for _ in range(l1 + 1)]

  # Initialize the first row and column
  for i in range(l1 + 1):
    d[i][0] = i
  for j in range(l2 + 1):
    d[0][j] = j

  # Dynamic programming
  for i in range(1, l1 + 1):
    # Quick short-circuit after 30 rows; stop if things are
    # looking really bad
    if i == 30 and l2 >= 30 and d[29][29] > 20:
      return 999
    # Only the columns within the beam around the diagonal are filled in
    for j in range(max(1, i - beam), min(l2 + 1, i + beam + 1)):
      if str1[i - 1] == str2[j - 1]:
        cost = 0
      else:
        cost = 1
      d[i][j] = min(
                    d[i - 1][j] + 1,       # deletion
                    d[i][j - 1] + 1,       # insertion
                    d[i - 1][j - 1] + cost # substitution
                   )
  return d[l1][l2]

# Render a string as one or more double-quoted lines, each terminated
# by a real newline.  The text is broken after every literal
# backslash-n escape sequence; an empty string becomes a single "".
def quote(str):
  text = str
  if text == "":
    return "\"\"\n"
  # Everything before the last escape sequence forms complete lines;
  # whatever follows it (possibly nothing) is the unterminated tail.
  pieces = text.split("\\n")
  tail = pieces.pop()
  rendered = ["\"%s\\n\"\n" % piece for piece in pieces]
  if tail:
    rendered.append("\"%s\"\n" % tail)
  return "".join(rendered)

# Take a bunch of separate lines in quotes and turn them into a single string.
#
# str: newline-separated text in which each significant line is wrapped
#      in double quotes.  Lines shorter than two characters (after
#      stripping whitespace) are ignored.
# Returns the concatenation of the text inside the quotes.
# A line that is not wrapped in double quotes is reported on stderr and
# the program exits with status 1.
def unquote(str):
  parts = []
  for line in str.split("\n"):
    line = line.strip()
    if len(line) < 2:
      continue
    if line[0] != '"' or line[-1] != '"':
      # Malformed input: fail loudly, with a nonzero status so callers
      # can tell the merge did not happen.
      sys.stderr.write("Error with:\n")
      sys.stderr.write("**%s**\n" % line)
      sys.exit(1)
    parts.append(line[1:-1])
  return "".join(parts)

# Parse one file in the .po / .pot format, returning a hash of all
# msgids and a list of all msgids in order.
#
# fname: path of the file to read.
# Returns (h, l): h maps each unquoted msgid to an obj carrying
# comments, msgid, msgstr, fuzzy and line_no (the line of the blank
# line that ended the entry); l lists the msgids in file order.
# A duplicate msgid is reported on stderr and the program exits with
# status 1.
def parse(fname):
  h = {}
  l = []
  msgid = ""
  msgstr = ""
  comments = ""
  fuzzy = False
  first = True
  line_no = 0

  # Read the lines of the file, closing it even if reading fails, and
  # make sure it always ends in a blank line so the final entry is
  # flushed by the loop below.
  fp = open(fname)
  try:
    lines = fp.readlines()
  finally:
    fp.close()
  lines.append("\n")

  for line in lines:
    # Handle DOS line endings
    if len(line) >= 2 and line[-2] == '\r' and line[-1] == '\n':
      line = line[:-2] + '\n'

    line_no += 1
    if line == "\n":
      if len(msgid) == 0 and not first:
        # We found a blank line or comments in the middle of nowhere
        comments = ""
        fuzzy = False
        msgstr = ""
        continue
      # Otherwise, a blank line in the middle of the file
      # signifies the end of a translation
      msgid = unquote(msgid)
      msgstr = unquote(msgstr)
      if msgid in h:
        sys.stderr.write("Duplicate msgid in %s:\n" % (fname))
        sys.stderr.write(quote(msgid) + "\n")
        sys.stderr.write(
          "Found on line %d, previously defined on line %d\n" %
          (line_no, h[msgid].line_no))
        # Nonzero status: the input is unusable and nothing was merged.
        sys.exit(1)
      o = obj()
      o.comments = comments
      o.msgid = msgid
      o.msgstr = msgstr
      o.fuzzy = fuzzy
      o.line_no = line_no
      h[msgid] = o
      l.append(msgid)
      # Reset the accumulators for the next entry
      comments = ""
      msgstr = ""
      msgid = ""
      fuzzy = False
      first = False
    elif len(line) >= 8 and line[:8] == "#, fuzzy":
      # The flag line is kept in the comments as well as recorded
      fuzzy = True
      comments += line
    elif line[0] == '#':
      comments += line
    elif len(line) > 6 and line[:6] == "msgid ":
      msgid += line[6:]
    elif len(line) > 7 and line[:7] == "msgstr ":
      msgstr += line[7:]
    else:
      # Continuation line: it belongs to msgstr once one has been
      # started, and to msgid otherwise
      if len(msgstr):
        msgstr += line
      else:
        msgid += line
  return (h, l)

# Load the existing translations (def) and the up-to-date template (ref)
(def_h, def_l) = parse(def_filename)
(ref_h, ref_l) = parse(ref_filename)

# Handle the exact matches
final_h = {}
for msgid in ref_l:
  if msgid in def_h:
    final_h[msgid] = def_h[msgid]

# Try for fuzzy matches.  Only def entries that were not matched exactly
# and that carry a translation of at least 3 characters are candidates.
# Fuzzy matching only ever adds ref msgids that are absent from def, so
# the candidate list cannot change and is built once up front.
unmatched_refs = [x for x in ref_l if x not in final_h]
candidates = [x for x in def_l
              if x not in final_h and len(def_h[x].msgstr) >= 3]
for ref_msgid in unmatched_refs:
  min_ed = 999
  min_msgid = None
  for def_msgid in candidates:
    ed = edit_distance(ref_msgid, def_msgid)
    if ed < min_ed:
      min_ed = ed
      min_msgid = def_msgid
  if min_msgid is None:
    continue
  # Accept a single-character difference, or at most 4% of the shorter
  # of the two strings
  pct = min_ed * 100.0 / min(len(ref_msgid), len(min_msgid))
  if min_ed == 1 or pct <= 4.0:
    # Truncate long strings so the report stays readable
    refstr = ref_msgid
    if len(refstr) > 40:
      refstr = refstr[:37] + "..."
    minstr = min_msgid
    if len(minstr) > 40:
      minstr = minstr[:37] + "..."
    print("Found fuzzy match:")
    print("   %s" % refstr)
    print("   %s" % minstr)
    print("     def_len=%d, ref_len=%d, edit_distance=%d" %
          (len(min_msgid), len(ref_msgid), min_ed))
    # New entry: msgid and comments from ref, translation borrowed from
    # the closest def entry, flagged fuzzy
    o = obj()
    o.msgid = ref_msgid
    o.comments = ref_h[ref_msgid].comments
    o.msgstr = def_h[min_msgid].msgstr
    o.fuzzy = True
    final_h[ref_msgid] = o

# Generate output file
translated = 0
fuzzy = 0
empty = 0
out_fp = open(out_filename, "w")
try:
  for msgid in ref_l:
    if msgid in final_h:
      o = final_h[msgid]
      # NOTE(review): an exact match whose msgstr is empty is still
      # counted as translated here - confirm that is intended.
      if o.fuzzy:
        fuzzy += 1
      elif msgid != "":
        translated += 1
    else:
      o = ref_h[msgid]
      empty += 1

    msgstr = o.msgstr
    # Fix leading and trailing newlines (the two-character escape
    # sequence, not a real newline); very short strings are left alone
    if len(msgid) > 4 and len(msgstr) > 4:
      # Add newline if missing
      if msgid[:2] == "\\n" and msgstr[:2] != "\\n":
        msgstr = "\\n" + msgstr
      if msgid[-2:] == "\\n" and msgstr[-2:] != "\\n":
        msgstr = msgstr + "\\n"

      # Remove newline if extraneous
      if msgid[:2] != "\\n" and msgstr[:2] == "\\n":
        msgstr = msgstr[2:]
      if msgid[-2:] != "\\n" and msgstr[-2:] == "\\n":
        msgstr = msgstr[:-2]

    # Write the entry
    out_fp.write(o.comments)
    # Add the flag line only if the comments do not already mention it
    if o.fuzzy and "fuzzy" not in o.comments:
      out_fp.write("#, fuzzy\n")
    out_fp.write("msgid " + quote(msgid))
    out_fp.write("msgstr " + quote(msgstr))
    out_fp.write("\n")
finally:
  # Always close the file so the output is flushed to disk
  out_fp.close()

# Print stats
print("Translated: %d Fuzzy: %d Empty: %d" % (translated, fuzzy, empty))
print("Wrote output to %s" % out_filename)