From 5a4cda14cb7e4e1afa6da6f44fb4b485af311bae Mon Sep 17 00:00:00 2001
From: Aaron Stacy
Date: Tue, 23 Aug 2016 10:30:50 -0500
Subject: [PATCH] Add tool for managing string translations

Since we've got more than just the standard strings in resource files and it
would be beneficial to involve RST in the translation process, I've cooked up
a script to copy/paste strings to/from a Google Sheet. This should make it
easier to solicit strings from folks at RST, and then add them back into the
codebase.
---
 bin/strings_to_csv.py |  84 ----------------
 bin/stringtool.py     | 225 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 225 insertions(+), 84 deletions(-)
 delete mode 100755 bin/strings_to_csv.py
 create mode 100755 bin/stringtool.py

diff --git a/bin/strings_to_csv.py b/bin/strings_to_csv.py
deleted file mode 100755
index 1eb0a42..0000000
--- a/bin/strings_to_csv.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env python
-
-import collections, csv, itertools, json, os, re, sys
-import xml.etree.ElementTree
-
-TRANSLATABLE_FIELDS = ('name', 'description')
-DEFAULT_LANGUAGE = 'en'
-LOCALES = ('uk', 'ru')
-STRINGS_FILE_RE = re.compile(r'^strings\.xml$')
-
-
-def encode_tuple((k, v), encoding='utf-8'):
-    return (k.encode(encoding), v.encode(encoding) if v != None else v)
-
-
-def encode_dict(d, encoding='utf-8'):
-    return dict(map(encode_tuple, d.iteritems()))
-
-
-def db_translations():
-    db = json.load(open('app/testdata/checklistappdev-export.json'))
-    for index, item in enumerate(db['checklists']['basic']):
-        for field in TRANSLATABLE_FIELDS:
-            record = {
-                'location': 'db[{}]'.format(index),
-                'field': field,
-                DEFAULT_LANGUAGE: item[field]
-            }
-            for locale in LOCALES:
-                record[locale] = item.get('alt', {}).get(locale, {}).get(field, None)
-            yield record
-
-
-def strings_xml_filenames():
-    for (directory, _, files) in os.walk('app/src/main/res'):
-        for file_ in files:
-            if STRINGS_FILE_RE.match(file_):
-                yield os.path.join(directory, file_)
-
-
-def language_from_filename(filename):
-    values_dir = os.path.split(os.path.dirname(filename))[1]
-    try:
-        return values_dir.rsplit('-')[1]
-    except IndexError:
-        return DEFAULT_LANGUAGE
-
-
-def filename_without_locale(filename):
-    res_dir = os.path.split(os.path.dirname(filename))[0]
-    return os.path.join(res_dir, 'values', os.path.basename(filename))
-
-
-def language_translation_from_filename(filename):
-    language = language_from_filename(filename)
-    for event, element in xml.etree.ElementTree.iterparse(filename):
-        if element.tag == 'string':
-            yield {
-                'location': filename_without_locale(filename),
-                'field': element.attrib['name'],
-                language: element.text,
-            }
-
-
-def code_translations():
-    translation_map = collections.defaultdict(lambda: {})
-    for filename in strings_xml_filenames():
-        for language_translation in language_translation_from_filename(filename):
-            key = language_translation['location'] + language_translation['field']
-            translation_map[key].update(language_translation)
-    return translation_map.values()
-
-
-def translations_to_csv(translations, outfile):
-    fieldnames = ('location', 'field', DEFAULT_LANGUAGE) + LOCALES
-    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
-    writer.writeheader()
-    for translation in translations:
-        writer.writerow(encode_dict(translation))
-
-
-if __name__ == '__main__':
-    translations_to_csv(itertools.chain(code_translations(), db_translations()),
-                        outfile=sys.stdout)
diff --git a/bin/stringtool.py b/bin/stringtool.py
new file mode 100755
index 0000000..a2f64a3
--- /dev/null
+++ b/bin/stringtool.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""\
+Tool for converting string resources to/from CSV. This app has strings that
+need to be translated in 2 locations:
+
+- The Firebase database (mirrored in app/testdata/checklistappdev-export.json)
+- The strings.xml resource files
+
+As a general process for soliciting translations and updating the app, we
+would like to be able to copy all of these to a Google sheet, where
+translators can input translations, and then back to the source code. This
+tool handles both directions with a minimal diff footprint to make reviewing
+easier.
+
+Examples:
+
+    # Output all of the strings to a CSV for pasting into Google Sheets
+    $ stringtool.py tocsv > strings.csv
+
+    # Given a CSV from Google Sheets, update this repository with missing strings
+    $ cat strings.csv | stringtool.py tostrings
+"""
+
+import argparse, collections, copy, csv, HTMLParser, itertools, json, os, re, sys
+import xml.dom.minidom
+import xml.etree.ElementTree
+
+JSON_DB_FILENAME = 'app/testdata/checklistappdev-export.json'
+TRANSLATABLE_FIELDS = ('name', 'description')
+DEFAULT_LANGUAGE = 'en'
+LOCALES = ('uk', 'ru', 'ar', 'es')
+STRINGS_FILE_RE = re.compile(r'^strings\.xml$')
+
+
+def encode_tuple((k, v), encoding='utf-8'):
+    return (k.encode(encoding), v.encode(encoding) if v is not None else v)
+
+
+def encode_dict(d, encoding='utf-8'):
+    return dict(map(lambda t: encode_tuple(t, encoding=encoding), d.iteritems()))
+
+
+def decode_tuple((k, v), encoding='utf-8'):
+    return (k.decode(encoding), v.decode(encoding) if v is not None else v)
+
+
+def decode_dict(d, encoding='utf-8'):
+    return dict(map(lambda t: decode_tuple(t, encoding=encoding), d.iteritems()))
+
+
+def db_translations():
+    with open(JSON_DB_FILENAME) as infile:
+        db = json.load(infile)
+    for index, item in enumerate(db['checklists']['basic']):
+        for field in TRANSLATABLE_FIELDS:
+            record = {
+                'location': 'db',
+                'field': '[{}]{}'.format(index, field),
+                DEFAULT_LANGUAGE: item[field]
+            }
+            for locale in LOCALES:
+                record[locale] = item.get('alt', {}).get(locale, {}).get(field, None)
+            yield record
+
+
+def strings_xml_filenames():
+    for (directory, _, files) in os.walk('app/src/main/res'):
+        for file_ in files:
+            if STRINGS_FILE_RE.match(file_):
+                yield os.path.join(directory, file_)
+
+
+def language_from_filename(filename):
+    values_dir = os.path.split(os.path.dirname(filename))[1]
+    try:
+        return values_dir.rsplit('-')[1]
+    except IndexError:
+        return DEFAULT_LANGUAGE
+
+
+def filename_without_locale(filename):
+    """Something like 'res/values-ru/strings.xml' → 'res/values/strings.xml'"""
+    res_dir = os.path.split(os.path.dirname(filename))[0]
+    return os.path.join(res_dir, 'values', os.path.basename(filename))
+
+
+def translation_to_filename_with_locale(translation, locale):
+    """Given a translation object and a locale, return the locale-specific
+    path, i.e. res/values-{LOCALE}/strings.xml"""
+    location = translation['location']
+    res_dir = os.path.split(os.path.dirname(location))[0]
+    values_dir = 'values' if locale == DEFAULT_LANGUAGE else 'values-' + locale
+    return os.path.join(res_dir, values_dir, os.path.basename(location))
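+
+
+# To illustrate the shape of the records the next function yields (the file
+# path and its contents here are hypothetical): parsing a file
+# 'app/src/main/res/values-ru/strings.xml' that contains
+# <string name="title">Привет</string> would yield
+#
+#     {'location': 'app/src/main/res/values/strings.xml',
+#      'field': 'title',
+#      'ru': u'Привет'}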
+def language_translation_from_filename(filename):
+    language = language_from_filename(filename)
+    for event, element in xml.etree.ElementTree.iterparse(filename):
+        if element.tag == 'string':
+            # a <string> element may contain markup elements, so we can't just
+            # take the 'text'. beyond that, ~awesome~ etree will html escape
+            # non-ascii stuff, so we've got to decode the result.
+            text = u'' + (element.text or u'')
+            text += u''.join(xml.etree.ElementTree.tostring(e) for e in element)
+            html_parser = HTMLParser.HTMLParser()
+            text = html_parser.unescape(text)
+            yield {
+                'location': filename_without_locale(filename),
+                'field': element.attrib['name'],
+                language: text,
+            }
+
+
+def code_translations():
+    translation_map = collections.defaultdict(lambda: {})
+    for filename in strings_xml_filenames():
+        for language_translation in language_translation_from_filename(filename):
+            key = language_translation['location'] + language_translation['field']
+            translation_map[key].update(language_translation)
+    return translation_map.values()
+
+
+def translations_to_csv(translations, outfile):
+    fieldnames = ('location', 'field', DEFAULT_LANGUAGE) + LOCALES
+    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
+    writer.writeheader()
+    for translation in translations:
+        writer.writerow(encode_dict(translation))
+
+
+def csv_to_translations(infile):
+    return (decode_dict(translation, encoding=infile.encoding or 'utf-8')
+            for translation in csv.DictReader(infile))
+
+
+def group_translations_by_file(translations):
+    file_map = collections.defaultdict(lambda: [])
+    for translation in translations:
+        if translation['location'] == 'db':
+            file_map[translation['location']].append(translation)
+        else:
+            for locale in LOCALES + (DEFAULT_LANGUAGE,):
+                if locale in translation:
+                    filename = translation_to_filename_with_locale(translation, locale)
+                    file_map[filename].append(translation)
+    return dict(file_map.iteritems())
+
+
+def update_db_object_with_translations(db, translations):
+    db = copy.deepcopy(db)
+    checklist = db['checklists']['basic']
+    for translation in translations:
+        raw_field = translation['field']
+        index = int(raw_field.lstrip('[').split(']')[0])
+        field = raw_field.split(']')[1]
+        checklist[index][field] = translation[DEFAULT_LANGUAGE]
+        for locale in LOCALES:
+            if translation.get(locale, '') != '':
+                # items with no prior translations may lack an 'alt' key
+                alt = checklist[index].setdefault('alt', {})
+                if locale not in alt:
+                    alt[locale] = {}
+                alt[locale][field] = translation[locale]
+    return db
+
+
+def update_db_with_translations(translations):
+    with open(JSON_DB_FILENAME) as infile:
+        # preserve key order so the rewritten JSON diffs minimally
+        db = json.load(infile, object_pairs_hook=collections.OrderedDict)
+    db = update_db_object_with_translations(db, translations)
+    with open(JSON_DB_FILENAME, 'w') as outfile:
+        json.dump(db, outfile, separators=(',', ': '), indent=2, encoding='utf-8',
+                  ensure_ascii=False)
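+
+
+# For illustration (the database contents are hypothetical): db_translations()
+# encodes db['checklists']['basic'][3]['name'] as a record like
+#
+#     {'location': 'db', 'field': '[3]name', ...}
+#
+# and update_db_object_with_translations() parses '[3]name' back into list
+# index 3 and field name 'name'.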
+
+
+# a better programmer would break this up into logical pieces of work, but xml
+# is such a bummer, so this is a big hack.
+def update_xml_with_translations(filename, translations):
+    locale = language_from_filename(filename)
+    translation_map = collections.OrderedDict()
+    copyright_notice = None
+    with open(filename) as infile:
+        # collect the existing strings in document order
+        for event, element in xml.etree.ElementTree.iterparse(infile):
+            if element.tag == 'string':
+                text = element.text or ''
+                text += ''.join(xml.etree.ElementTree.tostring(e) for e in element)
+                translation_map[element.attrib['name']] = text
+        # re-read the file to preserve the copyright comment at the top
+        infile.seek(0)
+        copyright_notice = xml.dom.minidom.parse(infile).childNodes[0].toxml()
+    for translation in translations:
+        if translation.get(locale, '') != '':
+            translation_map[translation['field']] = translation[locale]
+    with open(filename, 'w') as outfile:
+        outfile.write(copyright_notice.encode('utf-8') + '\n\n<resources>\n')
+        for key, value in translation_map.iteritems():
+            value = value.encode('utf-8')
+            outfile.write('    <string name="{}">{}</string>\n'.format(key, value))
+        outfile.write('</resources>\n')
+
+
+def strings_to_csv(outfile=sys.stdout):
+    translations_to_csv(itertools.chain(code_translations(), db_translations()),
+                        outfile=outfile)
+
+
+def csv_to_strings(infile=sys.stdin):
+    translations_by_file = group_translations_by_file(csv_to_translations(infile))
+    for filename, translations in translations_by_file.iteritems():
+        if filename == 'db':
+            update_db_with_translations(map(encode_dict, translations_by_file['db']))
+        else:
+            update_xml_with_translations(filename, translations)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('action', choices=('tostrings', 'tocsv'))
+    if parser.parse_args().action == 'tostrings':
+        csv_to_strings()
+    else:
+        strings_to_csv()
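+
+
+# A note on the CSV shape for anyone wiring up the Google Sheet: the header
+# written by tocsv (and expected back by tostrings) follows the fieldnames
+# tuple in translations_to_csv(), i.e.
+#
+#     location,field,en,uk,ru,ar,es
+#
+# so a hypothetical strings.xml row would look like
+#
+#     app/src/main/res/values/strings.xml,app_name,Checklist,,,,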