Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a script to build DSL dictionaries #44

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions LANGUAGES.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
ar Arabic Modern Standard
de German
en_UK English Received Pronunciation
en_US English General American
eo Esperanto
es_ES Spanish Spain
es_MX Spanish Mexico
fa Persian
fi Finnish
fr_FR French France
fr_QC French Québec
is Icelandic
ja Japanese
jam Jamaican Creole
km Khmer
ko Korean
ma Malay Malaysian and Indonesian
nb Norwegian Bokmål
nl Dutch
or Odia
ro Romanian
sv Swedish
sw Swahili
tts Isan
vi_C Vietnamese Central
vi_N Vietnamese Northern
vi_S Vietnamese Southern
yue Cantonese
zh_hans Mandarin Simplified
zh_hant Mandarin Traditional
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ vi_C | Vietnamese (Central)
vi_N | Vietnamese (Northern)
vi_S | Vietnamese (Southern)
yue | Cantonese
zh | Mandarin
zh_hans | Mandarin (Simplified)
zh_hant | Mandarin (Traditional)

## Applications

Expand Down
105 changes: 105 additions & 0 deletions build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env python3

import argparse
import os
import sys

def overwrite_maybe(filename, data):
    """Write *data* to *filename* unless the file already holds exactly that.

    The new contents are first written to ``{filename}.tmp`` and then moved
    over the target, so the target is replaced in one step rather than
    being rewritten in place. When the existing contents already equal
    *data* nothing is written and no message is printed.

    Args:
        filename: path of the file to create or update.
        data: the complete desired contents, as bytes.
    """
    try:
        with open(filename, "rb") as f:
            if f.read() == data:
                return
    except FileNotFoundError:
        # No previous version of the file: fall through and create it.
        pass

    sys.stderr.write(f"writing {filename}\n")
    tmp_name = f"{filename}.tmp"
    with open(tmp_name, "wb") as f:
        f.write(data)

    # os.replace overwrites an existing destination on every platform,
    # whereas os.rename raises FileExistsError on Windows in that case.
    os.replace(tmp_name, filename)

# Table of known languages, keyed by language code. Each value is a
# (name, variant) tuple; variant is "" when the row has no third column.
language = {}
filename = "LANGUAGES.tsv"
# The table holds non-ASCII names (e.g. "Québec"), so decode it explicitly
# instead of relying on the platform's locale encoding.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open(filename, "r", encoding="utf-8") as f:
    for lineno, line in enumerate(f, start=1):
        try:
            if line[-1:] == "\n":
                line = line[:-1]
            # Split on every tab: a row with more than three columns must be
            # rejected rather than having the surplus folded into the variant.
            code, name, *rest = line.split("\t")
            if len(rest) > 1:
                raise ValueError("can't parse LANGUAGES.tsv")
            language[code] = name, (rest[0] if rest else "")
        except ValueError:
            # Say which row was malformed, then let the error propagate.
            sys.stderr.write(f"while processing line {lineno} of {filename}\n")
            raise

def load_ipa(code):
    """Load the word-to-transcription table for one language.

    Reads ``data/{code}.txt`` relative to the current working directory.
    Every line must consist of exactly two tab-separated fields: the word
    and its transcription.

    Args:
        code: language code; selects the data file to read.

    Returns:
        A dict mapping each word to its transcription. When a word occurs
        on several lines, the last occurrence wins.

    Raises:
        ValueError: a line does not have exactly two tab-separated fields.
            The offending line number is reported on stderr first.
        OSError: the data file cannot be opened (e.g. FileNotFoundError).
    """
    result = {}
    filename = os.path.join("data", f"{code}.txt")
    # Transcriptions are IPA text, so decode explicitly rather than with the
    # platform's locale encoding.
    # NOTE(review): assumes the data files are stored as UTF-8 - confirm.
    with open(filename, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            try:
                if line[-1:] == "\n":
                    line = line[:-1]
                word, transcription = line.split("\t")
                result[word] = transcription
            except ValueError:
                # Say which line was malformed, then let the error propagate.
                sys.stderr.write(f"while processing line {lineno} of {filename}\n")
                raise
    return result

def write_dsl(code, name, description, ipa):
    """Render one language's dictionary as DSL and write it to disk.

    The output starts with the ``#NAME``, ``#INDEX_LANGUAGE`` and
    ``#CONTENTS_LANGUAGE`` header lines, followed by one card per entry of
    *ipa*. It is written to ``dsl_utf8/{code}_ipa.dsl`` and
    ``dsl_utf16/{code}_ipa.dsl`` via ``overwrite_maybe``.

    Args:
        code: language code, used in the dictionary title and file name.
        name: language name, used for both language header lines.
        description: human-readable label used in the dictionary title.
        ipa: mapping of word to transcription; iterated in its own order.
    """
    # Collect the pieces and join once instead of growing a string with +=
    # for every dictionary entry.
    pieces = [
        f'#NAME "IPA Dictionary - {code}: {description}"\n',
        f'#INDEX_LANGUAGE "{name}"\n',
        f'#CONTENTS_LANGUAGE "{name}"\n\n',
    ]
    pieces.extend(
        f"{word}\n\t[m1]{transcription}[/m]\n\n"
        for word, transcription in ipa.items()
    )
    output = "".join(pieces)

    # Write DSL files both in UTF8 and UTF16, because some software requires
    # them to be encoded in the latter.
    for enc in ["utf8", "utf16"]:
        out_dir = f"dsl_{enc}"
        data = bytes(output, enc)
        os.makedirs(out_dir, exist_ok=True)
        overwrite_maybe(os.path.join(out_dir, f"{code}_ipa.dsl"), data)

# Command-line interface: optional target flags plus an optional list of
# language codes. With no target flag every target is built; with no codes
# every language in the table is processed.
parser = argparse.ArgumentParser(prog="build", description="build stuff from data files", add_help=True)
targets = parser.add_argument_group(title="targets", description="build targets (default: all)")
targets.add_argument("--dsl", dest="cmds", action="append_const", const="dsl", help="build DSL dictionaries")
parser.add_argument("codes", metavar="CODE", nargs="*", type=str, help="language codes to process (default: all)")

# parse_args() reads sys.argv[1:] by default.
args = parser.parse_args()

# args.cmds is None when no target flag was given; "dsl" is the only target.
cmds = args.cmds or ["dsl"]
codes = args.codes or list(language)

# Reject unknown codes before doing any work, with a usage-style message
# instead of a KeyError/FileNotFoundError traceback part-way through a build.
unknown = [code for code in codes if code not in language]
if unknown:
    parser.error(f"unknown language code(s): {', '.join(unknown)}")

for code in codes:
    name, variant = language[code]
    # The description is the language name, with the variant in parentheses
    # when the table provides one.
    description = f"{name} ({variant})" if variant else name
    ipa = load_ipa(code)

    for cmd in cmds:
        if cmd == "dsl":
            write_dsl(code, name, description, ipa)
        else:
            raise NotImplementedError(cmd)