#!/usr/bin/env python3
#
# Main program for extracting a machine-readable dictionary from a
# Wiktionary dump. This has mostly been used with the English Wiktionary
# (enwiktionary), but should be usable with other language editions as well.
#
# Copyright (c) 2018-2020 Tatu Ylonen. See LICENSE and https://ylonen.org
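#
# Example invocation (a sketch; the dump file name depends on the dump date
# you download, and "data.json" is just an illustrative output name):
#
#   wiktwords --all --all-languages --out data.json \
#       enwiktionary-20200301-pages-articles.xml.bz2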
import os
import re
import sys
import json
import hashlib
import argparse
import collections
from wiktextract import (WiktionaryConfig, parse_wiktionary,
                         wiktionary_languages, parse_page)

# Pages whose titles have any of these prefixes are ignored.
ignore_prefixes = set(["Index", "Help", "MediaWiki", "Citations",
                       "Reconstruction", "Concordance",
                       "Rhymes", "Thread",
                       "Summary", "File",
                       "Transwiki",
                       ])
# Pages with these prefixes are captured.
recognized_prefixes = set(["Category", "Appendix", "Wiktionary",
                           "Thesaurus", "Module", "Template"])

def capture_page(title, text, pages_dir):
    """Checks if the page needs special handling (and maybe saving).
    Returns True if the page should be processed normally as a
    dictionary entry."""
    assert isinstance(title, str)
    assert isinstance(text, str)
    assert pages_dir is None or isinstance(pages_dir, str)
    analyze = False
    m = re.match(r"^([A-Z][a-z][-a-zA-Z0-9_]+):(.+)$", title)
    if not m:
        # No namespace prefix: treat the page as an ordinary dictionary
        # entry. Very long titles are truncated and disambiguated with a
        # hash so that they remain usable as file names under --pages-dir.
        if len(title) > 100:
            h = hashlib.sha256()
            h.update(title.encode("utf-8"))
            title = title[:100] + "-" + h.hexdigest()[:10]
        title = "Words:" + title[:2] + "/" + title
        analyze = True
    else:
        prefix, tail = m.groups()
        if prefix in ignore_prefixes:
            analyze = False
        elif prefix not in recognized_prefixes:
            print("UNRECOGNIZED PREFIX", title)
        # if prefix == "Category":
        #     m = re.match(r"^(Category:[^_ :]+)[_ :]*(.*)", title)
        #     if m:
        #         title = m.group(1) + ":" + m.group(2)
        #     # XXX Capture some?
    if pages_dir is not None:
        # Sanitize the title into a safe relative path and save the raw
        # wikitext under pages_dir.
        title = re.sub(r"[^-\w_.:/]", "_", title)
        title = re.sub(r":", "/", title)
        path = pages_dir + "/" + title + ".txt"
        path = re.sub(r"/\.+", "/", path)
        path = re.sub(r"//+", "/", path)
        dirpath = os.path.dirname(path)
        os.makedirs(dirpath, exist_ok=True)
        with open(path, "w") as f:
            f.write(text)
    return analyze
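
# For illustration (behavior inferred from capture_page above; the titles are
# hypothetical): a plain entry such as "dog" gets analyze=True and, with
# --pages-dir, is saved as "<pages_dir>/Words/do/dog.txt"; a page titled
# "Help:FAQ" matches ignore_prefixes and gets analyze=False.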

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Multilingual Wiktionary data extractor")
    parser.add_argument("path", type=str, nargs="?", default=None,
                        help="Input file (.../enwiktionary-<date>-"
                        "pages-articles.xml[.bz2])")
    parser.add_argument("--out", type=str, default=None,
                        help="Path where to write output (- for stdout)")
    parser.add_argument("--errors", type=str,
                        help="File in which to save error information")
    parser.add_argument("--language", type=str, action="append", default=[],
                        help="Language to capture (can be specified multiple "
                        "times; defaults to English and Translingual)")
    parser.add_argument("--all-languages", action="store_true", default=False,
                        help="Extract words for all languages")
    parser.add_argument("--list-languages", action="store_true", default=False,
                        help="Print list of supported languages")
    parser.add_argument("--pages-dir", type=str, default=None,
                        help="Directory under which to store all pages")
    parser.add_argument("--all", action="store_true", default=False,
                        help="Capture all data for the selected languages")
    parser.add_argument("--translations", action="store_true", default=False,
                        help="Capture translations")
    parser.add_argument("--pronunciations", action="store_true", default=False,
                        help="Capture pronunciation information")
    parser.add_argument("--linkages", action="store_true", default=False,
                        help="Capture linkages (hypernyms, synonyms, etc.)")
    parser.add_argument("--compounds", action="store_true", default=False,
                        help="Capture compound words using each word")
    parser.add_argument("--redirects", action="store_true", default=False,
                        help="Capture redirects")
    parser.add_argument("--examples", action="store_true", default=False,
                        help="Capture examples")
    parser.add_argument("--statistics", action="store_true", default=False,
                        help="Print statistics")
    parser.add_argument("--page", type=str,
                        help="Parse a single Wiktionary page (for debugging)")
    parser.add_argument("--verbose", action="store_true", default=False,
                        help="Print verbose status messages (for debugging)")
    args = parser.parse_args()
    # The --all option turns on capturing all data types.
    if args.all:
        args.translations = True
        args.pronunciations = True
        args.linkages = True
        args.compounds = True
        args.redirects = True
        args.examples = True
    # If --list-languages was specified, just print the list of supported
    # languages and exit.
    if args.list_languages:
        print("Supported languages:")
        for lang in sorted(wiktionary_languages):
            print("  {}".format(lang))
        sys.exit(0)
    if not args.path and not args.page:
        print("The PATH argument for the Wiktionary dump file is mandatory")
        print("(unless --page is specified).")
        sys.exit(1)
    # Default to English and Translingual if no language was specified.
    if not args.language:
        args.language = ["English", "Translingual"]
    else:
        for x in args.language:
            if x not in wiktionary_languages:
                print("Invalid language:", x)
                sys.exit(1)
    if args.all_languages:
        args.language = None
        print("Capturing words for all available languages")
    else:
        print("Capturing words for:", ", ".join(args.language))
    # Open the output file. Unless writing to stdout or a device file, write
    # to a temporary file first and rename it into place on success.
    out_path = args.out
    if out_path and out_path != "-":
        if out_path.startswith("/dev/"):
            out_tmp_path = out_path
        else:
            out_tmp_path = out_path + ".tmp"
        out_f = open(out_tmp_path, "w", buffering=1024 * 1024,
                     encoding="utf-8")
    else:
        out_tmp_path = out_path
        out_f = sys.stdout
    word_count = 0

    def word_cb(data):
        # Called for each extracted word; writes it as one JSON object
        # per line.
        global word_count
        word_count += 1
        out_f.write(json.dumps(data, ensure_ascii=False))
        out_f.write("\n")
        if not out_path or out_path == "-":
            out_f.flush()

    def capture_cb(title, text):
        return capture_page(title, text, args.pages_dir)
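
    # Output is JSON Lines: one JSON object per captured word. A line might
    # look like the following (illustrative only; the exact fields depend on
    # the options given and on wiktextract's data model):
    #   {"word": "dog", "lang": "English", "pos": "noun", "senses": [...]}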
    config = WiktionaryConfig(capture_languages=args.language,
                              capture_translations=args.translations,
                              capture_pronunciation=args.pronunciations,
                              capture_linkages=args.linkages,
                              capture_compounds=args.compounds,
                              capture_redirects=args.redirects,
                              capture_examples=args.examples,
                              verbose=args.verbose)
    try:
        if args.page:
            # Parse a single Wiktionary page (e.g., one extracted earlier
            # using --pages-dir).
            if args.path:
                print("The PATH argument should not be specified together "
                      "with --page")
                sys.exit(1)
            with open(args.page, "r") as f:
                text = f.read()
            ret = parse_page("TESTPAGE", text, config)
            for x in ret:
                word_cb(x)
        else:
            # Parse the full Wiktionary data dump.
            parse_wiktionary(args.path, config, word_cb, capture_cb)
    finally:
        if out_path and out_path != "-":
            out_f.close()
            if out_path != out_tmp_path:
                # Move the completed output into place, replacing any
                # previous version.
                try:
                    os.remove(out_path)
                except FileNotFoundError:
                    pass
                os.rename(out_tmp_path, out_path)
    if args.statistics:
        print("")
        print("LANGUAGE COUNTS")
        for k, cnt in sorted(config.language_counts.items(),
                             key=lambda x: -x[1]):
            print("  {:>7d} {}".format(cnt, k))
            if cnt < 1000:
                break
        print("  ...")
        print("")
        print("")
        print("POS HEADER USAGE")
        for k, cnt in sorted(config.pos_counts.items(),
                             key=lambda x: -x[1]):
            print("  {:>7d} {}".format(cnt, k))
        print("")
        print("POS SUBSECTION HEADER USAGE")
        for k, cnt in sorted(config.section_counts.items(),
                             key=lambda x: -x[1]):
            print("  {:>7d} {}".format(cnt, k))
        print("")
        print("{} WORDS CAPTURED".format(word_count))
    if args.errors:
        with open(args.errors, "w") as f:
            json.dump({
                "errors": config.errors,
                "warnings": config.warnings,
                "debugs": config.debugs,
                "unrecognized_counts": config.unrecognized_template_counts,
                "unrecognized_samples": config.unrecognized_template_samples,
                "unrendered_counts": config.unrendered_template_counts,
                "unrendered_samples": config.unrendered_template_samples,
                "unknown_value_counts": config.unknown_value_counts,
            }, f)
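        # The error report is itself JSON, so it can be inspected later with
        # a few lines of Python (a sketch; "wikt-errors.json" stands for
        # whatever path was passed to --errors):
        #   import json
        #   with open("wikt-errors.json") as f:
        #       report = json.load(f)
        #   print(len(report["errors"]), "errors,",
        #         len(report["warnings"]), "warnings")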

    def dump_un(title, limit, counts, samples):
        # Prints up to ``limit`` of the most common entries in ``counts``,
        # each followed by its sample uses from ``samples``.
        lst = sorted(counts.items(), reverse=True, key=lambda x: x[1])
        if lst:
            print(title)
            for k, cnt in lst[:limit]:
                print("{:5d} {}".format(cnt, k))
                for kk, vv in samples[k].items():
                    for sample in vv:
                        print("    {}".format(sample))

    def dump_val(title, limit, counts):
        # Sums the nested per-language counts into (name, value) totals
        # and prints the most common ones.
        counts_ht = collections.defaultdict(int)
        for la, la_val in counts.items():
            for name, name_val in la_val.items():
                for value, cnt in name_val.items():
                    counts_ht[name, value] += cnt
        lst = sorted(counts_ht.items(), reverse=True, key=lambda x: x[1])
        if lst:
            print("")
            print(title)
            for (k, kk), v in lst[:limit]:
                print("  {:5d} {}: {}".format(v, k, kk))
dump_un("TOP 20 UNRECOGNIZED TEMPLATES", 20,
config.unrecognized_template_counts,
config.unrecognized_template_samples)
dump_un("TOP 20 UNRENDERED TEMPLATES", 20,
config.unrendered_template_counts,
config.unrendered_template_samples)
dump_val("TOP 20 UNKNOWN VALUES", 20,
config.unknown_value_counts)