luatest2.py

# Tests/experiments related to WikiText parsing and Lua extension invocation
#
# Copyright (c) 2020 Tatu Ylonen.  See file LICENSE and https://ylonen.org

import re
import sys
import copy
import html
import json
import time
import base64
import os.path
import textwrap
import traceback
import collections
import html.entities
import lupa
from lupa import LuaRuntime

from wiktextract import wikitext
from wiktextract.wikitext import WikiNode, NodeKind
from wiktextract.wikiparserfns import (PARSER_FUNCTIONS, call_parser_function,
                                       tag_fn)
from wiktextract import languages

#import pstats
#import cProfile


# List of search paths for Lua libraries.
builtin_lua_search_paths = [
    "lua",
    "lua/mediawiki-extensions-Scribunto/includes/engines/LuaCommon/lualib",
]

MAX_LEN = 75

langs = collections.defaultdict(int)

PAIRED_HTML_TAGS = set(k for k, v in wikitext.ALLOWED_HTML_TAGS.items()
                       if not v.get("no-end-tag") and not v.get("close-next"))

KNOWN_LANGUAGE_TAGS = set(x["code"] for x in languages.all_languages
                          if x.get("code") and x.get("name"))
LANGUAGE_CODE_TO_NAME = { x["code"]: x["name"]
                          for x in languages.all_languages
                          if x.get("code") and x.get("name") }

def canonicalize_template_name(name):
    """Canonicalizes a template name by making its first character uppercase
    and replacing underscores by spaces and sequences of whitespace by a single
    whitespace."""
    assert isinstance(name, str)
    name = re.sub(r"_", " ", name)
    name = re.sub(r"\s+", " ", name)
    name = name.strip()
    if name[:9].lower() == "template:":
        name = name[9:]
    name = name.capitalize()
    return name


def canonicalize_parserfn_name(name):
    """Canonicalizes a parser function name by making its first character
    uppercase and replacing underscores by spaces and sequences of
    whitespace by a single whitespace."""
    assert isinstance(name, str)
    name = re.sub(r"_", " ", name)
    name = re.sub(r"\s+", " ", name)
    name = name.strip()
    if name not in PARSER_FUNCTIONS:
        name = name.lower()  # Parser function names are case-insensitive
    return name


def template_to_body(title, text):
    """Extracts the portion to be transcluded from a template body.  This
    returns an str."""
    assert isinstance(title, str)
    assert isinstance(text, str)
    # Preprocess the template, handling, e.g., <nowiki> ... </nowiki> and
    # HTML comments
    text = wikitext.preprocess_text(text)
    # Remove all text inside <noinclude> ... </noinclude>
    text = re.sub(r"(?is)<\s*noinclude\s*>.*?<\s*/\s*noinclude\s*>",
                  "", text)
    text = re.sub(r"(?is)<\s*noinclude\s*/\s*>", "", text)
    # <onlyinclude> tags, if present, include the only text that will be
    # transcluded.  All other text is ignored.
    onlys = list(re.finditer(r"(?is)<\s*onlyinclude\s*>(.*?)"
                             r"<\s*/\s*onlyinclude\s*>|"
                             r"<\s*onlyinclude\s*/\s*>",
                             text))
    if onlys:
        text = "".join(m.group(1) or "" for m in onlys)
    # Remove <includeonly>.  They mark text that is not visible on the page
    # itself but is included in transclusion.  Also text outside these tags
    # is included in transclusion.
    text = re.sub(r"(?is)<\s*(/\s*)?includeonly\s*(/\s*)?>", "", text)
    # Sanity checks for certain unbalanced tags.  However, it appears some
    # templates intentionally produce these and intend them to be displayed.
    # Thus don't warn, and we may even need to arrange for them to be properly
    # parsed as text.
    if False:
       m = re.search(r"(?is)<\s*(/\s*)?noinclude\s*(/\s*)?>", text)
       if m:
           print("{}: unbalanced {}".format(title, m.group(0)))
       m = re.search(r"(?is)<\s*(/\s*)?onlyinclude\s*(/\s*)?>", text)
       if m:
           print("{}: unbalanced {}".format(title, m.group(0)))
    return text


def analyze_template(name, body):
    """Analyzes a template body and returns a set of the canonicalized
    names of all other templates it calls and a boolean that is True
    if it should be pre-expanded before final parsing and False if it
    need not be pre-expanded.  The pre-expanded flag is determined
    based on that body only; the caller should propagate it to
    templates that include the given template.  This does not work for
    template and template function calls where the name is generated by
    other expansions."""
    assert isinstance(body, str)
    included_templates = set()
    pre_expand = False

    # Determine if the template starts with a list item
    contains_list = re.search(r"(?s)^[#*;:]", body) is not None

    # Remove paired tables
    prev = body
    while True:
        unpaired_text = re.sub(
            r"(?s)(^|\n)\{\|([^\n]|\n+[^{|]|\n+\|[^}]|\n+\{[^|])*?\n+\|\}",
            r"", prev)
        if unpaired_text == prev:
            break
        prev = unpaired_text
    #print("unpaired_text {!r}".format(unpaired_text))

    # Determine if the template contains an unpaired table
    contains_unpaired_table = re.search(r"(?s)(^|\n)(\{\||\|\})",
                                        unpaired_text) is not None

    # Determine if the template contains table element tokens outside
    # paired table start/end.  We only try to look for these outside templates,
    # as it is common to write each template argument on its own line starting
    # with a "|".
    outside = unpaired_text
    while True:
        #print("=== OUTSIDE ITER")
        prev = outside
        while True:
            newt = re.sub(r"(?s)\{\{\{([^{}]|\}[^}]|\}\}[^}])*?\}\}\}",
                          "", prev)
            if newt == prev:
                break
            prev = newt
        #print("After arg elim: {!r}".format(newt))
        newt = re.sub(r"(?s)\{\{([^{}]|\}[^}])*?\}\}", "", newt)
        #print("After templ elim: {!r}".format(newt))
        if newt == outside:
            break
        outside = newt
    # For now, we'll ignore !! and ||
    m = re.search(r"(?s)(^|\n)(\|\+|\|-|\||\!)", outside)
    contains_table_element = m is not None
    # if contains_table_element:
    #     print("contains_table_element {!r} at {}"
    #           "".format(m.group(0), m.start()))
    #     print("... {!r} ...".format(outside[m.start() - 10:m.end() + 10]))
    #     print(repr(outside))

    # Check for unpaired HTML tags
    tag_cnts = collections.defaultdict(int)
    for m in re.finditer(r"(?si)<\s*(/\s*)?({})\b\s*[^>]*(/\s*)?>"
                         r"".format("|".join(PAIRED_HTML_TAGS)), outside):
        start_slash = m.group(1)
        tagname = m.group(2)
        end_slash = m.group(3)
        if start_slash:
            tag_cnts[tagname] -= 1
        elif not end_slash:
            tag_cnts[tagname] += 1
    contains_unbalanced_html = any(v != 0 for v in tag_cnts.values())
    # if contains_unbalanced_html:
    #     print(name, "UNBALANCED HTML")
    #     for k, v in tag_cnts.items():
    #         if v != 0:
    #             print("  {} {}".format(v, k))

    # Determine whether this template should be pre-expanded
    pre_expand = (contains_list or contains_unpaired_table or
                  contains_table_element or contains_unbalanced_html)

    # if pre_expand:
    #     print(name,
    #           {"list": contains_list,
    #            "unpaired_table": contains_unpaired_table,
    #            "table_element": contains_table_element,
    #            "unbalanced_html": contains_unbalanced_html,
    #            "pre_expand": pre_expand,
    #     })

    # Determine which other templates are called from unpaired text.
    # None of the flags we currently gather propagate outside a paired
    # table start/end.
    for m in re.finditer(r"(?s)(^|[^{])(\{\{)?\{\{([^{]*?)(\||\}\})",
                         unpaired_text):
        name = m.group(3)
        name = re.sub(r"(?si)<\s*nowiki\s*/\s*>", "", name)
        name = canonicalize_template_name(name)
        if not name:
            continue
        included_templates.add(name)

    return included_templates, pre_expand

print("Loading specials (templates & modules)")
with open("tempXXXspecials.json") as f:
    specials = json.load(f)
print("Analyzing templates", len(specials))

# Extract module and template definitions from the collected special pages.
# We also determine which templates need to be pre-expanded to allow parsing
# the resulting structure.  (This determination is somewhat heuristic and
# is not guaranteed to always produce optimal results.  However, it
# significantly improves the parseability of the resulting structure of a
# page.)
modules = {}
templates = {}
templates["!"] = "&vert;"
contains_list_set = set()
contains_table_element_set = set()
contains_unpaired_table = set()
need_pre_expand = set()
included_map = collections.defaultdict(set)
expand_q = []
redirects = {}
for tag, title, text in specials:
    # XXX should this be enabled? title = html.unescape(title)
    if tag == "#redirect":
        redirects[title] = text
        continue
    if tag == "Scribunto":
        text = html.unescape(text)
        modules[title] = text
        continue
    if title.endswith("/testcases"):
        continue
    if title.startswith("User:"):
        continue
    if tag != "Template":
        continue

    # print(tag, title)
    name = canonicalize_template_name(title)
    #if name != "Sv-decl-noun":
    #    continue
    text = html.unescape(text)
    body = template_to_body(title, text)
    assert isinstance(body, str)
    included_templates, pre_expand = analyze_template(name, body)
    for x in included_templates:
        included_map[x].add(name)
    if pre_expand:
        need_pre_expand.add(name)
        expand_q.append(name)
    templates[name] = body

# Propagate pre_expand from lower-level templates to all templates that
# refer to them
while expand_q:
    name = expand_q.pop()
    if name not in included_map:
        continue
    for inc in included_map[name]:
        if inc in need_pre_expand:
            continue
        #print("propagating EXP {} -> {}".format(name, inc))
        need_pre_expand.add(inc)
        expand_q.append(name)

# Copy template definitions to redirects to them
for k, v in redirects.items():
    if not k.startswith("Template:"):
        continue
    k = k[9:]
    if not v.startswith("Template:"):
        continue
    v = v[9:]
    k = canonicalize_template_name(k)
    v = canonicalize_template_name(v)
    if v not in templates:
        # print("{} redirects to non-existent template {}".format(k, v))
        continue
    if k in templates:
        # print("{} -> {} is redirect but already in templates".format(k, v))
        continue
    templates[k] = templates[v]
    if v in need_pre_expand:
        need_pre_expand.add(k)

# for name in templates.keys():
#     if name in need_pre_expand:
#         print("EXP", name)
#     #else:
#     #    print("   ", name)

class FmtCtx(object):
    __slots__ = (
        "indent",  # Current indent for text
        "inpara",  # True if inside paragraph
        "nowrap",  # True to suppress line wrap at current position
        "parts",   # Text accumulated here (list of str)
        "pos",     # Character position on current line
        "space",   # True if space should be inserted before next text
        "variables",  # Dictionary mapping template arg name (str) to value
    )
    def __init__(self):
        self.indent = 0
        self.inpara = False
        self.nowrap = False
        self.parts = []
        self.pos = 0
        self.space = False
        self.variables = {}

    def add(self, txt):
        assert isinstance(txt, str)
        for w in re.split(r"(\s+)", txt):
            if not w:
                continue
            if w.isspace():
                self.space = True
                continue
            # We are adding a non-space segment
            w = html.unescape(w)
            if (self.pos > 0 and self.pos + len(w) + 1 > MAX_LEN and
                not self.nowrap and
                (self.space or
                 (not self.parts[-1][-1].isalnum() or
                  not self.parts[-1][-1].isalnum()))):
                self.parts.append("\n")
                self.pos = 0
                self.space = False
                if self.pos == 0 and self.indent > 0:
                    self.parts.append(" " * self.indent)
                    self.pos += self.indent
            if self.space:
                self.space = False
                if (self.pos > 0 and not self.nowrap and self.parts and
                      not self.parts[-1][-1].isspace()):
                    self.parts.append(" ")
                    self.pos += 1
            self.parts.append(w)
            self.pos += len(w)
            self.inpara = True
            self.nowrap = False

    def add_prefix(self, txt):
        assert isinstance(txt, str)
        self.newline()
        if self.pos == 0 and self.indent > 0:
            self.parts.append(" " * self.indent)
            self.pos += self.indent
        self.parts.append(txt)
        self.pos += len(txt)
        self.nowrap = True
        self.inpara = True
        self.space = False

    def newline(self):
        if self.parts and not self.parts[-1].endswith("\n"):
            self.parts.append("\n")
            self.pos = 0

    def newpara(self):
        if self.inpara:
            self.newline()
            self.parts.append("\n")
            self.inpara = False
            self.nowrap = False

def list_to_text(ctx, node):
    prefix = node.args
    number = 1
    ctx.newline()
    for child in node.children:
        if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST_ITEM:
            print("Unexpected item under list: {}".format(child.kind))
            continue
        if prefix.endswith("#"):
                p = "{}. ".format(number)
                number += 1
        elif prefix.endswith("*"):
            p = "* "
        elif prefix.endswith(":"):
            p = "    "
        else:
            p = "? "
        ctx.add_prefix(p)
        ctx.indent += len(p)
        to_text_recurse(ctx, child)
        ctx.indent -= len(p)
        ctx.newline()

def link_to_text(ctx, node):
    assert isinstance(node, WikiNode)
    trail = to_text(ctx, node.children).strip()
    page = to_text(ctx, node.args[0]).strip()
    if len(node.args) > 1:
        txt = to_text(ctx, node.args[1]).strip()
    else:
        txt = page
    if not txt:
        # Pipe trick
        txt = page
        idx = txt.find(":")
        if idx >= 0:
            txt = txt[idx + 1:]
        idx = txt.find("(")
        if idx >= 0:
            txt = txt[:idx]
        else:
            idx = txt.find(",")
            if idx >= 0:
                txt = txt[:idx]
        txt = txt.strip()
    # XXX should we do some inflection with trail links?
    ctx.add(txt + trail)


def template_to_text(ctx, node):
    assert isinstance(ctx, FmtCtx)
    assert isinstance(node, WikiNode)

    # Clean up template name to canonical form
    name = to_text(ctx, node.args[0])
    name = canonicalize_template_name(name)

    body = templates.get(name)

    if body is None:
        print("Reference to undefined template {!r}".format(name))
        ctx.add("{{")
        to_text_list(ctx, node.args[0])
        for x in node.args[1:]:
            ctx.add("|")
            to_text_list(ctx, x)
        ctx.add("}}")
        return

    assert isinstance(body, (list, tuple))
    old_vars = ctx.variables
    new_vars = old_vars.copy()
    argnum = 1
    for x in node.args:
        txt = to_text(ctx, x)
        m = re.match(r"(?s)^([^][#<>\{\}\|])=(.*)$", txt)
        if m:
            argname = m.group(1).strip()
            argvalue = m.group(2).strip()
        else:
            argname = str(argnum)
            argnum += 1
            argvalue = txt
        new_vars[argname] = argvalue

    ctx.variables = new_vars
    to_text_list(ctx, body)
    ctx.variables = old_vars


def templatevar_to_text(ctx, node):
    ctx.add("{{{")
    to_text_list(ctx, node.args[0])
    for x in node.args[1:]:
        ctx.add("|")
        to_text_list(ctx, x)
    ctx.add("}}}")

def parserfn_to_text(ctx, node):
    ctx.add("{{")
    to_text_list(ctx, node.args[0])
    ctx.add(":")
    if len(node.args) > 1:
        to_text_list(ctx, node.args[1])
        for x in node.args[2:]:
            ctx.add("|")
            to_text_list(ctx, x)
    ctx.add("}}")

def title_to_text(ctx, node, underline):
    txt = to_text(ctx, node.args[0]).strip()
    ctx.newpara()
    ctx.add_prefix(txt)
    ctx.newline()
    ctx.add_prefix(underline * len(txt))
    ctx.newpara()
    to_text_list(ctx, node.children)

def to_text_list(ctx, lst):
    assert isinstance(lst, (list, tuple))
    for x in lst:
        to_text_recurse(ctx, x)

def to_text_recurse(ctx, node):
    assert isinstance(node, (str, WikiNode))
    if isinstance(node, str):
        ctx.add(node)
        return
    kind = node.kind
    if kind == NodeKind.LEVEL2:
        title_to_text(ctx, node, "#")
    elif kind == NodeKind.LEVEL3:
        title_to_text(ctx, node, "=")
    elif kind == NodeKind.LEVEL4:
        title_to_text(ctx, node, "-")
    elif kind == NodeKind.LEVEL5:
        title_to_text(ctx, node, "~")
    elif kind == NodeKind.LEVEL6:
        title_to_text(ctx, node, ".")
    elif kind == NodeKind.LIST:
        list_to_text(ctx, node)
    elif kind == NodeKind.HLINE:
        ctx.newpara()
        ctx.add_prefix("-" * (MAX_LEN - ctx.indent))
        ctx.newpara()
    elif kind == NodeKind.PRE:
        ctx.newpara()
        txt = "".join(node.children).strip()
        for line in txt.split("\n"):
            ctx.add_prefix(line)
            ctx.newline()
        ctx.newpara()
    elif kind == NodeKind.LINK:
        link_to_text(ctx, node)
    elif kind == NodeKind.TEMPLATE:
        template_to_text(ctx, node)
    elif kind == NodeKind.TEMPLATEVAR:
        templatevar_to_text(ctx, node)
    elif kind == NodeKind.PARSERFN:
        parserfn_to_text(ctx, node)
    elif kind == NodeKind.URL:
        if len(node.args) > 1:
            to_text_list(ctx, node.args[1])
        else:
            to_text_list(ctx, node.args[0])
    elif kind == NodeKind.TABLE:
        parts.append("<XXX TABLE>")
    elif kind == NodeKind.MAGIC_WORD:
        # Magic word - generate no output
        # XXX check if some should generate output
        pass
    else:
        to_text_list(ctx, node.children)


def to_text(ctx, lst):
    """Converts content from ``lst`` to text (str).  Template variables are
    taken from ``ctx``, but otherwise a new context will be used and ``ctx``
    will not be modified."""
    assert isinstance(ctx, FmtCtx)
    assert isinstance(lst, (list, tuple))
    new_ctx = FmtCtx()
    new_ctx.variables = ctx.variables.copy()
    for x in lst:
        to_text_recurse(new_ctx, x)
    return "".join(new_ctx.parts)


def analyze_node(node):
    if isinstance(node, str):
        return
    assert isinstance(node, WikiNode)
    kind = node.kind
    if kind == NodeKind.LEVEL2:
        ctx = FmtCtx()
        title = to_text(ctx, node.args[0]).strip()
        langs[title] += 1

    for x in node.children:
        analyze_node(x)


class ExpandCtx(object):
    __slots__ = (
        "title",         # current page title
        "template_name", # name of template currently being expanded
        "templates",     # dict temlate name -> definition
        "template_fn",   # None or function to expand template
        "invoke_fn",     # None or function to invoke Lua macro
    )
    def __init__(self, title, templates, template_fn, invoke_fn):
        """Creates an expansion context used for expanding some or all
        templates from WikiText."""
        assert isinstance(title, str)
        assert isinstance(templates, dict)
        assert template_fn is None or callable(template_fn)
        assert invoke_fn is None or callable(invoke_fn)
        self.title = title
        self.template_name = None
        self.templates = templates
        self.template_fn = template_fn
        self.invoke_fn = invoke_fn


def expand_listed_templates(title, text, expand_templates, invoke_fn):
    """Expands templates whose names are in ``template_names`` and their
    arguments (including also all other templates referenced from the
    arguments).  This may call Lua code.  This returns text with the named
    templates expanded; other templates are not expanded."""
    assert isinstance(title, str)
    assert isinstance(text, str)
    assert isinstance(expand_templates, (set, dict))
    assert callable(invoke_fn)
    ctx = ExpandCtx(title, templates, None, invoke_fn)  # template_fn
    # Magic prefix for encoding already processed templates and template
    # arguments
    magic = base64.b64encode(os.urandom(16), altchars=b"#!").decode("utf-8")
    cookies = []
    rev_ht = {}

    def save_value(kind, args):
        """Saves a value of a particular kind and returns a unique magic
        cookie for it."""
        assert kind in ("T", "A", "P", "L")  # Template, arg, parserfn, link
        assert isinstance(args, (list, tuple))
        args = tuple(args)
        v = (kind, args)
        if v in rev_ht:
            return "!" + magic + kind + str(rev_ht[v]) + "!"
        idx = len(cookies)
        cookies.append(v)
        rev_ht[v] = idx
        ret = "!" + magic + kind + str(idx) + "!"
        return ret

    def repl_arg(m):
        """Replacement function for template arguments."""
        orig = m.group(1)
        args = orig.split("|")
        return save_value("A", args)

    def repl_templ(m):
        """Replacement function for templates {{...}} and template
        functions."""
        orig = m.group(1)
        args = orig.split("|")
        name = args[0].strip()
        if name[:10].lower() == "safesubst:":
            name = name[10:]
        ofs = name.find(":")
        if ofs > 0:
            # It might be a parser function call
            fn_name = canonicalize_parserfn_name(name[:ofs])
            # Check if it is a recognized parser function name
            if fn_name in PARSER_FUNCTIONS:
                return save_value("P", [fn_name, name[ofs + 1:]] + args[1:])
        # As a compatibility feature, recognize parser functions also as the
        # first argument of a template, whether there are more arguments or
        # not.  This is used for magic words and some parser functions have
        # an implicit compatibility template that essentially does this.
        fn_name = canonicalize_parserfn_name(name)
        if fn_name in PARSER_FUNCTIONS:
            return save_value("P", [fn_name] + args)
        # Otherwise it is a normal template expansion
        return save_value("T", args)

    def repl_link(m):
        """Replacement function for links [[...]]."""
        orig = m.group(1)
        return save_value("L", (orig,))

    def encode(text):
        """Encode all templates, template arguments, and parser function calls
        in the text, from innermost to outermost."""
        while True:
            prev = text
            text = re.sub(r"\[\[([^][{}]+)\]\]", repl_link, text)
            while True:
                prev2 = text
                text = re.sub(r"(?s)\{\{\{(([^{}]|\}[^}]|\}\}[^}])*?)\}\}\}",
                              repl_arg, text)
                if text == prev2:
                    break
            text = re.sub(r"(?s)\{\{(([^{}]|\}[^}])+?)\}\}",
                          repl_templ, text)
            if text == prev:
                break
            prev = text
        return text

    def unexpanded_template(tname, ht):
        """Formats an unexpanded template (whose arguments may have been
        partially or fully expanded)."""
        assert isinstance(tname, str)
        assert isinstance(ht, dict)
        args = [tname]
        more_args = []
        for k, v in ht.items():
            if isinstance(k, int):
                while len(args) <= k:
                    args.append("")
                args[k] = v
            else:
                more_args.append("{}={}".format(k, v))
        args += list(sorted(more_args))
        return "{{" + "|".join(args) + "}}"

    def unexpanded_parserfn(fn_name, args):
        """Formats an unexpanded parser function call (whose arguments may
        have been partially or fully expanded)."""
        assert isinstance(fn_name, str)
        assert isinstance(args, (list, tuple))
        return "{{" + fn_name + ":" + "|".join(args) + "}}"

    def expand(coded, argmap, stack, parent):
        assert isinstance(coded, str)
        assert isinstance(argmap, dict)
        assert isinstance(stack, list)
        assert isinstance(parent, (tuple, type(None)))
        parts = []
        pos = 0
        for m in re.finditer(r"!{}(.)(\d+)!".format(magic), coded):
            new_pos = m.start()
            if new_pos > pos:
                parts.append(coded[pos:new_pos])
            pos = m.end()
            kind = m.group(1)
            idx = int(m.group(2))
            kind2, args = cookies[idx]
            assert isinstance(args, tuple)
            assert kind == kind2
            if kind == "T":
                # Template transclusion
                stack.append("TEMPLATE_NAME")
                tname = expand(args[0], argmap, stack, parent)
                stack.pop()
                name = canonicalize_template_name(tname)
                stack.append(name)
                ht = {}
                num = 1
                for i in range(1, len(args)):
                    arg = args[i]
                    stack.append("ARG{}".format(i))
                    arg = expand(arg, argmap, stack, parent)
                    stack.pop()
                    ofs = arg.find("=")
                    if ofs <= 0:
                        k = num
                        num += 1
                    else:
                        k = arg[:ofs].strip()
                        if k.isdigit():
                            k = int(k)
                            if k < 1 or k > 1000:
                                print("{}: invalid argument number {}"
                                      "".format(title, k))
                                k = 1000
                            if num <= k:
                                num = k + 1
                        arg = arg[ofs + 1:]
                    ht[k] = arg

                # Check if this template is defined
                if name not in ctx.templates:
                    stack.pop()
                    print("{}: uses undefined template {!r} at {}"
                          "".format(title, tname, stack))
                    parts.append(unexpanded_template(tname, ht))
                    continue

                # Limit recursion depth
                if len(stack) >= 20:
                    stack.pop()
                    print("{}: too deep expansion of templates via {}"
                          "".format(title, stack))
                    parts.append(unexpanded_template(tname, ht))
                    continue

                # If this template is not one of those we want to expand,
                # return it unexpanded (but with arguments possibly expanded)
                if name not in expand_templates:
                    parts.append(unexpanded_template(tname, ht))
                    continue

                # Expand the body, either using ``template_fn`` or using
                # normal template expansion
                if ctx.template_fn is not None:
                    t = template_fn(name, ht)
                else:
                    body = ctx.templates[name]
                    encoded_body = encode(body)
                    t = expand(encoded_body, ht, stack,
                               (name, ht))

                assert isinstance(t, str)
                stack.pop()  # template name
                parts.append(t)
            elif kind == "A":
                # Template argument reference
                if len(args) > 2:
                    print("{}: too many args ({}) in argument reference {!r}"
                          "".format(title, len(args), args))
                stack.append("ARG_NAME")
                k = expand(args[0], argmap, stack, parent).strip()
                stack.pop()
                stack.append("ARG{}-DEFVAL".format(k))
                if len(args) >= 2:
                    defval = expand(args[1], argmap, stack, parent)
                else:
                    defval = None
                stack.pop()
                if k.isdigit():
                    k = int(k)
                v = argmap.get(k, defval)
                if v is not None:
                    parts.append(v)
                    continue
                # The argument is not defined (or name is empty)
                assert defval is None
                arg = "{{{" + str(k) + "}}}"
                parts.append(arg)
            elif kind == "P":
                # Parser function call
                stack.append("PARSERFN_FN")
                fn_name = expand(args[0], argmap, stack, parent)
                stack.pop()
                fn_name = canonicalize_parserfn_name(fn_name)
                stack.append(fn_name)
                expanded_args = []
                for i in range(1, len(args)):
                    arg = args[i]
                    stack.append("ARG{}".format(i))
                    arg = expand(arg, argmap, stack, parent)
                    stack.pop()
                    expanded_args.append(arg)
                if fn_name == "#invoke" and ctx.invoke_fn is not None:
                    ret = ctx.invoke_fn(expanded_args, stack, parent)
                else:
                    ret = call_parser_function(fn_name, expanded_args,
                                               title, stack)
                stack.pop()  # fn_name
                # XXX if lua code calls frame:preprocess(), then we should
                # apparently encode and expand the return value, similarly to
                # template bodies (without argument expansion)
                parts.append(ret)
            elif kind == "L":
                # Link to another page
                content = args[0]
                stack.append("[[link]]")
                content = expand(content, argmap, stack, parent)
                stack.pop()
                parts.append("[[" + content + "]]")
            else:
                print("{}: unsupported cookie kind {!r} in {}"
                      "".format(title, kind, m.group(0)))
                parts.append(m.group(0))
        parts.append(coded[pos:])
        return "".join(parts)

    # Encode all template calls, template arguments, and parser function
    # calls on the page.  This is an inside-out operation.
    # print("Encoding")
    encoded = encode(text)

    # Recursively expand the selected templates.  This is an outside-in
    # operation.
    # print("Expanding")
    expanded = expand(encoded, {}, [title], None)

    return expanded


def lua_loader(modname):
    """This function is called from the Lua sandbox to load a Lua module.
    This will load it from either the user-defined modules on special
    pages or from a built-in module in the file system.  This returns None
    if the module could not be loaded."""
    # print("Loading", modname)
    if modname.startswith("Module:"):
        modname = modname[7:]
    if modname in modules:
        return modules[modname]
    path = modname
    path = re.sub(r":", "/", path)
    path = re.sub(r" ", "_", path)
    # path = re.sub(r"\.", "/", path)
    path = re.sub(r"//+", "/", path)
    path = re.sub(r"\.\.", ".", path)
    if path.startswith("/"):
        path = path[1:]
    path += ".lua"
    for prefix in builtin_lua_search_paths:
        p = prefix + "/" + path
        if os.path.isfile(p):
            with open(p, "r") as f:
                data = f.read()
            return data
    print("MODULE NOT FOUND:", modname)
    return None


def mw_text_decode(text, decodeNamedEntities=False):
    """Implements the mw.text.decode function for Lua code."""
    if decodeNamedEntities:
        return html.unescape(text)

    # Otherwise decode only selected entities
    parts = []
    pos = 0
    for m in re.finditer(r"&(lt|gt|amp|quot|nbsp);", text):
        if pos < m.start():
            parts.append(text[pos:m.start()])
        pos = m.end()
        tag = m.group(1)
        if tag == "lt":
            parts.append("<")
        elif tag == "gt":
            parts.append(">")
        elif tag == "amp":
            parts.append("&")
        elif tag == "quot":
            parts.append('"')
        elif tag == "nbsp":
            parts.append("\xa0")
        else:
            assert False
    parts.append(text[pos:])
    return "".join(parts)

def mw_text_encode(text, charset='<>&\xa0'):
    """Implements the mw.text.encode function for Lua code."""
    parts = []
    for ch in text:
        if ch in charset:
            chn = ord(ch)
            if chn in html.entities.codepoint2name:
                parts.append("&" + html.entities.codepoint2name.get(ch) + ";")
            else:
                parts.append(ch)
        else:
            parts.append(ch)
    return "".join(parts)


def get_page_info(title):
    """Retrieves information about a page identified by its table (with
    namespace prefix.  This returns a lua table with fields "id", "exists",
    and "redirectTo".  This is used for retrieving information about page
    titles."""
    assert isinstance(title, str)

    # XXX actually look at information collected in phase 1 to determine
    page_id = 0  # XXX collect required info in phase 1
    page_exists = False  # XXX collect required info in Phase 1
    redirect_to = redirects.get(title, None)

    # whether the page exists and what its id might be
    dt = {
        "id": page_id,
        "exists": page_exists,
        "redirectTo": redirect_to,
    }
    return lua.table_from(dt)


def fetch_language_name(code):
    if code in LANGUAGE_CODE_TO_NAME:
        return LANGUAGE_CODE_TO_NAME[code]
    return None


def fetch_language_names(include):
    include = str(include)
    if include == "all":
        ret = LANGUAGE_CODE_TO_NAME
    else:
        ret = {"en": "English"}
    return lua.table_from(dt)


# Load Lua sandbox code.
lua_sandbox = open("lua/lua_sandbox.lua").read()

def filter_attribute_access(obj, attr_name, is_setting):
    print("FILTER:", attr_name, is_setting)
    if isinstance(attr_name, unicode):
        if not attr_name.startswith("_"):
            return attr_name
    raise AttributeError("access denied")

lua = LuaRuntime(unpack_returned_tuples=True,
                 register_eval=False,
                 attribute_filter=filter_attribute_access)
lua.execute(lua_sandbox)
lua.eval("lua_set_loader")(lua_loader,
                           mw_text_decode,
                           mw_text_encode,
                           get_page_info,
                           fetch_language_name,
                           fetch_language_names)


def invoke_fn(invoke_args, stack, parent):
    """This is called to expand a #invoke parser function."""
    assert isinstance(invoke_args, (list, tuple))
    assert isinstance(stack, list)
    assert isinstance(parent, (tuple, type(None)))
    # print("#invoke", invoke_args, "parent", parent, "stack", stack)
    if len(invoke_args) < 2:
        print("#invoke {}: too few arguments at {}"
              "".format(invoke_args, stack))
        return "{{" + invoke_args[0] + ":" + "|".join(invoke_args[1:]) + "}}"

    # Get module and function name
    modname = invoke_args[0]
    modfn = invoke_args[1]

    def getArgument(frame_args, k):
        assert isinstance(frame_args, dict)
        v = frame_args[k]
        if v is None:
            return v
        obj = {"expand": lambda obj: v}
        return lua.table_from(obj)

    def value_with_expand(frame, expander, x):
        assert isinstance(frame, dict)
        assert isinstance(expander, str)
        assert isinstance(x, str)
        obj = {"expand": lambda obj: frame[expander](x)}
        return lua.table_from(obj)

    def make_frame(pframe, title, args):
        assert isinstance(title, str)
        assert isinstance(args, (list, tuple, dict))
        # Convert args to a dictionary with default value None
        if isinstance(args, dict):
            frame_args = args
        else:
            assert isinstance(args, (list, tuple))
            frame_args = {}
            num = 1
            for arg in args:
                ofs = arg.find("=")
                if ofs <= 0:
                    k = num
                    num += 1
                else:
                    k = arg[:ofs].strip()
                    if k.isdigit():
                        k = int(k)
                        if k < 1 or k > 1000:
                            k = 1000
                        if num <= k:
                            num = k + 1
                    arg = arg[ofs + 1:]
                frame_args[k] = arg
        frame_args = lua.table_from(frame_args)

        def extensionTag(frame, lua_args):
            #print(list(lua_args.items()))
            name = lua_args["name"] or ""
            content = lua_args["content"] or ""
            args = lua_args["args"] or ""
            #print("extensionTag frame={} name={} content={} args={}"
            #      "".format(frame, name, content, args))
            return tag_fn(title, "#tag", [name, content + "".join(args)],
                          ["[make_frame]"])


        # Create frame object as dictionary with default value None
        frame = {}
        frame["getParent"] = lambda self: pframe
        frame["getTitle"] = lambda self: title
        frame["args"] = frame_args
        frame["getArgument"] = lambda self, x: getArgument(frame_args, x)
        frame["newParserValue"] = \
            lambda self, x: value_with_expand(self, "preprocess", x)
        frame["newTemplateParserValue"] = \
            lambda self, x: value_with_expand(self, "expand", x)
        frame["newChild"] = lambda self, title="", args="": \
            make_frame(self, title, args)
        frame["extensionTag"] = extensionTag
        # argumentPairs is set in lua_sandbox.lua
        # XXX callParserFunction(name=None, args=None) used 30
        # XXX expandTemplate(title=None, args=None) used 113
        # XXX extensionTag(name=None, content=None, args=None) used 29
        # XXX preprocess(text=None) used 67
        return lua.table_from(frame)

    # Create parent frame (for page being processed) and current frame
    # (for module being called)
    if parent is not None:
        page_title, page_args = parent
        pframe = make_frame(None, page_title, page_args)
    else:
        pframe = None
    frame = make_frame(pframe, modname, invoke_args[2:])

    # Call the Lua function in the given module
    sys.stdout.flush()
    ok, text = lua.eval("lua_invoke")(modname, modfn, frame)
    if ok:
        return str(text)
    print("LUA ERROR IN #invoke {} at {}".format(invoke_args, stack))
    if isinstance(text, Exception):
        parts = [str(text)]
        lst = traceback.format_exception(etype=type(text),
                                         value=text,
                                         tb=text.__traceback__)
        for x in lst:
            parts.append("\t" + x.strip())
        text = "\n".join(parts)
    elif not isinstance(text, str):
        text = str(text)
    parts = []
    in_traceback = 0
    for line in text.split("\n"):
        s = line.strip()
        if s == "[C]: in function 'xpcall'":
            break
        parts.append(line)
    print("\n".join(parts))
    return ("&lt;&lt;LUA EXECUTION ERROR in {}.{}&gt;&gt;"
            "".format(modname, modfn))


# Process test page
page = open("tests/animal.txt").read()
page_title = "animal"
page = wikitext.preprocess_text(page)
print("=== Expanding templates")
page = expand_listed_templates(page_title, page, templates, invoke_fn)

# XXX expand only need_pre_expand here (now expanding all for testing purposes)
print(page)
print("=== Parsing")
tree = wikitext.parse("animal", page)
print("=== Printing")
wikitext.print_tree(tree)
#analyze_node(tree)
#for k, v in sorted(langs.items(), key=lambda x: x[1], reverse=True):
#    print(v, k)
sys.exit(1)

ctx = FmtCtx()
print(to_text(ctx, tree.children).strip())
print("============================")

sys.exit(1)

print("Total length of modules:", sum(len(x) for x in modules.values()))
print("Total length of templates:", sum(len(x) for x in templates.values()))

param_re = re.compile(
    r"(?s)\{\{\{\s*([^|{}]+?)\s*"
    r"(\|\s*(([^{}]|\}[^{}]|\}\}[^{}])*?)\s*)?\}\}\}")

template_re = re.compile(
    r"(?s)\{\{([^|{}]+?)"
    r"((\|([^|{}]+?))*?)"
    r"\}\}")

# Recursively substitutes all occurrences of ``regexp`` by the result of
# fn(m) for its match object on original text ``text``.  This keeps repeating
# the substitutions until there are no more changes.  However this imposes
# an arbitrary limit of 100 iterations to ensure termination.
def iter_sub(regexp, fn, text):
    for iter in range(0, 100):
        prev = text
        text = re.sub(regexp, fn, text)
        if text == prev:
            break
    return text

def invoke_script(name, ht):
    if name not in modules:
        print("UNRECOGNIZED SCRIPT:", name)
        return "<<UNRECOGNIZED SCRIPT: {}>>".format(name)
    if name.find('"') >= 0:
        print("Invalid lua module name:", name)
        return "<<INVALID LUA MODULE NAME: {}>>".format(name)
    fn_name = ht.get(1)
    if not fn_name or fn_name.find('"') >= 0:
        print("Invalid function name in lua call:", fn_name)
        return "<<INVALID FUNCTION NAME: {}>>".format(fn_name)

    text = lua.eval("lua_invoke")(name, fn_name, ht)
    print("LUA EXPANSION RETURNED:", text)
    text = maybe_automatic_newline(text)
    return text

def expand(text, param_ht):
    def expand_template_param(m):
        name = m.group(1)
        defval = m.group(3) or ""
        print("EXPAND_TEMPLATE", repr(name), repr(defval))
        return param_ht.get(name, defval)

    def invoke_special(name, ht):
        if name.startswith("#invoke:"):
            return invoke_script(name[8:], ht)
        print("UNHANDLED SPECIAL TEMPLATE:", name)
        return "<<UNHANDLED SPECIAL {}>>".format(name)

    def expand_template_body(name, body, ht):
        """Expands template body, processing only included material."""
        assert isinstance(name, str)
        assert isinstance(body, (list, tuple))
        assert isinstance(ht, dict)

        # XXX aren't these old junk? cf. template_to_body???

        # First handle <onlyinclude> by removing everything outside it
        if re.search("(?i)<\s*onlyinclude\s*>", body):
            lst = []
            for m in re.finditer(
                    r"(?si)<\s*onlyinclude\s*>(.*?)<\s*/\s*onlyinclude\s*>"):
                lst.append(m.group(1))
            body = "".join(lst)

        # Remove material inside <noinclude> ... </noinclude>
        body = re.sub(r"(?si)<\s*noinclude\s*>.*?<\s*/\s*noinclude\s*>",
                      "", body)

        # Remove <includeonly> tags
        body = re.sub(r"(?si)<\s*(/\s*)?includeonly\s*>", "", body)

        # XXX should we add parameter for template name?

        text = expand(body, ht)
        text = maybe_automatic_newline(text)
        return text

    def maybe_automatic_newline(text):
        if text and (text[0] in "#*:;" or text.startswith("{|")):
            text = "\n" + text
        return text

    def invoke_template(name, ht):
        if name.startswith("#"):
            # XXX this is not correct?  They are PARSERFN and not all
            # PARSERFN start with #.
            return invoke_special(name, ht)
        if name in templates:
            return expand_template_body(name, templates[name], ht)
        print("UNRECOGNIZED TEMPLATE:", name)
        return "<<<UNRECOGNIZED {}>>>".format(name)

    def expand_template(m):
        template_name = m.group(1).strip()
        params = m.group(2).strip()
        ht = {}
        i = 1
        for param in params.split("|")[1:]:
            param = param.strip()
            mm = re.match(r"(?s)^([^=]+)=(.*)$", param)
            if mm:
                param_name = mm.group(1).strip()
                v = mm.group(2).strip()
            else:
                param_name = str(i)
                i += 1
                v = param
            ht[param_name] = v
        print("{} {}".format(template_name, ht))
        return invoke_template(template_name, ht)

    # Expand parameters
    text = iter_sub(param_re, expand_template_param, text)

    # Expand templates
    text = iter_sub(template_re, expand_template, text)

    return text

def test(orig, param_ht):
    v = expand(orig, param_ht)
    print("{}: {!r} -> {!r}".format(param_ht, orig, v))

test("foo", {})
test("{{{foo", {})
test("{{{foo}}}", {})
test("a{{{foo}}}b", {"foo": "bar"})
templates["foo"] = "FOO{{{1|-noarg}}}"
test("a{{foo}}b", {})
test("a{{foo||}}b", {})
test("a{{foo|11|22}}b", {})
test("a{{foo|x=xx|11|22}}b", {})
test("{{Arachnida Hypernyms}}", {})
sys.exit(1)

all_defs = "".join(modules.values())

cnts = collections.defaultdict(int)

for m in re.finditer(r"""[^a-zA-Z0-9._]([a-zA-Z0-9._]+)""", all_defs):
    t = m.group(1)
    if not t:
        continue
    if t.isdigit():
        continue
    cnts[t] += 1

for k, v in sorted(cnts.items(), key=lambda x: x[1], reverse=True):
    print(v, k)

# XXX implement mw.language

# XXX implement mw.uri.decode

# XXX try to reduce unnecessary evaluation of arguments; only evaluate
# them when actually used (esp. frame.args on #invoke)

# XXX cite-meta calls #invoke for string:sub and gives wrong argument type

# XXX must convert arguments to #invoke to lua strings from Python strings
# Lua strings e.g. have a gsub method

# XXX mw.ustring is not same as ustring; see
# https://www.mediawiki.org/wiki/Extension:Scribunto/Lua_reference_manual#Ustring_patterns and notes on e.g. upper and lower calling mw.language versions