# cnxmlplus2html

#!/usr/bin/env python
from lxml import etree
import sys
import os
import argparse
import copy
import inspect

from XmlValidator import XmlValidator
from XmlValidator import utils
from xml.sax.saxutils import unescape, escape


def escape_code(document):
    '''Return document with the contents of code/math environments escaped.

    The text between each of the following start/end marker pairs has its
    ``&``, ``<`` and ``>`` characters replaced by XML entities:

        <code> ... </code>
        \\( ... \\)
        \\[ ... \\]
        \\begin{align*} ... \\end{align*}

    Text outside these environments, and the markers themselves, are left
    untouched.  A start marker with no following end marker is left as is.

    The environments are processed one after the other over the whole
    document, so text that sits inside two kinds of environment at once
    (e.g. inline math inside <code>) is escaped once per environment.
    '''
    envs = [('<code>', r'</code>'),
            (r'\(', r'\)'),
            (r'\[', r'\]'),
            (r'\begin{align*}', r'\end{align*}')]
    for start, end in envs:
        segments = document.split(start)
        # segments[0] is the text *before* the first start marker, so it is
        # never environment content and must not be escaped, even when it
        # happens to contain a stray end marker.
        escaped_segments = [segments[0]]
        for segment in segments[1:]:
            code, marker, rest = segment.partition(end)
            if marker:
                # Escape everything up to the end marker, whatever its
                # length (including single-character content such as '&').
                escaped_segments.append(escape(code) + marker + rest)
            else:
                # Unterminated environment: leave the segment unchanged.
                escaped_segments.append(segment)
        document = start.join(escaped_segments)
    return document

if __name__ == "__main__":
    # Directory containing this script.
    MY_PATH = os.path.realpath(os.path.dirname(__file__))
    # Directory of the file defining XmlValidator; --spec is resolved
    # relative to this directory.
    SPEC_PATH = os.path.dirname(inspect.getfile(XmlValidator))

    # Parse command line arguments
    argumentParser = argparse.ArgumentParser(
        description='Convert a CNXML+ document to a HTML document.')
    argumentParser.add_argument(
        '--spec', dest='specFilename',
        default="spec.xml",
        help='Filename of the XML specification document.')
    argumentParser.add_argument(
        '--audience', dest='audience',
        default="learner",
        help='Target audience of the transform ( learner | teacher | correct ).')
    argumentParser.add_argument(
        '-o', dest='outputFilename',
        help='Write output to given filename rather than stdout.')
    argumentParser.add_argument(
        'filename', nargs='+',
        help='One or more filenames to process.')
    commandlineArguments = argumentParser.parse_args()

    # Destination stream for the generated HTML.
    if commandlineArguments.outputFilename is None:
        outputFile = sys.stdout
    else:
        outputFile = open(commandlineArguments.outputFilename, 'wt')

    # Read the specification through a context manager so the file handle
    # is closed as soon as its contents have been handed to the validator.
    specPath = os.path.join(SPEC_PATH, commandlineArguments.specFilename)
    with open(specPath, 'rt') as specFile:
        validator = XmlValidator(specFile.read())

    mathml_transform = utils.MmlTex()

    # Cache of compiled conversion functions, keyed by specification entry.
    conversionFunctions = {}

    def _wrap_conversion_source(iSource):
        '''Return Python source that binds the name ``conversionFunction``.

        iSource is the text of a conversion-callback element (None for an
        element without text).  Blank text yields a function that returns
        None; otherwise the stripped text becomes the tab-indented body of
        ``def conversionFunction(self)``.
        '''
        body = (iSource or '').strip()
        if body == '':
            return 'conversionFunction = lambda self: None'
        indented = '\n'.join(['\t' + line for line in body.split('\n')])
        return 'def conversionFunction(self):\n' + indented + '\n'

    def cache_conversion_function(iSpec):
        '''Return the conversion function for a specification entry.

        iSpec is either a specification entry node, or a string that is
        compared against the text of each entry's ``xpath`` child in
        ``validator.spec`` to find the entry.

        The function is built from the entry's ``conversion-callback``
        element whose name contains "html" and the selected audience.
        Callbacks whose name does not contain "html5" are preferred; if
        there are none, any callback matching "html" and the audience is
        used.  When nothing matches, a warning is reported and a function
        returning None is used.  Compiled functions are stored in
        ``conversionFunctions`` so each entry is compiled only once.

        Raises ValueError when no specification entry can be determined.
        '''
        if isinstance(iSpec, basestring):
            # xpath given rather than node
            specEntry = None
            for child in validator.spec:
                xpathNode = child.find('xpath')
                if xpathNode is not None and xpathNode.text == iSpec:
                    specEntry = child
                    break
        else:
            specEntry = iSpec

        if specEntry is None:
            # Fail with a meaningful message instead of an AttributeError
            # on None further down.
            raise ValueError(
                'No specification entry found for %r' % (iSpec,))

        conversionFunction = conversionFunctions.get(specEntry)
        if conversionFunction is not None:
            return conversionFunction

        audience = commandlineArguments.audience
        callbackNodes = specEntry.xpath(
            './/conversion-callback[contains(@name, "html") and contains(@name, "%s") and not(contains(@name, "html5"))]' % audience)
        if callbackNodes:
            if len(callbackNodes) != 1:
                utils.error_message(
                    'More than 1 conversion entry for ' + specEntry.find('xpath').text)
            conversionFunctionSource = _wrap_conversion_source(
                callbackNodes[0].text)
        else:
            # Fall back to any html callback for this audience (this query
            # no longer excludes html5 ones).
            callbackNodes = specEntry.xpath(
                './/conversion-callback[contains(@name, "html") and contains(@name, "%s")]' % audience)
            if callbackNodes:
                conversionFunctionSource = _wrap_conversion_source(
                    callbackNodes[0].text)
            else:
                utils.warning_message(
                    'No conversion entry for ' + specEntry.find('xpath').text)
                conversionFunctionSource = 'conversionFunction = lambda self: None'

        # Names made available to the callback code.
        localVars = {
            'copy': copy,
            'os': os,
            'etree': etree,
            'utils': utils,
            'convert_using': convert_using,
            'warning_message': utils.warning_message,
            'error_message': utils.error_message,
            'mathml_transform': mathml_transform,
            'escape_latex': utils.escape_latex,
            'latex_math_function_check': utils.latex_math_function_check,
        }
        # SECURITY NOTE: this executes Python source taken verbatim from the
        # specification document; only run with a trusted specification.
        exec(conversionFunctionSource, localVars)
        conversionFunction = localVars['conversionFunction']
        conversionFunctions[specEntry] = conversionFunction

        return conversionFunction

    def convert_using(iNode, iPath):
        '''Apply the conversion function that cache_conversion_function
        returns for iPath to iNode, and return the result.'''
        return cache_conversion_function(iPath)(iNode)

    def convert_image(iSourceFilename, iDestinationFilename):
        '''Run the external ``convert`` program with the source and
        destination filenames as arguments and wait for it to finish.

        The exit status of the program is not checked.
        '''
        from subprocess import Popen
        command = ['convert', iSourceFilename, iDestinationFilename]
        Popen(command).wait()

    def traverse(iNode, iValidator):
        '''Convert iNode and all of its descendants, children first.

        Each node is passed to the conversion function of its specification
        entry (looked up in ``iValidator.documentSpecEntries``).  For a node
        that has a parent, the node is replaced in the parent by the result:
        a string result is unescaped and wrapped in a ``dummy`` element,
        any other non-None result is inserted as is.  Nothing is returned
        in that case.

        For a node without a parent (the root), the result is returned
        instead: a string is returned unchanged, any other non-None result
        is returned after being passed through ``unescape``.

        A node whose conversion function returns None is left in place.
        If the conversion function raises, the error is printed and the
        program exits with status 1.
        '''
        # Take a snapshot of the children: converting a child replaces it
        # inside iNode while we are iterating.
        for child in list(iNode):
            traverse(child, iValidator)

        # Get associated conversion function
        specEntry = iValidator.documentSpecEntries.get(iNode)
        if specEntry is None:
            utils.error_message(
                'Unhandled element at ' + utils.get_full_dom_path(iNode, iValidator.spec))
        conversionFunction = cache_conversion_function(specEntry)
        parent = iNode.getparent()
        try:
            converted = conversionFunction(iNode)
        except Exception as error:
            # The root node has no parent; guard the lookup so that the
            # report itself cannot fail and hide the original error.
            parentTag = parent.tag if parent is not None else None
            print('Error: %s %s\nNode: %s\n Parent: %s\n line: %s' % (
                error, type(error), iNode.tag, parentTag, iNode.sourceline))
            sys.exit(1)

        if converted is None:
            return None

        if isinstance(converted, basestring):
            if parent is None:
                return converted
            dummyNode = etree.Element('dummy')
            dummyNode.text = unescape(converted)
            utils.etree_replace_with_node_list(parent, iNode, dummyNode)
        else:
            if parent is None:
                # NOTE(review): unescape() is applied to a non-string
                # result here but not to a string result above - confirm
                # this is intended.
                return unescape(converted)
            utils.etree_replace_with_node_list(parent, iNode, converted)

    # Convert every input document in turn and write the resulting HTML
    # page to outputFile (stdout unless -o was given).
    for filename in commandlineArguments.filename:
        if filename == '-':
            source = sys.stdin.read()
        else:
            # Close the input file as soon as it has been read.
            with open(filename, 'rt') as fp:
                source = fp.read()
        validator.validate(
            source,
            iCleanUp=True)
        document = validator.dom

        # Page title derived from the filename.
        # TODO: capitalise first letter (not done yet).
        Title = filename.replace('.cnxmlplus', '').replace('-', ' ')

        outputdoc = traverse(document, validator).encode('utf-8')
        htmloutputdoc = '''<!DOCTYPE html>
    <html>
      <head>
        <title>{Title}</title>
      </head>
      <body>
        {Content}
      </body>
    </html>'''.format(Content=outputdoc, Title=Title)
        htmloutputdoc = escape_code(htmloutputdoc)
        # Honour the -o option: write to the selected output stream rather
        # than unconditionally printing to stdout.
        outputFile.write(htmloutputdoc + '\n')

    if outputFile is not sys.stdout:
        outputFile.close()
#       outputFile.flush()

        # clean up
#       for f in ["figure-autopp.cb",
#                 "figure.aux",
#                 "figure.cb",
#                 "figure.cb2",
#                 "figure.epsi",
#                 "figure.log",
#                 "figure.pdf",
#                 "figure-pics.pdf",
#                 "figure.png",
#                 "figure.ps",
#                 "figure.tex"]:
#           if os.path.exists(f):
#               os.remove(f)