From a06dd866ce95392ae2706fa97ef76f7e975208df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?=
 <1929830+PonteIneptique@users.noreply.github.com>
Date: Sat, 24 Aug 2024 17:59:16 +0200
Subject: [PATCH] Ability to represent Collection and Resources (#3)

* [WIP] Adding a catalog solution

* [WIP] Some basic work-around for children/parents

* [WIP] Moved to a Catalog object

* [WIP] Test catalog ingestion

* Adding flask for the coming up adapter for WEB API

* [WIP] Heavy changes to package structure, and the processor can now use different version

* [WIP] Fixing tests

* Fixing stuff

* Adding test for relationships

* Updating checkout version

* [WIP] Cataloguing

* Parents are parenting
---
 .github/workflows/test.yml                |  2 +-
 dapitains/{local => app}/__init__.py      |  0
 dapitains/constants.py                    | 24 +++++-
 dapitains/metadata/__init__.py            |  0
 dapitains/metadata/classes.py             | 51 ++++++++++++
 dapitains/metadata/xml_parser.py          | 97 +++++++++++++++++++++++
 dapitains/tei/__init__.py                 |  0
 dapitains/{local => tei}/citeStructure.py | 15 ++--
 dapitains/{local => tei}/tei.py           | 13 ++-
 requirements.txt                          |  5 +-
 tests/catalog/example-collection.xml      |  3 +-
 tests/catalog/example-sub-collection.xml  | 23 ++++++
 tests/test_catalog.py                     | 65 +++++++++++++++
 tests/test_citeStructure.py               |  2 +-
 tests/test_tei.py                         |  2 +-
 15 files changed, 278 insertions(+), 24 deletions(-)
 rename dapitains/{local => app}/__init__.py (100%)
 create mode 100644 dapitains/metadata/__init__.py
 create mode 100644 dapitains/metadata/classes.py
 create mode 100644 dapitains/metadata/xml_parser.py
 create mode 100644 dapitains/tei/__init__.py
 rename dapitains/{local => tei}/citeStructure.py (96%)
 rename dapitains/{local => tei}/tei.py (96%)
 create mode 100644 tests/catalog/example-sub-collection.xml
 create mode 100644 tests/test_catalog.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 8fe5ced..f057f9a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -12,7 +12,7 @@ jobs:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11"]
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python
       uses: actions/setup-python@v2
       with:
diff --git a/dapitains/local/__init__.py b/dapitains/app/__init__.py
similarity index 100%
rename from dapitains/local/__init__.py
rename to dapitains/app/__init__.py
diff --git a/dapitains/constants.py b/dapitains/constants.py
index 6e6ee66..b85f31e 100644
--- a/dapitains/constants.py
+++ b/dapitains/constants.py
@@ -1,12 +1,28 @@
+import logging
+import os
+
 try:
-    from saxonche import PySaxonProcessor, PyXdmNode, PyXPathProcessor
+    saxon_version = os.getenv("pysaxon", "HE")  # Expected values: "HE" (default), "PE" or "EE"
+    saxon_license = os.getenv("pysaxon_license", "")
+    logging.info(f"Using SaxonLib {saxon_version}")
+    if saxon_version == "HE":
+        import saxonche as saxonlib
+        PROCESSOR = saxonlib.PySaxonProcessor()
+    elif saxon_version == "PE":
+        import saxoncpe as saxonlib
+        PROCESSOR = saxonlib.PySaxonProcessor(license=saxon_license)
+    elif saxon_version == "EE":  # Was a second "PE" test, which made the saxoncee branch unreachable
+        import saxoncee as saxonlib
+        PROCESSOR = saxonlib.PySaxonProcessor(license=saxon_license)
 except ImportError:
-    print("PySaxonC-HE not found")
+    print("Unable to import the required PySaxonC version, resorting to PySaxonC-HE")
+    import saxonche as saxonlib
+    PROCESSOR = saxonlib.PySaxonProcessor()
+
 
-PROCESSOR = PySaxonProcessor()
 
 
-def get_xpath_proc(elem: PyXdmNode) -> PyXPathProcessor:
+def get_xpath_proc(elem: saxonlib.PyXdmNode) -> saxonlib.PyXPathProcessor:
     """ Builds an XPath processor around a given element, with the default TEI namespace
 
     :param elem: An XML node, root or not
diff --git a/dapitains/metadata/__init__.py b/dapitains/metadata/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dapitains/metadata/classes.py b/dapitains/metadata/classes.py
new file mode 100644
index 0000000..a77f694
--- /dev/null
+++ b/dapitains/metadata/classes.py
@@ -0,0 +1,51 @@
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+
+@dataclass
+class DublinCore:
+    term: str  # Local name of the term; json() prefixes it with http://purl.org/dc/terms/
+    value: str  # Text value of the property
+    language: Optional[str] = None  # Language of the value, None when not specified
+
+    def json(self):  # Dict form of the property, with the term expanded to a full dcterms URI
+        return {"property": f"http://purl.org/dc/terms/{self.term}", "value": self.value, "language": self.language}
+
+
+class Extension(DublinCore):  # NOTE(review): no @dataclass here, __init__/__eq__/__repr__ are inherited from DublinCore
+    term: str  # Used verbatim as the property name by json(), no namespace is prepended
+    value: str
+    language: Optional[str] = None
+
+    def json(self):  # Same shape as DublinCore.json(), but "property" is the raw term
+        return {"property": self.term, "value": self.value, "language": self.language}
+
+
+@dataclass
+class Collection:  # Represents both collections and resources, see the `resource` flag
+    identifier: str
+    title: str
+    description: Optional[str] = None
+    dublin_core: List[DublinCore] = field(default_factory=list)
+    extension: List[Extension] = field(default_factory=list)
+    resource: bool = False  # True when the object is a resource rather than a collection
+    filepath: Optional[str] = None  # Path of the underlying file, when one is attached
+
+    def json(self):  # Plain dict of the fields, keyed by field name
+        return {
+            "identifier": self.identifier,
+            "title": self.title,
+            "description": self.description,
+            "dublin_core": self.dublin_core,  # The DublinCore objects themselves, not their json() form
+            "extension": self.extension,  # The Extension objects themselves, not their json() form
+            "resource": self.resource,
+            "filepath": self.filepath
+        }
+
+@dataclass
+class CitableUnit:
+    resource: str  # presumably the identifier of the resource owning the unit — TODO confirm
+    reference: str  # presumably the citation reference of the unit — TODO confirm
+    children: List[str] = field(default_factory=list)  # presumably references of child units — TODO confirm
+    dublin_core: List[DublinCore] = field(default_factory=list)
+    extension: List[Extension] = field(default_factory=list)
diff --git a/dapitains/metadata/xml_parser.py b/dapitains/metadata/xml_parser.py
new file mode 100644
index 0000000..c33c8a4
--- /dev/null
+++ b/dapitains/metadata/xml_parser.py
@@ -0,0 +1,97 @@
+import os.path
+import re
+from typing import Dict, Optional, List, Tuple, Any
+from dataclasses import dataclass, field
+import lxml.etree as ET
+from dapitains.metadata.classes import DublinCore, Extension, Collection
+
+
+_re_tag = re.compile(r"[{}]")
+
+
+@dataclass
+class Catalog:
+    relationships: List[Tuple[str, str]] = field(default_factory=list)  # (parent identifier, child identifier) pairs
+    objects: Dict[str, Collection] = field(default_factory=dict)  # Parsed collections and resources, keyed by identifier
+
+
+def parse_metadata(xml: ET.Element) -> Tuple[Dict[str, Any], List[str]]:
+    """ Parse the metadata carried by a Collection or Resource tag
+
+    :param xml: Collection/Resource tag
+    :returns: Keyword arguments for a Collection object, and the identifiers found in ./parent
+    """
+    obj = {
+        "identifier": xml.attrib["identifier"],  # KeyError when the attribute is missing
+        "title": xml.xpath("./title[1]/text()")[0],  # IndexError when there is no title text
+        "description": (xml.xpath("./description[1]/text()") or [None])[0]
+    }
+    # Treat Dublin Core
+    dublin_core = []
+    for node in xml.xpath("./dublinCore/*"):
+        tag = node.tag.split("}")[-1]  # Keep the local name only, dropping the {namespace} prefix
+        language = node.attrib.get("{http://www.w3.org/XML/1998/namespace}lang")
+        text = node.text
+        dublin_core.append(DublinCore(tag, text, language))
+    if dublin_core:
+        obj["dublin_core"] = dublin_core
+
+    # Treat Extension
+    extensions = []
+    for node in xml.xpath("./extension/*"):
+        tag = _re_tag.sub("", node.tag)  # Namespace and local name concatenated, braces removed
+        language = node.attrib.get("{http://www.w3.org/XML/1998/namespace}lang")
+        text = node.text
+        extensions.append(Extension(tag, text, language))
+    if extensions:
+        obj["extension"] = extensions  # Key must match the Collection.extension field, obj is splatted into it
+
+    # Parents
+    parents = []
+    for node in xml.xpath("./parent/text()"):
+        parents.append(str(node))
+
+    return obj, parents
+
+
+def parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection:
+    """ Parse a Collection or Resource element and register it, with its members, in the catalog
+
+    :param xml: Parsed Collection or Resource by LXML
+    :param basedir: Directory used to resolve filepath, that are relative to the main object
+    :param tree: Catalog that is updated with objects and relationships; the parsed object is returned
+    """
+    obj, parents = parse_metadata(xml)
+    obj = Collection(**obj, resource=xml.tag == "resource")
+    for parent in parents:  # Explicit <parent> declarations become (parent, child) relationships
+        tree.relationships.append((parent, obj.identifier))
+    tree.objects[obj.identifier] = obj
+    if xml.attrib.get("filepath") and obj.resource:  # Only resources keep a filepath
+        obj.filepath = os.path.normpath(os.path.join(basedir, xml.attrib["filepath"]))
+    for member in xml.xpath("./members/*"):
+        if member.xpath("./title"):  # Inline member: it carries its own metadata
+            child = parse_collection(member, basedir, tree)
+            tree.relationships.append((obj.identifier, child.identifier))
+        else:  # Member without a title: treated as a reference to another catalog file through @filepath
+            _, child = ingest_catalog(os.path.join(basedir, member.attrib["filepath"]), tree)
+            tree.relationships.append((obj.identifier, child.identifier))
+    return obj
+
+
+def ingest_catalog(path: str, tree: Optional[Catalog] = None) -> Tuple[Catalog, Collection]:
+    """ Ingest a collection description file, following the files its members point to.
+
+    :param path: Path to a Collection XML File, see the schema at tests/catalog/schema.rng
+    :param tree: Current catalog, which is either updated or created
+    :return: Catalog and root collection found at path.
+
+    >>> ingest_catalog("../../tests/catalog/example-collection.xml")
+    """
+    xml = ET.parse(path)
+    current_dir = os.path.abspath(os.path.dirname(path))  # Member filepaths are resolved against this directory
+
+    root: ET.Element = xml.getroot()
+    tree = tree or Catalog()  # Start a fresh catalog when none is passed
+    root_collection = parse_collection(root, basedir=current_dir, tree=tree)
+    return tree, root_collection
+
diff --git a/dapitains/tei/__init__.py b/dapitains/tei/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dapitains/local/citeStructure.py b/dapitains/tei/citeStructure.py
similarity index 96%
rename from dapitains/local/citeStructure.py
rename to dapitains/tei/citeStructure.py
index 6547972..79e866d 100644
--- a/dapitains/local/citeStructure.py
+++ b/dapitains/tei/citeStructure.py
@@ -1,10 +1,9 @@
 import re
 from typing import Dict, List, Optional
 from dataclasses import dataclass, field
-from saxonche import PyXdmNode, PyXPathProcessor
 from collections import namedtuple, defaultdict
 from functools import cmp_to_key
-from dapitains.constants import PROCESSOR, get_xpath_proc
+from dapitains.constants import get_xpath_proc, saxonlib
 
 
 @dataclass
@@ -45,7 +44,7 @@ class CitableUnit:
     citeType: str
     ref: str
     children: List["CitableUnit"] = field(default_factory=list)
-    node: Optional[PyXdmNode] = None
+    node: Optional[saxonlib.PyXdmNode] = None
     dublinCore: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list))
     extension: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list))
 
@@ -69,7 +68,7 @@ def to_dts(self):
 _simple_node = namedtuple("SimpleNode", ["citation", "xpath", "struct"])
 
 
-def get_children_cite_structures(elem: PyXdmNode) -> List[PyXdmNode]:
+def get_children_cite_structures(elem: saxonlib.PyXdmNode) -> List[saxonlib.PyXdmNode]:
     xpath = get_xpath_proc(elem=elem).evaluate("./citeStructure")
     if xpath is not None:
         return list(iter(xpath))
@@ -82,7 +81,7 @@ class CiteStructureParser:
     ToDo: Add the ability to use CiteData. This will mean moving from len(element) to len(element.xpath("./citeStructure"))
     ToDo: Add the ability to use citationTree labels
     """
-    def __init__(self, root: PyXdmNode):
+    def __init__(self, root: saxonlib.PyXdmNode):
         self.root = root
         self.xpath_matcher: Dict[str, str] = {}
         self.regex_pattern, cite_structure = self.build_regex_and_xpath(
@@ -189,7 +188,7 @@ def _dispatch(
             self,
             child_xpath: str,
             structure: CitableStructure,
-            xpath_processor: PyXPathProcessor,
+            xpath_processor: saxonlib.PyXPathProcessor,
             unit: CitableUnit):
         # target = self.generate_xpath(child.ref)
         if len(structure.children) == 1:
@@ -207,7 +206,7 @@ def _dispatch(
 
     def find_refs(
             self,
-            root: PyXdmNode,
+            root: saxonlib.PyXdmNode,
             structure: CitableStructure = None,
             unit: Optional[CitableUnit] = None
     ) -> List[CitableUnit]:
@@ -245,7 +244,7 @@ def find_refs(
 
     def find_refs_from_branches(
             self,
-            root: PyXdmNode,
+            root: saxonlib.PyXdmNode,
             structure: List[CitableStructure],
             unit: Optional[CitableUnit] = None
     ) -> List[CitableUnit]:
diff --git a/dapitains/local/tei.py b/dapitains/tei/tei.py
similarity index 96%
rename from dapitains/local/tei.py
rename to dapitains/tei/tei.py
index 01c8406..6ed7299 100644
--- a/dapitains/local/tei.py
+++ b/dapitains/tei/tei.py
@@ -1,10 +1,9 @@
-from dapitains.local.citeStructure import CiteStructureParser
-from dapitains.constants import PROCESSOR, get_xpath_proc
+from dapitains.tei.citeStructure import CiteStructureParser
+from dapitains.constants import PROCESSOR, get_xpath_proc, saxonlib
 from typing import Optional, List, Tuple, Dict
 from lxml.etree import fromstring
 from lxml.objectify import Element, SubElement
 from lxml import objectify
-from saxonche import PyXdmNode, PyXPathProcessor
 import re
 from dapitains.errors import UnknownTreeName
 
@@ -31,7 +30,7 @@ def xpath_walk(xpath: List[str]) -> Tuple[str, List[str]]:
     return current, queue
 
 
-def is_traversing_xpath(parent: PyXdmNode, xpath: str) -> bool:
+def is_traversing_xpath(parent: saxonlib.PyXdmNode, xpath: str) -> bool:
     """ Check if an XPath is traversing more than one level
 
     :param parent:
@@ -49,7 +48,7 @@ def is_traversing_xpath(parent: PyXdmNode, xpath: str) -> bool:
     return False
 
 
-def xpath_walk_step(parent: PyXdmNode, xpath: str) -> Tuple[PyXdmNode, bool]:
+def xpath_walk_step(parent: saxonlib.PyXdmNode, xpath: str) -> Tuple[saxonlib.PyXdmNode, bool]:
     """ Perform an XPath on an element to find a child that is part of the XPath.
     If the child is a direct member of the path, returns a False boolean indicating to move
         onto the next element.
@@ -71,7 +70,7 @@ def xpath_walk_step(parent: PyXdmNode, xpath: str) -> Tuple[PyXdmNode, bool]:
         return xpath_proc.evaluate_single(xpath), False
 
 
-def copy_node(node: PyXdmNode, include_children=False, parent: Optional[Element] = None):
+def copy_node(node: saxonlib.PyXdmNode, include_children=False, parent: Optional[Element] = None):
     """ Copy an XML Node
 
     :param node: Etree Node
@@ -124,7 +123,7 @@ def normalize_xpath(xpath: List[str]) -> List[str]:
 
 
 def reconstruct_doc(
-    root: PyXdmNode,
+    root: saxonlib.PyXdmNode,
     start_xpath: List[str],
     new_tree: Optional[Element] = None,
     end_xpath: Optional[List[str]] = None
diff --git a/requirements.txt b/requirements.txt
index 3fdfa2c..980dd76 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
 saxonche==12.5.0
-lxml
\ No newline at end of file
+lxml
+flask
+flask-sqlalchemy
+click
\ No newline at end of file
diff --git a/tests/catalog/example-collection.xml b/tests/catalog/example-collection.xml
index 122a343..72d2047 100644
--- a/tests/catalog/example-collection.xml
+++ b/tests/catalog/example-collection.xml
@@ -7,7 +7,8 @@
         <abstract xmlns="http://purl.org/dc/terms/" xml:lang="fr">Et je peux traduire en français</abstract>
     </dublinCore>
     <members>
-        <resource identifier="https://foo.bar/text" path="../tei/base_tei.xml">
+        <collection filepath="./example-sub-collection.xml"/>
+        <resource identifier="https://foo.bar/text" filepath="../tei/base_tei.xml">
             <title>A simple resource</title>
             <description>With a description</description>
             <dublinCore>
diff --git a/tests/catalog/example-sub-collection.xml b/tests/catalog/example-sub-collection.xml
new file mode 100644
index 0000000..f5798c4
--- /dev/null
+++ b/tests/catalog/example-sub-collection.xml
@@ -0,0 +1,23 @@
+<collection identifier="https://example.org/collection1">
+    <title>My First Collection</title>
+    <dublinCore>
+        <creator xmlns="http://purl.org/dc/terms/">John Doe</creator>
+        <subject xmlns="http://purl.org/dc/terms/">History</subject>
+        <date xmlns="http://purl.org/dc/terms/">2023-08-24</date>
+    </dublinCore>
+    <members>
+        <resource identifier="https://example.org/resource1" filepath="../tei/multiple_tree.xml">
+            <title>Historical Document</title>
+            <description>A document about historical events.</description>
+            <parent>https://foo.bar/default</parent>
+            <dublinCore>
+                <subject xmlns="http://purl.org/dc/terms/">World War II</subject>
+                <language xmlns="http://purl.org/dc/terms/">en</language>
+            </dublinCore>
+            <extensions>
+                <rating xmlns="https://example.org/rating">5 stars</rating>
+                <comment xmlns="https://example.org/comment">Very informative document.</comment>
+            </extensions>
+        </resource>
+    </members>
+</collection>
diff --git a/tests/test_catalog.py b/tests/test_catalog.py
new file mode 100644
index 0000000..c376624
--- /dev/null
+++ b/tests/test_catalog.py
@@ -0,0 +1,65 @@
+import os.path
+
+from dapitains.metadata.xml_parser import ingest_catalog
+from dapitains.metadata.classes import *
+
+
+local_dir = os.path.join(os.path.dirname(__file__))
+
+
+def test_ingestion():
+    tree, _ = ingest_catalog(f"{local_dir}/catalog/example-collection.xml")  # Also pulls in example-sub-collection.xml
+
+    assert tree.objects == {
+        "https://foo.bar/default": Collection(
+            identifier='https://foo.bar/default',
+            title='A collection', description=None,
+            dublin_core=[
+                DublinCore(term='abstract', value='This is a perfect example of an absract.', language=None),
+                DublinCore(term='abstract', value='Et je peux traduire en français', language='fr')], extension=[],
+            resource=False,
+            filepath=None
+        ),
+        "https://example.org/collection1": Collection(
+            identifier='https://example.org/collection1',
+            title='My First Collection',
+            description=None,
+            dublin_core=[
+                DublinCore(term='creator', value='John Doe', language=None),
+                DublinCore(term='subject', value='History', language=None),
+                DublinCore(term='date', value='2023-08-24', language=None)
+            ],
+            extension=[],
+            resource=False,
+            filepath=None
+        ),
+        "https://example.org/resource1": Collection(
+            identifier='https://example.org/resource1',
+            title='Historical Document',
+            description='A document about historical events.',
+            dublin_core=[
+                DublinCore(term='subject', value='World War II', language=None),
+                DublinCore(term='language', value='en', language=None)
+            ],
+            extension=[], resource=True,
+            filepath=os.path.abspath(f"{local_dir}/tei/multiple_tree.xml")  # Resolved from the declaring file's directory
+        ),
+        "https://foo.bar/text": Collection(
+            identifier='https://foo.bar/text',
+            title='A simple resource',
+            description='With a description',
+            dublin_core=[
+                DublinCore(term='title', value='A simple resource', language=None)
+            ],
+            extension=[],
+            resource=True,
+            filepath=os.path.abspath(f"{local_dir}/tei/base_tei.xml")
+        )
+    }
+
+    assert sorted(tree.relationships) == [
+        ('https://example.org/collection1', 'https://example.org/resource1'),
+        ('https://foo.bar/default', 'https://example.org/collection1'),
+        ('https://foo.bar/default', 'https://example.org/resource1',),  # Declared through <parent> in the sub-collection
+        ('https://foo.bar/default', 'https://foo.bar/text')
+    ]
diff --git a/tests/test_citeStructure.py b/tests/test_citeStructure.py
index c0f3d39..8154fd9 100644
--- a/tests/test_citeStructure.py
+++ b/tests/test_citeStructure.py
@@ -1,4 +1,4 @@
-from dapitains.local.citeStructure import CiteStructureParser
+from dapitains.tei.citeStructure import CiteStructureParser
 from dapitains.constants import PROCESSOR, get_xpath_proc
 import os.path
 import pytest
diff --git a/tests/test_tei.py b/tests/test_tei.py
index 8f0120d..a9f367e 100644
--- a/tests/test_tei.py
+++ b/tests/test_tei.py
@@ -1,7 +1,7 @@
 import os.path
 
 import pytest
-from dapitains.local.tei import Document
+from dapitains.tei.tei import Document
 from lxml.etree import tostring
 
 local_dir = os.path.join(os.path.dirname(__file__), "tei")