From a06dd866ce95392ae2706fa97ef76f7e975208df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?=
<1929830+PonteIneptique@users.noreply.github.com>
Date: Sat, 24 Aug 2024 17:59:16 +0200
Subject: [PATCH] Ability to represent Collection and Resources (#3)
* [WIP] Adding a catalog solution
* [WIP] Some basic work-around for children/parents
* [WIP] Moved to a Catalog object
* [WIP] Test catalog ingestion
* Adding flask for the upcoming Web API adapter
* [WIP] Heavy changes to package structure; the processor can now use different Saxon editions (HE/PE/EE)
* [WIP] Fixing tests
* Fixing stuff
* Adding test for relationships
* Updating checkout version
* [WIP] Cataloguing
* Parents are parenting
---
.github/workflows/test.yml | 2 +-
dapitains/{local => app}/__init__.py | 0
dapitains/constants.py | 24 +++++-
dapitains/metadata/__init__.py | 0
dapitains/metadata/classes.py | 51 ++++++++++++
dapitains/metadata/xml_parser.py | 97 +++++++++++++++++++++++
dapitains/tei/__init__.py | 0
dapitains/{local => tei}/citeStructure.py | 15 ++--
dapitains/{local => tei}/tei.py | 13 ++-
requirements.txt | 5 +-
tests/catalog/example-collection.xml | 3 +-
tests/catalog/example-sub-collection.xml | 23 ++++++
tests/test_catalog.py | 65 +++++++++++++++
tests/test_citeStructure.py | 2 +-
tests/test_tei.py | 2 +-
15 files changed, 278 insertions(+), 24 deletions(-)
rename dapitains/{local => app}/__init__.py (100%)
create mode 100644 dapitains/metadata/__init__.py
create mode 100644 dapitains/metadata/classes.py
create mode 100644 dapitains/metadata/xml_parser.py
create mode 100644 dapitains/tei/__init__.py
rename dapitains/{local => tei}/citeStructure.py (96%)
rename dapitains/{local => tei}/tei.py (96%)
create mode 100644 tests/catalog/example-sub-collection.xml
create mode 100644 tests/test_catalog.py
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 8fe5ced..f057f9a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -12,7 +12,7 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v2
with:
diff --git a/dapitains/local/__init__.py b/dapitains/app/__init__.py
similarity index 100%
rename from dapitains/local/__init__.py
rename to dapitains/app/__init__.py
diff --git a/dapitains/constants.py b/dapitains/constants.py
index 6e6ee66..b85f31e 100644
--- a/dapitains/constants.py
+++ b/dapitains/constants.py
@@ -1,12 +1,28 @@
+import logging
+import os
+
try:
- from saxonche import PySaxonProcessor, PyXdmNode, PyXPathProcessor
+    saxon_version = os.getenv("pysaxon", "HE")  # edition to load: "HE" (default), "PE" or "EE"
+    saxon_license = os.getenv("pysaxon_license", "")  # only handed to the PE/EE processors
+    logging.info(f"Using SaxonLib {saxon_version}")
+    if saxon_version == "PE":
+        import saxoncpe as saxonlib
+        PROCESSOR = saxonlib.PySaxonProcessor(license=saxon_license)
+    elif saxon_version == "EE":  # must differ from the "PE" test above, else saxoncee is never loaded
+        import saxoncee as saxonlib
+        PROCESSOR = saxonlib.PySaxonProcessor(license=saxon_license)
+    else:  # "HE" and any unrecognised value: saxonlib and PROCESSOR are always bound
+        import saxonche as saxonlib
+        PROCESSOR = saxonlib.PySaxonProcessor()
except ImportError:
- print("PySaxonC-HE not found")
+ print("Unable to import the required PySaxonC version, resorting to PySaxonC-HE")
+ import saxonche as saxonlib
+ PROCESSOR = saxonlib.PySaxonProcessor()
+
-PROCESSOR = PySaxonProcessor()
-def get_xpath_proc(elem: PyXdmNode) -> PyXPathProcessor:
+def get_xpath_proc(elem: saxonlib.PyXdmNode) -> saxonlib.PyXPathProcessor:
""" Builds an XPath processor around a given element, with the default TEI namespace
:param elem: An XML node, root or not
diff --git a/dapitains/metadata/__init__.py b/dapitains/metadata/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dapitains/metadata/classes.py b/dapitains/metadata/classes.py
new file mode 100644
index 0000000..a77f694
--- /dev/null
+++ b/dapitains/metadata/classes.py
@@ -0,0 +1,51 @@
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+
+@dataclass
+class DublinCore:
+    term: str  # term name; json() appends it to the http://purl.org/dc/terms/ prefix
+    value: str  # value carried by the property
+    language: Optional[str] = None  # language of the value, None when not given
+
+    def json(self):  # dict with the keys "property", "value" and "language"
+        return dict(property=f"http://purl.org/dc/terms/{self.term}", value=self.value, language=self.language)
+
+
+class Extension(DublinCore):  # not decorated itself: __init__/__eq__/__repr__ come from DublinCore
+    term: str  # used verbatim as the property name by json()
+    value: str
+    language: Optional[str] = None
+
+    def json(self):  # same keys as DublinCore.json(), without the Dublin Core prefix
+        return dict(property=self.term, value=self.value, language=self.language)
+
+
+@dataclass
+class Collection:
+    identifier: str
+    title: str
+    description: Optional[str] = None
+    dublin_core: List[DublinCore] = field(default_factory=list)
+    extension: List[Extension] = field(default_factory=list)
+    resource: bool = False  # True for a resource, False for a plain collection
+    filepath: Optional[str] = None
+
+    def json(self):
+        # Export every field, in declaration order, under its own name.
+        # NOTE(review): dublin_core and extension are exported as lists of objects,
+        # not as the dicts produced by their own json() -- confirm this is intended.
+        keys = (
+            "identifier", "title", "description",
+            "dublin_core", "extension",
+            "resource", "filepath",
+        )
+        return {key: getattr(self, key) for key in keys}
+
+@dataclass
+class CitableUnit:  # metadata record for one citable unit; not used elsewhere in this patch
+    resource: str  # presumably the identifier of the owning resource -- TODO confirm
+    reference: str  # presumably the reference of the unit inside that resource -- TODO confirm
+    children: List[str] = field(default_factory=list)  # presumably references of child units -- TODO confirm
+    dublin_core: List[DublinCore] = field(default_factory=list)
+    extension: List[Extension] = field(default_factory=list)
diff --git a/dapitains/metadata/xml_parser.py b/dapitains/metadata/xml_parser.py
new file mode 100644
index 0000000..c33c8a4
--- /dev/null
+++ b/dapitains/metadata/xml_parser.py
@@ -0,0 +1,97 @@
+import os.path
+import re
+from typing import Dict, Optional, List, Tuple, Any
+from dataclasses import dataclass, field
+import lxml.etree as ET
+from dapitains.metadata.classes import DublinCore, Extension, Collection
+
+
+_re_tag = re.compile(r"[{}]")
+
+
+@dataclass
+class Catalog:  # accumulator filled by parse_collection / ingest_catalog
+    relationships: List[Tuple[str, str]] = field(default_factory=list)  # (parent identifier, child identifier) pairs
+    objects: Dict[str, Collection] = field(default_factory=dict)  # parsed Collection objects keyed by identifier
+
+
+def parse_metadata(xml: ET.Element) -> Tuple[Dict[str, Any], List[str]]:
+    """ Parse the metadata of a Collection/Resource element
+
+    :param xml: Collection/Resource tag
+    :returns: Keyword arguments for the Collection constructor, and the text of each ./parent child
+    """
+    obj = {
+        "identifier": xml.attrib["identifier"],
+        "title": xml.xpath("./title[1]/text()")[0],
+        "description": (xml.xpath("./description[1]/text()") or [None])[0]
+    }
+    # Treat Dublin Core
+    dublin_core = []
+    for node in xml.xpath("./dublinCore/*"):
+        tag = node.tag.split("}")[-1]  # keep only what follows the "{namespace}" part of the tag
+        language = node.attrib.get("{http://www.w3.org/XML/1998/namespace}lang")
+        text = node.text
+        dublin_core.append(DublinCore(tag, text, language))
+    if dublin_core:
+        obj["dublin_core"] = dublin_core
+
+    # Treat Extension
+    extensions = []
+    for node in xml.xpath("./extension/*"):
+        tag = _re_tag.sub("", node.tag)  # drop the braces only: "{ns}name" becomes "nsname"
+        language = node.attrib.get("{http://www.w3.org/XML/1998/namespace}lang")
+        text = node.text
+        extensions.append(Extension(tag, text, language))
+    if extensions:
+        obj["extension"] = extensions  # key must match the Collection field name, obj is expanded as **kwargs
+
+    # Parents
+    parents = []
+    for node in xml.xpath("./parent/text()"):
+        parents.append(str(node))
+
+    return obj, parents
+
+
+def parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection:
+    """ Turn a Collection or Resource element into a Collection object and record it, with its members, in the catalog
+
+    :param xml: Parsed Collection or Resource by LXML
+    :param basedir: Directory against which the relative filepath attributes are resolved
+    :param tree: Catalog that is updated with the objects and relationships found.
+    """
+    metadata, parents = parse_metadata(xml)
+    collection = Collection(**metadata, resource=xml.tag == "resource")
+    tree.relationships.extend((parent, collection.identifier) for parent in parents)
+    tree.objects[collection.identifier] = collection
+    filepath = xml.attrib.get("filepath")
+    if filepath and collection.resource:
+        collection.filepath = os.path.normpath(os.path.join(basedir, filepath))
+    for member in xml.xpath("./members/*"):
+        if member.xpath("./title"):  # member described inline
+            child = parse_collection(member, basedir, tree)
+        else:  # member without a title: ingest the file its filepath attribute points to
+            _, child = ingest_catalog(os.path.join(basedir, member.attrib["filepath"]), tree)
+        # Either way, the member is recorded as a child of the current object
+        tree.relationships.append((collection.identifier, child.identifier))
+    return collection
+
+
+def ingest_catalog(path: str, tree: Optional[Catalog] = None) -> Tuple[Catalog, Collection]:
+    """ Read a collection description file and register its content in a catalog.
+
+    :param path: Path to a Collection XML File, see the schema at tests/catalog/schema.rng
+    :param tree: Catalog to update; a new one is created when none is given
+    :return: The catalog and the root collection described by the file at path.
+
+    >>> ingest_catalog("../../tests/catalog/example-collection.xml")
+    """
+    if not tree:
+        tree = Catalog()
+    root: ET.Element = ET.parse(path).getroot()
+    # Relative filepaths found in the file are resolved against the file's own directory
+    current_dir = os.path.abspath(os.path.dirname(path))
+    root_collection = parse_collection(root, basedir=current_dir, tree=tree)
+    return tree, root_collection
+
diff --git a/dapitains/tei/__init__.py b/dapitains/tei/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dapitains/local/citeStructure.py b/dapitains/tei/citeStructure.py
similarity index 96%
rename from dapitains/local/citeStructure.py
rename to dapitains/tei/citeStructure.py
index 6547972..79e866d 100644
--- a/dapitains/local/citeStructure.py
+++ b/dapitains/tei/citeStructure.py
@@ -1,10 +1,9 @@
import re
from typing import Dict, List, Optional
from dataclasses import dataclass, field
-from saxonche import PyXdmNode, PyXPathProcessor
from collections import namedtuple, defaultdict
from functools import cmp_to_key
-from dapitains.constants import PROCESSOR, get_xpath_proc
+from dapitains.constants import get_xpath_proc, saxonlib
@dataclass
@@ -45,7 +44,7 @@ class CitableUnit:
citeType: str
ref: str
children: List["CitableUnit"] = field(default_factory=list)
- node: Optional[PyXdmNode] = None
+ node: Optional[saxonlib.PyXdmNode] = None
dublinCore: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list))
extension: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list))
@@ -69,7 +68,7 @@ def to_dts(self):
_simple_node = namedtuple("SimpleNode", ["citation", "xpath", "struct"])
-def get_children_cite_structures(elem: PyXdmNode) -> List[PyXdmNode]:
+def get_children_cite_structures(elem: saxonlib.PyXdmNode) -> List[saxonlib.PyXdmNode]:
xpath = get_xpath_proc(elem=elem).evaluate("./citeStructure")
if xpath is not None:
return list(iter(xpath))
@@ -82,7 +81,7 @@ class CiteStructureParser:
ToDo: Add the ability to use CiteData. This will mean moving from len(element) to len(element.xpath("./citeStructure"))
ToDo: Add the ability to use citationTree labels
"""
- def __init__(self, root: PyXdmNode):
+ def __init__(self, root: saxonlib.PyXdmNode):
self.root = root
self.xpath_matcher: Dict[str, str] = {}
self.regex_pattern, cite_structure = self.build_regex_and_xpath(
@@ -189,7 +188,7 @@ def _dispatch(
self,
child_xpath: str,
structure: CitableStructure,
- xpath_processor: PyXPathProcessor,
+ xpath_processor: saxonlib.PyXPathProcessor,
unit: CitableUnit):
# target = self.generate_xpath(child.ref)
if len(structure.children) == 1:
@@ -207,7 +206,7 @@ def _dispatch(
def find_refs(
self,
- root: PyXdmNode,
+ root: saxonlib.PyXdmNode,
structure: CitableStructure = None,
unit: Optional[CitableUnit] = None
) -> List[CitableUnit]:
@@ -245,7 +244,7 @@ def find_refs(
def find_refs_from_branches(
self,
- root: PyXdmNode,
+ root: saxonlib.PyXdmNode,
structure: List[CitableStructure],
unit: Optional[CitableUnit] = None
) -> List[CitableUnit]:
diff --git a/dapitains/local/tei.py b/dapitains/tei/tei.py
similarity index 96%
rename from dapitains/local/tei.py
rename to dapitains/tei/tei.py
index 01c8406..6ed7299 100644
--- a/dapitains/local/tei.py
+++ b/dapitains/tei/tei.py
@@ -1,10 +1,9 @@
-from dapitains.local.citeStructure import CiteStructureParser
-from dapitains.constants import PROCESSOR, get_xpath_proc
+from dapitains.tei.citeStructure import CiteStructureParser
+from dapitains.constants import PROCESSOR, get_xpath_proc, saxonlib
from typing import Optional, List, Tuple, Dict
from lxml.etree import fromstring
from lxml.objectify import Element, SubElement
from lxml import objectify
-from saxonche import PyXdmNode, PyXPathProcessor
import re
from dapitains.errors import UnknownTreeName
@@ -31,7 +30,7 @@ def xpath_walk(xpath: List[str]) -> Tuple[str, List[str]]:
return current, queue
-def is_traversing_xpath(parent: PyXdmNode, xpath: str) -> bool:
+def is_traversing_xpath(parent: saxonlib.PyXdmNode, xpath: str) -> bool:
""" Check if an XPath is traversing more than one level
:param parent:
@@ -49,7 +48,7 @@ def is_traversing_xpath(parent: PyXdmNode, xpath: str) -> bool:
return False
-def xpath_walk_step(parent: PyXdmNode, xpath: str) -> Tuple[PyXdmNode, bool]:
+def xpath_walk_step(parent: saxonlib.PyXdmNode, xpath: str) -> Tuple[saxonlib.PyXdmNode, bool]:
""" Perform an XPath on an element to find a child that is part of the XPath.
If the child is a direct member of the path, returns a False boolean indicating to move
onto the next element.
@@ -71,7 +70,7 @@ def xpath_walk_step(parent: PyXdmNode, xpath: str) -> Tuple[PyXdmNode, bool]:
return xpath_proc.evaluate_single(xpath), False
-def copy_node(node: PyXdmNode, include_children=False, parent: Optional[Element] = None):
+def copy_node(node: saxonlib.PyXdmNode, include_children=False, parent: Optional[Element] = None):
""" Copy an XML Node
:param node: Etree Node
@@ -124,7 +123,7 @@ def normalize_xpath(xpath: List[str]) -> List[str]:
def reconstruct_doc(
- root: PyXdmNode,
+ root: saxonlib.PyXdmNode,
start_xpath: List[str],
new_tree: Optional[Element] = None,
end_xpath: Optional[List[str]] = None
diff --git a/requirements.txt b/requirements.txt
index 3fdfa2c..980dd76 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
saxonche==12.5.0
-lxml
\ No newline at end of file
+lxml
+flask
+flask-sqlalchemy
+click
\ No newline at end of file
diff --git a/tests/catalog/example-collection.xml b/tests/catalog/example-collection.xml
index 122a343..72d2047 100644
--- a/tests/catalog/example-collection.xml
+++ b/tests/catalog/example-collection.xml
@@ -7,7 +7,8 @@
Et je peux traduire en français
-
+
+
A simple resource
With a description
diff --git a/tests/catalog/example-sub-collection.xml b/tests/catalog/example-sub-collection.xml
new file mode 100644
index 0000000..f5798c4
--- /dev/null
+++ b/tests/catalog/example-sub-collection.xml
@@ -0,0 +1,23 @@
+
+ My First Collection
+
+ John Doe
+ History
+ 2023-08-24
+
+
+
+ Historical Document
+ A document about historical events.
+ https://foo.bar/default
+
+ World War II
+ en
+
+
+ 5 stars
+ Very informative document.
+
+
+
+
diff --git a/tests/test_catalog.py b/tests/test_catalog.py
new file mode 100644
index 0000000..c376624
--- /dev/null
+++ b/tests/test_catalog.py
@@ -0,0 +1,65 @@
+import os.path
+
+from dapitains.metadata.xml_parser import ingest_catalog
+from dapitains.metadata.classes import *
+
+
+local_dir = os.path.join(os.path.dirname(__file__))
+
+
+def test_ingestion():  # ingest the example catalog, then check the objects and relationships collected
+    tree, _ = ingest_catalog(f"{local_dir}/catalog/example-collection.xml")
+
+    assert tree.objects == {  # one entry per ingested object, keyed by its identifier
+        "https://foo.bar/default": Collection(
+            identifier='https://foo.bar/default',
+            title='A collection', description=None,
+            dublin_core=[
+                DublinCore(term='abstract', value='This is a perfect example of an absract.', language=None),
+                DublinCore(term='abstract', value='Et je peux traduire en français', language='fr')], extension=[],
+            resource=False,
+            filepath=None
+        ),
+        "https://example.org/collection1": Collection(
+            identifier='https://example.org/collection1',
+            title='My First Collection',
+            description=None,
+            dublin_core=[
+                DublinCore(term='creator', value='John Doe', language=None),
+                DublinCore(term='subject', value='History', language=None),
+                DublinCore(term='date', value='2023-08-24', language=None)
+            ],
+            extension=[],
+            resource=False,
+            filepath=None
+        ),
+        "https://example.org/resource1": Collection(
+            identifier='https://example.org/resource1',
+            title='Historical Document',
+            description='A document about historical events.',
+            dublin_core=[
+                DublinCore(term='subject', value='World War II', language=None),
+                DublinCore(term='language', value='en', language=None)
+            ],
+            extension=[], resource=True,
+            filepath=os.path.abspath(f"{local_dir}/tei/multiple_tree.xml")  # filepaths are expected as absolute paths
+        ),
+        "https://foo.bar/text": Collection(
+            identifier='https://foo.bar/text',
+            title='A simple resource',
+            description='With a description',
+            dublin_core=[
+                DublinCore(term='title', value='A simple resource', language=None)
+            ],
+            extension=[],
+            resource=True,
+            filepath=os.path.abspath(f"{local_dir}/tei/base_tei.xml")
+        )
+    }
+
+    assert sorted(tree.relationships) == [  # (parent identifier, child identifier) pairs, sorted for a stable comparison
+        ('https://example.org/collection1', 'https://example.org/resource1'),
+        ('https://foo.bar/default', 'https://example.org/collection1'),
+        ('https://foo.bar/default', 'https://example.org/resource1',),
+        ('https://foo.bar/default', 'https://foo.bar/text')
+    ]
diff --git a/tests/test_citeStructure.py b/tests/test_citeStructure.py
index c0f3d39..8154fd9 100644
--- a/tests/test_citeStructure.py
+++ b/tests/test_citeStructure.py
@@ -1,4 +1,4 @@
-from dapitains.local.citeStructure import CiteStructureParser
+from dapitains.tei.citeStructure import CiteStructureParser
from dapitains.constants import PROCESSOR, get_xpath_proc
import os.path
import pytest
diff --git a/tests/test_tei.py b/tests/test_tei.py
index 8f0120d..a9f367e 100644
--- a/tests/test_tei.py
+++ b/tests/test_tei.py
@@ -1,7 +1,7 @@
import os.path
import pytest
-from dapitains.local.tei import Document
+from dapitains.tei.tei import Document
from lxml.etree import tostring
local_dir = os.path.join(os.path.dirname(__file__), "tei")