From a06dd866ce95392ae2706fa97ef76f7e975208df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= <1929830+PonteIneptique@users.noreply.github.com> Date: Sat, 24 Aug 2024 17:59:16 +0200 Subject: [PATCH] Ability to represent Collection and Resources (#3) * [WIP] Adding a catalog solution * [WIP] Some basic work-around for children/parents * [WIP] Moved to a Catalog object * [WIP] Test catalog ingestion * Adding flask for the coming up adapter for WEB API * [WIP] Heavy changes to package structure, and the processor can now use different version * [WIP] Fixing tests * Fixing stuff * Adding test for relationships * Updating checkout version * [WIP] Cataloguing * Parents are parenting --- .github/workflows/test.yml | 2 +- dapitains/{local => app}/__init__.py | 0 dapitains/constants.py | 24 +++++- dapitains/metadata/__init__.py | 0 dapitains/metadata/classes.py | 51 ++++++++++++ dapitains/metadata/xml_parser.py | 97 +++++++++++++++++++++++ dapitains/tei/__init__.py | 0 dapitains/{local => tei}/citeStructure.py | 15 ++-- dapitains/{local => tei}/tei.py | 13 ++- requirements.txt | 5 +- tests/catalog/example-collection.xml | 3 +- tests/catalog/example-sub-collection.xml | 23 ++++++ tests/test_catalog.py | 65 +++++++++++++++ tests/test_citeStructure.py | 2 +- tests/test_tei.py | 2 +- 15 files changed, 278 insertions(+), 24 deletions(-) rename dapitains/{local => app}/__init__.py (100%) create mode 100644 dapitains/metadata/__init__.py create mode 100644 dapitains/metadata/classes.py create mode 100644 dapitains/metadata/xml_parser.py create mode 100644 dapitains/tei/__init__.py rename dapitains/{local => tei}/citeStructure.py (96%) rename dapitains/{local => tei}/tei.py (96%) create mode 100644 tests/catalog/example-sub-collection.xml create mode 100644 tests/test_catalog.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8fe5ced..f057f9a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,7 +12,7 @@ 
jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v2 with: diff --git a/dapitains/local/__init__.py b/dapitains/app/__init__.py similarity index 100% rename from dapitains/local/__init__.py rename to dapitains/app/__init__.py diff --git a/dapitains/constants.py b/dapitains/constants.py index 6e6ee66..b85f31e 100644 --- a/dapitains/constants.py +++ b/dapitains/constants.py @@ -1,12 +1,28 @@ +import logging +import os + try: - from saxonche import PySaxonProcessor, PyXdmNode, PyXPathProcessor + saxon_version = os.getenv("pysaxon", "HE") + saxon_license = os.getenv("pysaxon_license", "") + logging.info(f"Using SaxonLib {saxon_version}") + if saxon_version == "HE": + import saxonche as saxonlib + PROCESSOR = saxonlib.PySaxonProcessor() + elif saxon_version == "PE": + import saxoncpe as saxonlib + PROCESSOR = saxonlib.PySaxonProcessor(license=saxon_license) + elif saxon_version == "EE": + import saxoncee as saxonlib + PROCESSOR = saxonlib.PySaxonProcessor(license=saxon_license) except ImportError: - print("PySaxonC-HE not found") + print("Unable to import the required PySaxonC version, resorting to PySaxonC-HE") + import saxonche as saxonlib + PROCESSOR = saxonlib.PySaxonProcessor() + + -PROCESSOR = PySaxonProcessor() -def get_xpath_proc(elem: PyXdmNode) -> PyXPathProcessor: +def get_xpath_proc(elem: saxonlib.PyXdmNode) -> saxonlib.PyXPathProcessor: """ Builds an XPath processor around a given element, with the default TEI namespace :param elem: An XML node, root or not diff --git a/dapitains/metadata/__init__.py b/dapitains/metadata/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dapitains/metadata/classes.py b/dapitains/metadata/classes.py new file mode 100644 index 0000000..a77f694 --- /dev/null +++ b/dapitains/metadata/classes.py @@ -0,0 +1,51 @@ +from dataclasses import dataclass, field +from typing import List, Optional 
+ + +@dataclass +class DublinCore: + term: str + value: str + language: Optional[str] = None + + def json(self): + return {"property": f"http://purl.org/dc/terms/{self.term}", "value": self.value, "language": self.language} + + +class Extension(DublinCore): + term: str + value: str + language: Optional[str] = None + + def json(self): + return {"property": self.term, "value": self.value, "language": self.language} + + +@dataclass +class Collection: + identifier: str + title: str + description: Optional[str] = None + dublin_core: List[DublinCore] = field(default_factory=list) + extension: List[Extension] = field(default_factory=list) + resource: bool = False + filepath: Optional[str] = None + + def json(self): + return { + "identifier": self.identifier, + "title": self.title, + "description": self.description, + "dublin_core": self.dublin_core, + "extension": self.extension, + "resource": self.resource, + "filepath": self.filepath + } + +@dataclass +class CitableUnit: + resource: str + reference: str + children: List[str] = field(default_factory=list) + dublin_core: List[DublinCore] = field(default_factory=list) + extension: List[Extension] = field(default_factory=list) diff --git a/dapitains/metadata/xml_parser.py b/dapitains/metadata/xml_parser.py new file mode 100644 index 0000000..c33c8a4 --- /dev/null +++ b/dapitains/metadata/xml_parser.py @@ -0,0 +1,97 @@ +import os.path +import re +from typing import Dict, Optional, List, Tuple, Any +from dataclasses import dataclass, field +import lxml.etree as ET +from dapitains.metadata.classes import DublinCore, Extension, Collection + + +_re_tag = re.compile(r"[{}]") + + +@dataclass +class Catalog: + relationships: List[Tuple[str, str]] = field(default_factory=list) + objects: Dict[str, Collection] = field(default_factory=dict) + + +def parse_metadata(xml: ET.Element) -> Tuple[Dict[str, Any], List[str]]: + """ Parse Metadata + + :param xml: Collection/Resource tag + :returns: Main metadata obj Resource or Collection 
objects + """ + obj = { + "identifier": xml.attrib["identifier"], + "title": xml.xpath("./title[1]/text()")[0], + "description": (xml.xpath("./description[1]/text()") or [None])[0] + } + # Treat Dublin Core + dublin_core = [] + for node in xml.xpath("./dublinCore/*"): + tag = node.tag.split("}")[-1] + language = node.attrib.get("{http://www.w3.org/XML/1998/namespace}lang") + text = node.text + dublin_core.append(DublinCore(tag, text, language)) + if dublin_core: + obj["dublin_core"] = dublin_core + + # Treat Extension + extensions = [] + for node in xml.xpath("./extension/*"): + tag = _re_tag.sub("", node.tag) + language = node.attrib.get("{http://www.w3.org/XML/1998/namespace}lang") + text = node.text + extensions.append(Extension(tag, text, language)) + if extensions: + obj["extension"] = extensions + + # Parents + parents = [] + for node in xml.xpath("./parent/text()"): + parents.append(str(node)) + + return obj, parents + + +def parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection: + """ Parse a Collection or Resource object + + :param xml: Parsed Collection or Resource by LXML + :param basedir: Directory used to resolve filepaths, which are relative to the main object + :param tree: Catalog that is updated with objects. 
+ """ + obj, parents = parse_metadata(xml) + obj = Collection(**obj, resource=xml.tag == "resource") + for parent in parents: + tree.relationships.append((parent, obj.identifier)) + tree.objects[obj.identifier] = obj + if xml.attrib.get("filepath") and obj.resource: + obj.filepath = os.path.normpath(os.path.join(basedir, xml.attrib["filepath"])) + for member in xml.xpath("./members/*"): + if member.xpath("./title"): + child = parse_collection(member, basedir, tree) + tree.relationships.append((obj.identifier, child.identifier)) + else: + _, child = ingest_catalog(os.path.join(basedir, member.attrib["filepath"]), tree) + tree.relationships.append((obj.identifier, child.identifier)) + return obj + + +def ingest_catalog(path: str, tree: Optional[Catalog] = None) -> Tuple[Catalog, Collection]: + """ Ingest a collection description file. + + :param path: Path to a Collection XML File, see the schema at tests/catalog/schema.rng + :param tree: Current catalog, which is either updated or created + :return: Catalog and root collection found at path. 
+ + >>> ingest_catalog("../../tests/catalog/example-collection.xml") + """ + xml = ET.parse(path) + current_dir = os.path.abspath(os.path.dirname(path)) + + root: ET.Element = xml.getroot() + tree = tree or Catalog() + root_collection = parse_collection(root, basedir=current_dir, tree=tree) + return tree, root_collection + diff --git a/dapitains/tei/__init__.py b/dapitains/tei/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dapitains/local/citeStructure.py b/dapitains/tei/citeStructure.py similarity index 96% rename from dapitains/local/citeStructure.py rename to dapitains/tei/citeStructure.py index 6547972..79e866d 100644 --- a/dapitains/local/citeStructure.py +++ b/dapitains/tei/citeStructure.py @@ -1,10 +1,9 @@ import re from typing import Dict, List, Optional from dataclasses import dataclass, field -from saxonche import PyXdmNode, PyXPathProcessor from collections import namedtuple, defaultdict from functools import cmp_to_key -from dapitains.constants import PROCESSOR, get_xpath_proc +from dapitains.constants import get_xpath_proc, saxonlib @dataclass @@ -45,7 +44,7 @@ class CitableUnit: citeType: str ref: str children: List["CitableUnit"] = field(default_factory=list) - node: Optional[PyXdmNode] = None + node: Optional[saxonlib.PyXdmNode] = None dublinCore: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list)) extension: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list)) @@ -69,7 +68,7 @@ def to_dts(self): _simple_node = namedtuple("SimpleNode", ["citation", "xpath", "struct"]) -def get_children_cite_structures(elem: PyXdmNode) -> List[PyXdmNode]: +def get_children_cite_structures(elem: saxonlib.PyXdmNode) -> List[saxonlib.PyXdmNode]: xpath = get_xpath_proc(elem=elem).evaluate("./citeStructure") if xpath is not None: return list(iter(xpath)) @@ -82,7 +81,7 @@ class CiteStructureParser: ToDo: Add the ability to use CiteData. 
This will mean moving from len(element) to len(element.xpath("./citeStructure")) ToDo: Add the ability to use citationTree labels """ - def __init__(self, root: PyXdmNode): + def __init__(self, root: saxonlib.PyXdmNode): self.root = root self.xpath_matcher: Dict[str, str] = {} self.regex_pattern, cite_structure = self.build_regex_and_xpath( @@ -189,7 +188,7 @@ def _dispatch( self, child_xpath: str, structure: CitableStructure, - xpath_processor: PyXPathProcessor, + xpath_processor: saxonlib.PyXPathProcessor, unit: CitableUnit): # target = self.generate_xpath(child.ref) if len(structure.children) == 1: @@ -207,7 +206,7 @@ def _dispatch( def find_refs( self, - root: PyXdmNode, + root: saxonlib.PyXdmNode, structure: CitableStructure = None, unit: Optional[CitableUnit] = None ) -> List[CitableUnit]: @@ -245,7 +244,7 @@ def find_refs( def find_refs_from_branches( self, - root: PyXdmNode, + root: saxonlib.PyXdmNode, structure: List[CitableStructure], unit: Optional[CitableUnit] = None ) -> List[CitableUnit]: diff --git a/dapitains/local/tei.py b/dapitains/tei/tei.py similarity index 96% rename from dapitains/local/tei.py rename to dapitains/tei/tei.py index 01c8406..6ed7299 100644 --- a/dapitains/local/tei.py +++ b/dapitains/tei/tei.py @@ -1,10 +1,9 @@ -from dapitains.local.citeStructure import CiteStructureParser -from dapitains.constants import PROCESSOR, get_xpath_proc +from dapitains.tei.citeStructure import CiteStructureParser +from dapitains.constants import PROCESSOR, get_xpath_proc, saxonlib from typing import Optional, List, Tuple, Dict from lxml.etree import fromstring from lxml.objectify import Element, SubElement from lxml import objectify -from saxonche import PyXdmNode, PyXPathProcessor import re from dapitains.errors import UnknownTreeName @@ -31,7 +30,7 @@ def xpath_walk(xpath: List[str]) -> Tuple[str, List[str]]: return current, queue -def is_traversing_xpath(parent: PyXdmNode, xpath: str) -> bool: +def is_traversing_xpath(parent: saxonlib.PyXdmNode, 
xpath: str) -> bool: """ Check if an XPath is traversing more than one level :param parent: @@ -49,7 +48,7 @@ def is_traversing_xpath(parent: PyXdmNode, xpath: str) -> bool: return False -def xpath_walk_step(parent: PyXdmNode, xpath: str) -> Tuple[PyXdmNode, bool]: +def xpath_walk_step(parent: saxonlib.PyXdmNode, xpath: str) -> Tuple[saxonlib.PyXdmNode, bool]: """ Perform an XPath on an element to find a child that is part of the XPath. If the child is a direct member of the path, returns a False boolean indicating to move onto the next element. @@ -71,7 +70,7 @@ def xpath_walk_step(parent: PyXdmNode, xpath: str) -> Tuple[PyXdmNode, bool]: return xpath_proc.evaluate_single(xpath), False -def copy_node(node: PyXdmNode, include_children=False, parent: Optional[Element] = None): +def copy_node(node: saxonlib.PyXdmNode, include_children=False, parent: Optional[Element] = None): """ Copy an XML Node :param node: Etree Node @@ -124,7 +123,7 @@ def normalize_xpath(xpath: List[str]) -> List[str]: def reconstruct_doc( - root: PyXdmNode, + root: saxonlib.PyXdmNode, start_xpath: List[str], new_tree: Optional[Element] = None, end_xpath: Optional[List[str]] = None diff --git a/requirements.txt b/requirements.txt index 3fdfa2c..980dd76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,5 @@ saxonche==12.5.0 -lxml \ No newline at end of file +lxml +flask +flask-sqlalchemy +click \ No newline at end of file diff --git a/tests/catalog/example-collection.xml b/tests/catalog/example-collection.xml index 122a343..72d2047 100644 --- a/tests/catalog/example-collection.xml +++ b/tests/catalog/example-collection.xml @@ -7,7 +7,8 @@ Et je peux traduire en français - + + A simple resource With a description diff --git a/tests/catalog/example-sub-collection.xml b/tests/catalog/example-sub-collection.xml new file mode 100644 index 0000000..f5798c4 --- /dev/null +++ b/tests/catalog/example-sub-collection.xml @@ -0,0 +1,23 @@ + + My First Collection + + John Doe + History + 
2023-08-24 + + + + Historical Document + A document about historical events. + https://foo.bar/default + + World War II + en + + + 5 stars + Very informative document. + + + + diff --git a/tests/test_catalog.py b/tests/test_catalog.py new file mode 100644 index 0000000..c376624 --- /dev/null +++ b/tests/test_catalog.py @@ -0,0 +1,65 @@ +import os.path + +from dapitains.metadata.xml_parser import ingest_catalog +from dapitains.metadata.classes import * + + +local_dir = os.path.join(os.path.dirname(__file__)) + + +def test_ingestion(): + tree, _ = ingest_catalog(f"{local_dir}/catalog/example-collection.xml") + + assert tree.objects == { + "https://foo.bar/default": Collection( + identifier='https://foo.bar/default', + title='A collection', description=None, + dublin_core=[ + DublinCore(term='abstract', value='This is a perfect example of an absract.', language=None), + DublinCore(term='abstract', value='Et je peux traduire en français', language='fr')], extension=[], + resource=False, + filepath=None + ), + "https://example.org/collection1": Collection( + identifier='https://example.org/collection1', + title='My First Collection', + description=None, + dublin_core=[ + DublinCore(term='creator', value='John Doe', language=None), + DublinCore(term='subject', value='History', language=None), + DublinCore(term='date', value='2023-08-24', language=None) + ], + extension=[], + resource=False, + filepath=None + ), + "https://example.org/resource1": Collection( + identifier='https://example.org/resource1', + title='Historical Document', + description='A document about historical events.', + dublin_core=[ + DublinCore(term='subject', value='World War II', language=None), + DublinCore(term='language', value='en', language=None) + ], + extension=[], resource=True, + filepath=os.path.abspath(f"{local_dir}/tei/multiple_tree.xml") + ), + "https://foo.bar/text": Collection( + identifier='https://foo.bar/text', + title='A simple resource', + description='With a description', + 
dublin_core=[ + DublinCore(term='title', value='A simple resource', language=None) + ], + extension=[], + resource=True, + filepath=os.path.abspath(f"{local_dir}/tei/base_tei.xml") + ) + } + + assert sorted(tree.relationships) == [ + ('https://example.org/collection1', 'https://example.org/resource1'), + ('https://foo.bar/default', 'https://example.org/collection1'), + ('https://foo.bar/default', 'https://example.org/resource1',), + ('https://foo.bar/default', 'https://foo.bar/text') + ] diff --git a/tests/test_citeStructure.py b/tests/test_citeStructure.py index c0f3d39..8154fd9 100644 --- a/tests/test_citeStructure.py +++ b/tests/test_citeStructure.py @@ -1,4 +1,4 @@ -from dapitains.local.citeStructure import CiteStructureParser +from dapitains.tei.citeStructure import CiteStructureParser from dapitains.constants import PROCESSOR, get_xpath_proc import os.path import pytest diff --git a/tests/test_tei.py b/tests/test_tei.py index 8f0120d..a9f367e 100644 --- a/tests/test_tei.py +++ b/tests/test_tei.py @@ -1,7 +1,7 @@ import os.path import pytest -from dapitains.local.tei import Document +from dapitains.tei.tei import Document from lxml.etree import tostring local_dir = os.path.join(os.path.dirname(__file__), "tei")