Skip to content

Commit

Permalink
Ability to represent Collection and Resources (#3)
Browse files Browse the repository at this point in the history
* [WIP] Adding a catalog solution

* [WIP] Some basic work-around for children/parents

* [WIP] Moved to a Catalog object

* [WIP] Test catalog ingestion

* Adding flask for the upcoming Web API adapter

* [WIP] Heavy changes to package structure, and the processor can now use different versions

* [WIP] Fixing tests

* Fixing stuff

* Adding test for relationships

* Updating checkout version

* [WIP] Cataloguing

* Parents are parenting
  • Loading branch information
PonteIneptique authored Aug 24, 2024
1 parent e42d4c2 commit a06dd86
Show file tree
Hide file tree
Showing 15 changed files with 278 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v2
with:
Expand Down
File renamed without changes.
24 changes: 20 additions & 4 deletions dapitains/constants.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,28 @@
import logging
import os

# Select the SaxonC binding to use. The edition is read from the ``pysaxon``
# environment variable ("HE" by default, "PE" or "EE" otherwise) and the
# licence value from ``pysaxon_license``. Whatever is selected is exposed as
# ``saxonlib`` together with a module-level ``PROCESSOR``.
try:
    # Kept because other statements in this module still refer to these names
    # directly.
    from saxonche import PySaxonProcessor, PyXdmNode, PyXPathProcessor
    saxon_version = os.getenv("pysaxon", "HE")
    saxon_license = os.getenv("pysaxon_license", "")
    logging.info(f"Using SaxonLib {saxon_version}")
    if saxon_version == "HE":
        import saxonche as saxonlib
        PROCESSOR = saxonlib.PySaxonProcessor()
    elif saxon_version == "PE":
        import saxoncpe as saxonlib
        PROCESSOR = saxonlib.PySaxonProcessor(license=saxon_license)
    elif saxon_version == "EE":
        # This branch previously tested "PE" a second time, which made the
        # Enterprise Edition binding unreachable.
        import saxoncee as saxonlib
        PROCESSOR = saxonlib.PySaxonProcessor(license=saxon_license)
    else:
        # An unrecognised edition would otherwise leave ``saxonlib`` and
        # ``PROCESSOR`` unbound; raise so the HE fallback below is used.
        raise ImportError(f"Unknown SaxonC edition requested: {saxon_version!r}")
except ImportError:
    print("PySaxonC-HE not found")
    print("Unable to import the required PySaxonC version, resorting to PySaxonC-HE")
    import saxonche as saxonlib
    PROCESSOR = saxonlib.PySaxonProcessor()


PROCESSOR = PySaxonProcessor()


def get_xpath_proc(elem: PyXdmNode) -> PyXPathProcessor:
def get_xpath_proc(elem: saxonlib.PyXdmNode) -> saxonlib.PyXPathProcessor:
""" Builds an XPath processor around a given element, with the default TEI namespace
:param elem: An XML node, root or not
Expand Down
Empty file added dapitains/metadata/__init__.py
Empty file.
51 changes: 51 additions & 0 deletions dapitains/metadata/classes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class DublinCore:
    """ A single Dublin Core Terms statement

    :param term: Name of the term, appended to the DC Terms namespace on export
    :param value: Value of the statement
    :param language: Optional language of the value
    """
    term: str
    value: str
    language: Optional[str] = None

    def json(self):
        """ Export the statement as a dictionary with a fully qualified property URI
        """
        qualified = f"http://purl.org/dc/terms/{self.term}"
        return dict(property=qualified, value=self.value, language=self.language)


class Extension(DublinCore):
    """ A metadata statement whose property is exported as-is

    Unlike its parent class, ``term`` is not prefixed with the DC Terms namespace
    when exported.
    """
    # NOTE(review): this class is not decorated with @dataclass, so the
    # constructor is the one inherited from DublinCore.
    term: str
    value: str
    language: Optional[str] = None

    def json(self):
        """ Export the statement as a dictionary, using ``term`` verbatim as the property
        """
        return dict(property=self.term, value=self.value, language=self.language)


@dataclass
class Collection:
    """ Description of a catalogue entry, either a collection or a resource

    :param identifier: Identifier of the object
    :param title: Title of the object
    :param description: Optional description
    :param dublin_core: Dublin Core statements attached to the object
    :param extension: Non Dublin Core statements attached to the object
    :param resource: True when the entry is a resource, False for a collection
    :param filepath: Optional path to the file behind the entry
    """
    identifier: str
    title: str
    description: Optional[str] = None
    dublin_core: List[DublinCore] = field(default_factory=list)
    extension: List[Extension] = field(default_factory=list)
    resource: bool = False
    filepath: Optional[str] = None

    def json(self):
        """ Export every field of the object as a dictionary

        NOTE(review): ``dublin_core`` and ``extension`` are returned as lists of
        objects, not as the output of their own ``json()`` - confirm this is intended.
        """
        exported = (
            "identifier", "title", "description",
            "dublin_core", "extension", "resource", "filepath"
        )
        return {name: getattr(self, name) for name in exported}

@dataclass
class CitableUnit:
    """ Metadata record for a citable unit of a resource

    :param resource: Value identifying the resource the unit belongs to
    :param reference: Reference of the unit
    :param children: References of the child units
    :param dublin_core: Dublin Core statements attached to the unit
    :param extension: Non Dublin Core statements attached to the unit
    """
    resource: str
    reference: str
    # Mutable defaults go through default_factory so instances never share a list
    children: List[str] = field(default_factory=list)
    dublin_core: List[DublinCore] = field(default_factory=list)
    extension: List[Extension] = field(default_factory=list)
97 changes: 97 additions & 0 deletions dapitains/metadata/xml_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import os.path
import re
from typing import Dict, Optional, List, Tuple, Any
from dataclasses import dataclass, field
import lxml.etree as ET
from dapitains.metadata.classes import DublinCore, Extension, Collection


_re_tag = re.compile(r"[{}]")


@dataclass
class Catalog:
    """ Accumulator filled while ingesting collection files

    :param relationships: (parent identifier, child identifier) pairs
    :param objects: Parsed objects, keyed by their identifier
    """
    relationships: List[Tuple[str, str]] = field(default_factory=list)
    objects: Dict[str, Collection] = field(default_factory=dict)


def parse_metadata(xml: ET.Element) -> Tuple[Dict[str, Any], List[str]]:
    """ Parse the metadata of a Collection or Resource element

    :param xml: Collection/Resource tag
    :returns: A dictionary of keyword arguments for ``Collection`` (identifier, title,
        description and, when present, dublin_core and extension) and the list of
        parent identifiers declared on the element
    :raises KeyError: When the element has no ``identifier`` attribute
    :raises IndexError: When the element has no ``title`` child with text
    """
    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"

    obj: Dict[str, Any] = {
        "identifier": xml.attrib["identifier"],
        "title": xml.xpath("./title[1]/text()")[0],
        # Description is optional: fall back on None when there is no text node
        "description": (xml.xpath("./description[1]/text()") or [None])[0]
    }

    # Treat Dublin Core: only the local name is kept, the namespace is dropped
    dublin_core = [
        DublinCore(node.tag.split("}")[-1], node.text, node.attrib.get(xml_lang))
        for node in xml.xpath("./dublinCore/*")
    ]
    if dublin_core:
        obj["dublin_core"] = dublin_core

    # Treat Extension: namespace and local name are kept, only the braces are removed
    # NOTE(review): only children of an <extension> element are matched; confirm
    # against the schema whether the wrapper is named <extension> or <extensions>.
    extensions = [
        Extension(_re_tag.sub("", node.tag), node.text, node.attrib.get(xml_lang))
        for node in xml.xpath("./extension/*")
    ]
    if extensions:
        # The key has to be the Collection field name (``extension``), otherwise
        # Collection(**obj) fails on an unexpected keyword argument.
        obj["extension"] = extensions

    # Parents
    parents = [str(node) for node in xml.xpath("./parent/text()")]

    return obj, parents


def parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection:
    """ Parse a Collection or Resource element and register it, with its members, in the catalog

    :param xml: Parsed Collection or Resource by LXML
    :param basedir: Directory against which relative filepaths are resolved
    :param tree: Catalog that is updated with the objects and relationships found
    :returns: The object built from ``xml``
    """
    metadata, parent_ids = parse_metadata(xml)
    current = Collection(**metadata, resource=xml.tag == "resource")

    # Declared parents are recorded first, then the object itself
    tree.relationships.extend((parent_id, current.identifier) for parent_id in parent_ids)
    tree.objects[current.identifier] = current

    # Only resources get a filepath, normalized relative to basedir
    relative_path = xml.attrib.get("filepath")
    if relative_path and current.resource:
        current.filepath = os.path.normpath(os.path.join(basedir, relative_path))

    for member in xml.xpath("./members/*"):
        if member.xpath("./title"):
            # The member is described inline
            child = parse_collection(member, basedir, tree)
        else:
            # The member has no title: it is read from the file it points to
            child = ingest_catalog(os.path.join(basedir, member.attrib["filepath"]), tree)[1]
        tree.relationships.append((current.identifier, child.identifier))

    return current


def ingest_catalog(path: str, tree: Optional[Catalog] = None) -> Tuple[Catalog, Collection]:
    """ Read a collection description file and register its content in a catalog

    Example call: ``ingest_catalog("../../tests/catalog/example-collection.xml")``

    :param path: Path to a Collection XML File, see the schema at tests/catalog/schema.rng
    :param tree: Catalog to update; a new one is created when none is given
    :return: The catalog and the root collection found at path
    """
    document = ET.parse(path)
    # Filepaths inside the document are resolved against the document's own directory
    basedir = os.path.abspath(os.path.dirname(path))

    if not tree:
        tree = Catalog()

    root_collection = parse_collection(document.getroot(), basedir=basedir, tree=tree)
    return tree, root_collection

Empty file added dapitains/tei/__init__.py
Empty file.
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import re
from typing import Dict, List, Optional
from dataclasses import dataclass, field
from saxonche import PyXdmNode, PyXPathProcessor
from collections import namedtuple, defaultdict
from functools import cmp_to_key
from dapitains.constants import PROCESSOR, get_xpath_proc
from dapitains.constants import get_xpath_proc, saxonlib


@dataclass
Expand Down Expand Up @@ -45,7 +44,7 @@ class CitableUnit:
citeType: str
ref: str
children: List["CitableUnit"] = field(default_factory=list)
node: Optional[PyXdmNode] = None
node: Optional[saxonlib.PyXdmNode] = None
dublinCore: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list))
extension: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list))

Expand All @@ -69,7 +68,7 @@ def to_dts(self):
_simple_node = namedtuple("SimpleNode", ["citation", "xpath", "struct"])


def get_children_cite_structures(elem: PyXdmNode) -> List[PyXdmNode]:
def get_children_cite_structures(elem: saxonlib.PyXdmNode) -> List[saxonlib.PyXdmNode]:
xpath = get_xpath_proc(elem=elem).evaluate("./citeStructure")
if xpath is not None:
return list(iter(xpath))
Expand All @@ -82,7 +81,7 @@ class CiteStructureParser:
ToDo: Add the ability to use CiteData. This will mean moving from len(element) to len(element.xpath("./citeStructure"))
ToDo: Add the ability to use citationTree labels
"""
def __init__(self, root: PyXdmNode):
def __init__(self, root: saxonlib.PyXdmNode):
self.root = root
self.xpath_matcher: Dict[str, str] = {}
self.regex_pattern, cite_structure = self.build_regex_and_xpath(
Expand Down Expand Up @@ -189,7 +188,7 @@ def _dispatch(
self,
child_xpath: str,
structure: CitableStructure,
xpath_processor: PyXPathProcessor,
xpath_processor: saxonlib.PyXPathProcessor,
unit: CitableUnit):
# target = self.generate_xpath(child.ref)
if len(structure.children) == 1:
Expand All @@ -207,7 +206,7 @@ def _dispatch(

def find_refs(
self,
root: PyXdmNode,
root: saxonlib.PyXdmNode,
structure: CitableStructure = None,
unit: Optional[CitableUnit] = None
) -> List[CitableUnit]:
Expand Down Expand Up @@ -245,7 +244,7 @@ def find_refs(

def find_refs_from_branches(
self,
root: PyXdmNode,
root: saxonlib.PyXdmNode,
structure: List[CitableStructure],
unit: Optional[CitableUnit] = None
) -> List[CitableUnit]:
Expand Down
13 changes: 6 additions & 7 deletions dapitains/local/tei.py → dapitains/tei/tei.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
from dapitains.local.citeStructure import CiteStructureParser
from dapitains.constants import PROCESSOR, get_xpath_proc
from dapitains.tei.citeStructure import CiteStructureParser
from dapitains.constants import PROCESSOR, get_xpath_proc, saxonlib
from typing import Optional, List, Tuple, Dict
from lxml.etree import fromstring
from lxml.objectify import Element, SubElement
from lxml import objectify
from saxonche import PyXdmNode, PyXPathProcessor
import re
from dapitains.errors import UnknownTreeName

Expand All @@ -31,7 +30,7 @@ def xpath_walk(xpath: List[str]) -> Tuple[str, List[str]]:
return current, queue


def is_traversing_xpath(parent: PyXdmNode, xpath: str) -> bool:
def is_traversing_xpath(parent: saxonlib.PyXdmNode, xpath: str) -> bool:
""" Check if an XPath is traversing more than one level
:param parent:
Expand All @@ -49,7 +48,7 @@ def is_traversing_xpath(parent: PyXdmNode, xpath: str) -> bool:
return False


def xpath_walk_step(parent: PyXdmNode, xpath: str) -> Tuple[PyXdmNode, bool]:
def xpath_walk_step(parent: saxonlib.PyXdmNode, xpath: str) -> Tuple[saxonlib.PyXdmNode, bool]:
""" Perform an XPath on an element to find a child that is part of the XPath.
If the child is a direct member of the path, returns a False boolean indicating to move
onto the next element.
Expand All @@ -71,7 +70,7 @@ def xpath_walk_step(parent: PyXdmNode, xpath: str) -> Tuple[PyXdmNode, bool]:
return xpath_proc.evaluate_single(xpath), False


def copy_node(node: PyXdmNode, include_children=False, parent: Optional[Element] = None):
def copy_node(node: saxonlib.PyXdmNode, include_children=False, parent: Optional[Element] = None):
""" Copy an XML Node
:param node: Etree Node
Expand Down Expand Up @@ -124,7 +123,7 @@ def normalize_xpath(xpath: List[str]) -> List[str]:


def reconstruct_doc(
root: PyXdmNode,
root: saxonlib.PyXdmNode,
start_xpath: List[str],
new_tree: Optional[Element] = None,
end_xpath: Optional[List[str]] = None
Expand Down
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
saxonche==12.5.0
lxml
lxml
flask
flask-sqlalchemy
click
3 changes: 2 additions & 1 deletion tests/catalog/example-collection.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
<abstract xmlns="http://purl.org/dc/terms/" xml:lang="fr">Et je peux traduire en français</abstract>
</dublinCore>
<members>
<resource identifier="https://foo.bar/text" path="../tei/base_tei.xml">
<collection filepath="./example-sub-collection.xml"/>
<resource identifier="https://foo.bar/text" filepath="../tei/base_tei.xml">
<title>A simple resource</title>
<description>With a description</description>
<dublinCore>
Expand Down
23 changes: 23 additions & 0 deletions tests/catalog/example-sub-collection.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<collection identifier="https://example.org/collection1">
<title>My First Collection</title>
<dublinCore>
<creator xmlns="http://purl.org/dc/terms/">John Doe</creator>
<subject xmlns="http://purl.org/dc/terms/">History</subject>
<date xmlns="http://purl.org/dc/terms/">2023-08-24</date>
</dublinCore>
<members>
<resource identifier="https://example.org/resource1" filepath="../tei/multiple_tree.xml">
<title>Historical Document</title>
<description>A document about historical events.</description>
<parent>https://foo.bar/default</parent>
<dublinCore>
<subject xmlns="http://purl.org/dc/terms/">World War II</subject>
<language xmlns="http://purl.org/dc/terms/">en</language>
</dublinCore>
<extensions>
<rating xmlns="https://example.org/rating">5 stars</rating>
<comment xmlns="https://example.org/comment">Very informative document.</comment>
</extensions>
</resource>
</members>
</collection>
Loading

0 comments on commit a06dd86

Please sign in to comment.