-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Ability to represent Collection and Resources (#3)
* [WIP] Adding a catalog solution * [WIP] Some basic work-around for children/parents * [WIP] Moved to a Catalog object * [WIP] Test catalog ingestion * Adding flask for the coming up adapter for WEB API * [WIP] Heavy changes to package structure, and the processor can now use different version * [WIP] Fixing tests * Fixing stuff * Adding test for relationships * Updating checkout version * [WIP] Cataloguing * Parents are parenting
- Loading branch information
1 parent
e42d4c2
commit a06dd86
Showing
15 changed files
with
278 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
from dataclasses import dataclass, field | ||
from typing import List, Optional | ||
|
||
|
||
@dataclass
class DublinCore:
    """ A single Dublin Core Terms statement.

    :param term: Local name of the term; it is appended to the
        ``http://purl.org/dc/terms/`` prefix when serialized
    :param value: Value of the statement
    :param language: Optional language of the value
    """
    term: str
    value: str
    language: Optional[str] = None

    def json(self):
        """ Serialize the statement as a dictionary

        :returns: Dictionary with the keys ``property`` (prefixed term),
            ``value`` and ``language``
        """
        property_uri = f"http://purl.org/dc/terms/{self.term}"
        return dict(property=property_uri, value=self.value, language=self.language)
|
||
|
||
@dataclass
class Extension(DublinCore):
    """ A metadata statement outside of the Dublin Core Terms vocabulary.

    It carries the same fields as :class:`DublinCore`, but ``term`` is
    serialized as-is instead of being prefixed.

    :param term: Property identifier, used verbatim when serialized
    :param value: Value of the statement
    :param language: Optional language of the value
    """
    # Decorated like the other model classes so that the field declarations
    # below are real dataclass fields rather than bare annotations that only
    # work through the inherited constructor.
    term: str
    value: str
    language: Optional[str] = None

    def json(self):
        """ Serialize the statement as a dictionary

        :returns: Dictionary with the keys ``property`` (the raw term),
            ``value`` and ``language``
        """
        return {"property": self.term, "value": self.value, "language": self.language}
|
||
|
||
@dataclass
class Collection:
    """ Description of a Collection or of a Resource of the catalog.

    :param identifier: Identifier of the object
    :param title: Title of the object
    :param description: Optional description
    :param dublin_core: Dublin Core statements attached to the object
    :param extension: Non Dublin Core statements attached to the object
    :param resource: Whether the object is a Resource rather than a Collection
    :param filepath: Optional path to the file of the object
    """
    identifier: str
    title: str
    description: Optional[str] = None
    dublin_core: List[DublinCore] = field(default_factory=list)
    extension: List[Extension] = field(default_factory=list)
    resource: bool = False
    filepath: Optional[str] = None

    def json(self):
        """ Export every field of the object as a dictionary

        The ``dublin_core`` and ``extension`` lists are returned as they are
        stored, their items are not converted.

        :returns: Dictionary keyed by field name, in declaration order
        """
        exported = (
            "identifier", "title", "description",
            "dublin_core", "extension", "resource", "filepath",
        )
        return {name: getattr(self, name) for name in exported}
|
||
@dataclass
class CitableUnit:
    """ A citable unit with its reference, children and metadata.

    :param resource: Resource value of the unit
    :param reference: Reference value of the unit
    :param children: List of strings describing the children of the unit
    :param dublin_core: Dublin Core statements attached to the unit
    :param extension: Non Dublin Core statements attached to the unit
    """
    resource: str
    reference: str
    # Every list defaults to a fresh empty list for each instance
    children: List[str] = field(default_factory=list)
    dublin_core: List[DublinCore] = field(default_factory=list)
    extension: List[Extension] = field(default_factory=list)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import os.path | ||
import re | ||
from typing import Dict, Optional, List, Tuple, Any | ||
from dataclasses import dataclass, field | ||
import lxml.etree as ET | ||
from dapitains.metadata.classes import DublinCore, Extension, Collection | ||
|
||
|
||
_re_tag = re.compile(r"[{}]") | ||
|
||
|
||
@dataclass
class Catalog:
    """ Accumulator filled while collection files are ingested.

    :param relationships: Pairs of identifiers, parent first and child second
    :param objects: Parsed collections and resources, keyed by identifier
    """
    # Both containers start empty and are unique to each Catalog instance
    relationships: List[Tuple[str, str]] = field(default_factory=list)
    objects: Dict[str, Collection] = field(default_factory=dict)
|
||
|
||
def parse_metadata(xml: ET.Element) -> Tuple[Dict[str, Any], List[str]]:
    """ Parse the metadata of a single Collection or Resource element

    :param xml: Collection/Resource tag
    :returns: A tuple made of the keyword arguments used to build a
        ``Collection`` (``identifier``, ``title``, ``description`` and, when
        present, ``dublin_core`` and ``extension``) and of the list of parent
        identifiers declared by the element
    :raises KeyError: If the element has no ``identifier`` attribute
    :raises IndexError: If the element has no ``title`` child with text
    """
    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"

    obj: Dict[str, Any] = {
        "identifier": xml.attrib["identifier"],
        "title": xml.xpath("./title[1]/text()")[0],
        # The description is optional: fall back on None when it is missing
        "description": (xml.xpath("./description[1]/text()") or [None])[0],
    }

    # Treat Dublin Core: only the local name of the tag is kept as the term
    dublin_core = [
        DublinCore(node.tag.split("}")[-1], node.text, node.attrib.get(xml_lang))
        for node in xml.xpath("./dublinCore/*")
    ]
    if dublin_core:
        obj["dublin_core"] = dublin_core

    # Treat Extension: the braces of the Clark notation are dropped, so the
    # namespace and the local name are concatenated to form the term
    # NOTE(review): the element queried is `extension` (singular); documents
    # using another element name are silently ignored - confirm with the schema
    extensions = [
        Extension(_re_tag.sub("", node.tag), node.text, node.attrib.get(xml_lang))
        for node in xml.xpath("./extension/*")
    ]
    if extensions:
        # The key has to match the `extension` field of Collection, because
        # this dictionary is unpacked into its constructor
        obj["extension"] = extensions

    # Parents
    parents = [str(node) for node in xml.xpath("./parent/text()")]

    return obj, parents
|
||
|
||
def parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection:
    """ Parse a Collection or Resource element and register it in the catalog

    Members that have a ``title`` child are parsed in place; the others are
    loaded from the file given by their ``filepath`` attribute.

    :param xml: Parsed Collection or Resource by LXML
    :param basedir: Directory used to resolve filepath, that are relative to the main object
    :param tree: Catalog that is updated with objects and relationships.
    :returns: The object built from ``xml``
    """
    metadata, parent_ids = parse_metadata(xml)
    is_resource = xml.tag == "resource"
    collection = Collection(**metadata, resource=is_resource)

    # Declared parents come first, then the object itself is registered
    tree.relationships.extend(
        (parent_id, collection.identifier) for parent_id in parent_ids
    )
    tree.objects[collection.identifier] = collection

    # Only resources keep a filepath, resolved against the base directory
    relative_path = xml.attrib.get("filepath")
    if relative_path and collection.resource:
        collection.filepath = os.path.normpath(os.path.join(basedir, relative_path))

    for member in xml.xpath("./members/*"):
        if member.xpath("./title"):
            # Inline description of the member
            child = parse_collection(member, basedir, tree)
        else:
            # The member is described in another file
            _, child = ingest_catalog(os.path.join(basedir, member.attrib["filepath"]), tree)
        tree.relationships.append((collection.identifier, child.identifier))

    return collection
|
||
|
||
def ingest_catalog(path: str, tree: Optional[Catalog] = None) -> Tuple[Catalog, Collection]:
    """ Ingest a collection description file.

    Example: ``ingest_catalog("../../tests/catalog/example-collection.xml")``

    :param path: Path to a Collection XML File, see the schema at tests/catalog/schema.rng
    :param tree: Current catalog, which is either updated or created
    :return: Catalog and root collection found at path.
    """
    document = ET.parse(path)
    catalog = tree or Catalog()

    # Relative file paths found in the document are resolved from its directory
    document_dir = os.path.abspath(os.path.dirname(path))

    root_collection = parse_collection(document.getroot(), basedir=document_dir, tree=catalog)
    return catalog, root_collection
|
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,5 @@ | ||
saxonche==12.5.0 | ||
lxml | ||
lxml | ||
flask | ||
flask-sqlalchemy | ||
click |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
<collection identifier="https://example.org/collection1"> | ||
<title>My First Collection</title> | ||
<dublinCore> | ||
<creator xmlns="http://purl.org/dc/terms/">John Doe</creator> | ||
<subject xmlns="http://purl.org/dc/terms/">History</subject> | ||
<date xmlns="http://purl.org/dc/terms/">2023-08-24</date> | ||
</dublinCore> | ||
<members> | ||
<resource identifier="https://example.org/resource1" filepath="../tei/multiple_tree.xml"> | ||
<title>Historical Document</title> | ||
<description>A document about historical events.</description> | ||
<parent>https://foo.bar/default</parent> | ||
<dublinCore> | ||
<subject xmlns="http://purl.org/dc/terms/">World War II</subject> | ||
<language xmlns="http://purl.org/dc/terms/">en</language> | ||
</dublinCore> | ||
<extensions> | ||
<rating xmlns="https://example.org/rating">5 stars</rating> | ||
<comment xmlns="https://example.org/comment">Very informative document.</comment> | ||
</extensions> | ||
</resource> | ||
</members> | ||
</collection> |
Oops, something went wrong.