Skip to content

Commit

Permalink
Merge pull request #58 from sldouglas-nist/add_pdf_dictionary_mapping
Browse files Browse the repository at this point in the history
Add PDF Dictionary Mapping
  • Loading branch information
ajnelson-nist authored Jul 3, 2024
2 parents 7979044 + d33e90c commit f7b28a7
Show file tree
Hide file tree
Showing 3 changed files with 225 additions and 0 deletions.
47 changes: 47 additions & 0 deletions case_exiftool/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ def __init__(
self._exif_dictionary_dict: typing.Optional[
typing.Dict[str, rdflib.Literal]
] = None

self._pdf_dictionary_dict: typing.Optional[typing.Dict[str, rdflib.Literal]] = (
None
)

self._graph = graph

self._use_deterministic_uuids = use_deterministic_uuids
Expand All @@ -177,6 +182,7 @@ def __init__(
self._n_exif_facet: typing.Optional[rdflib.URIRef] = None
self._n_file_facet: typing.Optional[rdflib.URIRef] = None
self._n_location_object: typing.Optional[rdflib.URIRef] = None
self._n_pdf_dictionary_object: typing.Optional[rdflib.URIRef] = None
self._n_pdf_file_facet: typing.Optional[rdflib.URIRef] = None
self._n_location_object_latlong_facet: typing.Optional[rdflib.URIRef] = None
self._n_observable_object: typing.Optional[rdflib.URIRef] = None
Expand Down Expand Up @@ -375,6 +381,7 @@ def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Author":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.pdf_dictionary_dict["Author"] = v_raw
self.graph.add(
(
self.n_pdf_file_facet,
Expand All @@ -385,6 +392,8 @@ def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/CreateDate":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
# CreationDate entry in self.pdf_dictionary_dict references term in ISO 32000-1:2008 PDF Table 317.
self.pdf_dictionary_dict["CreationDate"] = v_raw
self.graph.add(
(
self.n_pdf_file_facet,
Expand All @@ -397,6 +406,7 @@ def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Creator":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.pdf_dictionary_dict["Creator"] = v_raw
self.graph.add(
(
self.n_pdf_file_facet,
Expand All @@ -407,6 +417,7 @@ def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Linearized":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.pdf_dictionary_dict["Linearized"] = v_raw
self.graph.add(
(
self.n_pdf_file_facet,
Expand All @@ -417,6 +428,7 @@ def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/ModifyDate":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.pdf_dictionary_dict["ModDate"] = v_raw
self.graph.add(
(
self.n_pdf_file_facet,
Expand All @@ -429,6 +441,7 @@ def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/PDFVersion":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.pdf_dictionary_dict["PDFVersion"] = v_raw
self.graph.add(
(
self.n_pdf_file_facet,
Expand All @@ -439,6 +452,7 @@ def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/PageCount":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.pdf_dictionary_dict["PageCount"] = v_raw
self.graph.add(
(
self.n_pdf_file_facet,
Expand All @@ -449,6 +463,7 @@ def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Producer":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.pdf_dictionary_dict["Producer"] = v_raw
self.graph.add(
(
self.n_pdf_file_facet,
Expand All @@ -459,6 +474,7 @@ def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Subject":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.pdf_dictionary_dict["Subject"] = v_raw
self.graph.add(
(
self.n_pdf_file_facet,
Expand All @@ -469,6 +485,7 @@ def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Title":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.pdf_dictionary_dict["Title"] = v_raw
self.graph.add(
(
self.n_pdf_file_facet,
Expand Down Expand Up @@ -559,6 +576,8 @@ def _load_xml_file_into_dict(
# Derive remaining objects.
if self._exif_dictionary_dict is not None:
_ = self.n_exif_dictionary_object
if self._pdf_dictionary_dict is not None:
_ = self.n_pdf_dictionary_object
if self._n_location_object is not None:
_ = self.n_relationship_object_location

Expand Down Expand Up @@ -596,6 +615,16 @@ def exif_dictionary_dict(self) -> typing.Dict[str, rdflib.Literal]:
self._exif_dictionary_dict = dict()
return self._exif_dictionary_dict

@property
def pdf_dictionary_dict(self) -> typing.Dict[str, rdflib.Literal]:
    """
    Return the PDF dictionary mapping, creating it on first access.

    The backing attribute starts out as None; the first read replaces it
    with an empty dict, and every later read returns that same dict object.
    Controlled dictionary keys reference terms from ISO 32000-1:2008 PDF
    Table 317 and ExifTool Tag Names.
    """
    existing = self._pdf_dictionary_dict
    if existing is not None:
        return existing
    # First access: create the dictionary and remember it on the instance.
    created: typing.Dict[str, rdflib.Literal] = {}
    self._pdf_dictionary_dict = created
    return created

@property
def graph(self) -> rdflib.Graph:
"""
Expand Down Expand Up @@ -983,6 +1012,24 @@ def n_pdf_file_facet(self) -> rdflib.URIRef:
)
return self._n_pdf_file_facet

@property
def n_pdf_dictionary_object(self) -> rdflib.URIRef:
    """
    Return the node for the PDF dictionary, creating it on first access.

    On the first read, the node is built by
    controlled_dictionary_object_to_node from the graph, the base
    namespace, and pdf_dictionary_dict.  The node is cached on the
    instance, and a triple is added to the graph linking n_pdf_file_facet
    to the node via NS_UCO_OBSERVABLE.documentInformationDictionary.
    Later reads return the cached node without touching the graph.
    """
    if self._n_pdf_dictionary_object is not None:
        return self._n_pdf_dictionary_object

    n_dictionary = controlled_dictionary_object_to_node(
        self.graph, self.ns_base, self.pdf_dictionary_dict
    )
    # Cache before linking, so the node is already recorded when the
    # facet property is read below.
    self._n_pdf_dictionary_object = n_dictionary
    self.graph.add(
        (
            self.n_pdf_file_facet,
            NS_UCO_OBSERVABLE.documentInformationDictionary,
            n_dictionary,
        )
    )
    return n_dictionary

@property
def use_deterministic_uuids(self) -> bool:
"""
Expand Down
100 changes: 100 additions & 0 deletions tests/govdocs1/files/000/015/analysis.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"uco-core": "https://ontology.unifiedcyberontology.org/uco/core/",
"uco-observable": "https://ontology.unifiedcyberontology.org/uco/observable/",
"uco-types": "https://ontology.unifiedcyberontology.org/uco/types/",
"xsd": "http://www.w3.org/2001/XMLSchema#"
},
"@graph": [
Expand All @@ -22,6 +23,102 @@
"@value": "55964"
}
},
{
"@id": "kb:ControlledDictionary-0a0b2f35-20de-5bb7-a993-55740c68f7ca",
"@type": "uco-types:ControlledDictionary",
"uco-types:entry": [
{
"@id": "kb:ControlledDictionaryEntry-01dfe8b3-1e16-5f73-bb94-b918fa730543"
},
{
"@id": "kb:ControlledDictionaryEntry-0f02787f-3e37-5594-a993-8208a52099d8"
},
{
"@id": "kb:ControlledDictionaryEntry-49831587-5558-5b66-a220-7039f36edd4c"
},
{
"@id": "kb:ControlledDictionaryEntry-62910360-2495-554b-9096-5f363677d67e"
},
{
"@id": "kb:ControlledDictionaryEntry-6a80cc5e-e41f-55e8-864c-470a1575f5a4"
},
{
"@id": "kb:ControlledDictionaryEntry-8789fd16-8606-576b-8e00-ee2e9f6056d7"
},
{
"@id": "kb:ControlledDictionaryEntry-ae07f7b8-b25e-5117-8f8a-cab8b6498c19"
},
{
"@id": "kb:ControlledDictionaryEntry-d9c7584f-ba6e-5635-ba66-340ff3a15b26"
},
{
"@id": "kb:ControlledDictionaryEntry-df02edd9-5048-5776-a85f-d363a44ae14c"
},
{
"@id": "kb:ControlledDictionaryEntry-f8db854f-ad65-565c-b731-e17283b91b22"
}
]
},
{
"@id": "kb:ControlledDictionaryEntry-01dfe8b3-1e16-5f73-bb94-b918fa730543",
"@type": "uco-types:ControlledDictionaryEntry",
"uco-types:key": "Subject",
"uco-types:value": "Extracted Pages"
},
{
"@id": "kb:ControlledDictionaryEntry-0f02787f-3e37-5594-a993-8208a52099d8",
"@type": "uco-types:ControlledDictionaryEntry",
"uco-types:key": "PageCount",
"uco-types:value": "2"
},
{
"@id": "kb:ControlledDictionaryEntry-49831587-5558-5b66-a220-7039f36edd4c",
"@type": "uco-types:ControlledDictionaryEntry",
"uco-types:key": "Creator",
"uco-types:value": "ACOMP.exe WinVer 1b43 jul 14 2003"
},
{
"@id": "kb:ControlledDictionaryEntry-62910360-2495-554b-9096-5f363677d67e",
"@type": "uco-types:ControlledDictionaryEntry",
"uco-types:key": "Author",
"uco-types:value": "U.S. Government Printing Office"
},
{
"@id": "kb:ControlledDictionaryEntry-6a80cc5e-e41f-55e8-864c-470a1575f5a4",
"@type": "uco-types:ControlledDictionaryEntry",
"uco-types:key": "Producer",
"uco-types:value": "Acrobat Distiller 4.0 for Windows"
},
{
"@id": "kb:ControlledDictionaryEntry-8789fd16-8606-576b-8e00-ee2e9f6056d7",
"@type": "uco-types:ControlledDictionaryEntry",
"uco-types:key": "ModDate",
"uco-types:value": "2005:05:25 02:23:55"
},
{
"@id": "kb:ControlledDictionaryEntry-ae07f7b8-b25e-5117-8f8a-cab8b6498c19",
"@type": "uco-types:ControlledDictionaryEntry",
"uco-types:key": "CreationDate",
"uco-types:value": "2005:05:25 02:23:55"
},
{
"@id": "kb:ControlledDictionaryEntry-d9c7584f-ba6e-5635-ba66-340ff3a15b26",
"@type": "uco-types:ControlledDictionaryEntry",
"uco-types:key": "Title",
"uco-types:value": "Document"
},
{
"@id": "kb:ControlledDictionaryEntry-df02edd9-5048-5776-a85f-d363a44ae14c",
"@type": "uco-types:ControlledDictionaryEntry",
"uco-types:key": "Linearized",
"uco-types:value": "false"
},
{
"@id": "kb:ControlledDictionaryEntry-f8db854f-ad65-565c-b731-e17283b91b22",
"@type": "uco-types:ControlledDictionaryEntry",
"uco-types:key": "PDFVersion",
"uco-types:value": "1.5"
},
{
"@id": "kb:File-69728535-c7b0-548c-a7ac-46116019f793",
"@type": [
Expand Down Expand Up @@ -78,6 +175,9 @@
"drafting:pdfProducer": "Acrobat Distiller 4.0 for Windows",
"drafting:pdfSubject": "Extracted Pages",
"drafting:pdfTitle": "Document",
"uco-observable:documentInformationDictionary": {
"@id": "kb:ControlledDictionary-0a0b2f35-20de-5bb7-a993-55740c68f7ca"
},
"uco-observable:pdfCreationDate": {
"@type": "xsd:dateTime",
"@value": "2005-01-01T02:23:55"
Expand Down
78 changes: 78 additions & 0 deletions tests/govdocs1/files/000/015/analysis.ttl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix uco-core: <https://ontology.unifiedcyberontology.org/uco/core/> .
@prefix uco-observable: <https://ontology.unifiedcyberontology.org/uco/observable/> .
@prefix uco-types: <https://ontology.unifiedcyberontology.org/uco/types/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

kb:ContentDataFacet-c43d5237-b3f7-51d8-a7d6-ac412b476a39
Expand All @@ -17,6 +18,82 @@ kb:ContentDataFacet-c43d5237-b3f7-51d8-a7d6-ac412b476a39
uco-observable:sizeInBytes "55964"^^xsd:integer ;
.

kb:ControlledDictionary-0a0b2f35-20de-5bb7-a993-55740c68f7ca
a uco-types:ControlledDictionary ;
uco-types:entry
kb:ControlledDictionaryEntry-01dfe8b3-1e16-5f73-bb94-b918fa730543 ,
kb:ControlledDictionaryEntry-0f02787f-3e37-5594-a993-8208a52099d8 ,
kb:ControlledDictionaryEntry-49831587-5558-5b66-a220-7039f36edd4c ,
kb:ControlledDictionaryEntry-62910360-2495-554b-9096-5f363677d67e ,
kb:ControlledDictionaryEntry-6a80cc5e-e41f-55e8-864c-470a1575f5a4 ,
kb:ControlledDictionaryEntry-8789fd16-8606-576b-8e00-ee2e9f6056d7 ,
kb:ControlledDictionaryEntry-ae07f7b8-b25e-5117-8f8a-cab8b6498c19 ,
kb:ControlledDictionaryEntry-d9c7584f-ba6e-5635-ba66-340ff3a15b26 ,
kb:ControlledDictionaryEntry-df02edd9-5048-5776-a85f-d363a44ae14c ,
kb:ControlledDictionaryEntry-f8db854f-ad65-565c-b731-e17283b91b22
;
.

kb:ControlledDictionaryEntry-01dfe8b3-1e16-5f73-bb94-b918fa730543
a uco-types:ControlledDictionaryEntry ;
uco-types:key "Subject" ;
uco-types:value "Extracted Pages" ;
.

kb:ControlledDictionaryEntry-0f02787f-3e37-5594-a993-8208a52099d8
a uco-types:ControlledDictionaryEntry ;
uco-types:key "PageCount" ;
uco-types:value "2" ;
.

kb:ControlledDictionaryEntry-49831587-5558-5b66-a220-7039f36edd4c
a uco-types:ControlledDictionaryEntry ;
uco-types:key "Creator" ;
uco-types:value "ACOMP.exe WinVer 1b43 jul 14 2003" ;
.

kb:ControlledDictionaryEntry-62910360-2495-554b-9096-5f363677d67e
a uco-types:ControlledDictionaryEntry ;
uco-types:key "Author" ;
uco-types:value "U.S. Government Printing Office" ;
.

kb:ControlledDictionaryEntry-6a80cc5e-e41f-55e8-864c-470a1575f5a4
a uco-types:ControlledDictionaryEntry ;
uco-types:key "Producer" ;
uco-types:value "Acrobat Distiller 4.0 for Windows" ;
.

kb:ControlledDictionaryEntry-8789fd16-8606-576b-8e00-ee2e9f6056d7
a uco-types:ControlledDictionaryEntry ;
uco-types:key "ModDate" ;
uco-types:value "2005:05:25 02:23:55" ;
.

kb:ControlledDictionaryEntry-ae07f7b8-b25e-5117-8f8a-cab8b6498c19
a uco-types:ControlledDictionaryEntry ;
uco-types:key "CreationDate" ;
uco-types:value "2005:05:25 02:23:55" ;
.

kb:ControlledDictionaryEntry-d9c7584f-ba6e-5635-ba66-340ff3a15b26
a uco-types:ControlledDictionaryEntry ;
uco-types:key "Title" ;
uco-types:value "Document" ;
.

kb:ControlledDictionaryEntry-df02edd9-5048-5776-a85f-d363a44ae14c
a uco-types:ControlledDictionaryEntry ;
uco-types:key "Linearized" ;
uco-types:value "false" ;
.

kb:ControlledDictionaryEntry-f8db854f-ad65-565c-b731-e17283b91b22
a uco-types:ControlledDictionaryEntry ;
uco-types:key "PDFVersion" ;
uco-types:value "1.5" ;
.

kb:File-69728535-c7b0-548c-a7ac-46116019f793
a
uco-observable:ObservableObject ,
Expand Down Expand Up @@ -55,6 +132,7 @@ kb:PDFFileFacet-b8927948-bdae-58e6-b6cd-41841eafe1b2
drafting:pdfProducer "Acrobat Distiller 4.0 for Windows" ;
drafting:pdfSubject "Extracted Pages" ;
drafting:pdfTitle "Document" ;
uco-observable:documentInformationDictionary kb:ControlledDictionary-0a0b2f35-20de-5bb7-a993-55740c68f7ca ;
uco-observable:pdfCreationDate "2005-01-01T02:23:55"^^xsd:dateTime ;
uco-observable:pdfModDate "2005-01-01T02:23:55"^^xsd:dateTime ;
uco-observable:version "1.5" ;
Expand Down

0 comments on commit f7b28a7

Please sign in to comment.