Skip to content

Commit

Permalink
Merge pull request #56 from sldouglas-nist/add_pdf_file_mapping
Browse files Browse the repository at this point in the history
Add PDF File Mapping
  • Loading branch information
ajnelson-nist authored Jun 28, 2024
2 parents e1a0394 + d59cb21 commit 7979044
Show file tree
Hide file tree
Showing 10 changed files with 627 additions and 95 deletions.
156 changes: 145 additions & 11 deletions case_exiftool/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@

_logger = logging.getLogger(os.path.basename(__file__))

NS_DRAFTING = rdflib.Namespace("http://example.org/ontology/drafting/")
NS_EXIFTOOL_COMPOSITE = rdflib.Namespace("http://ns.exiftool.org/Composite/1.0/")
NS_EXIFTOOL_ET = rdflib.Namespace("http://ns.exiftool.org/1.0/")
NS_EXIFTOOL_EXIFTOOL = rdflib.Namespace("http://ns.exiftool.org/ExifTool/1.0/")
Expand All @@ -50,6 +51,7 @@
NS_EXIFTOOL_IFD0 = rdflib.Namespace("http://ns.exiftool.org/EXIF/IFD0/1.0/")
NS_EXIFTOOL_EXIFIFD = rdflib.Namespace("http://ns.exiftool.org/EXIF/ExifIFD/1.0/")
NS_EXIFTOOL_NIKON = rdflib.Namespace("http://ns.exiftool.org/MakerNotes/Nikon/1.0/")
NS_EXIFTOOL_PDF_PDF = rdflib.Namespace("http://ns.exiftool.org/PDF/PDF/1.0/")
NS_EXIFTOOL_PREVIEWIFD = rdflib.Namespace(
"http://ns.exiftool.org/MakerNotes/PreviewIFD/1.0/"
)
Expand Down Expand Up @@ -175,12 +177,13 @@ def __init__(
self._n_exif_facet: typing.Optional[rdflib.URIRef] = None
self._n_file_facet: typing.Optional[rdflib.URIRef] = None
self._n_location_object: typing.Optional[rdflib.URIRef] = None
self._n_pdf_file_facet: typing.Optional[rdflib.URIRef] = None
self._n_location_object_latlong_facet: typing.Optional[rdflib.URIRef] = None
self._n_observable_object: typing.Optional[rdflib.URIRef] = None
self._n_raster_picture_facet: typing.Optional[rdflib.URIRef] = None
self._n_relationship_object_location: typing.Optional[rdflib.URIRef] = None
self._n_unix_file_permissions_facet: typing.Optional[rdflib.URIRef] = None
self._oo_slug: typing.Optional[str] = None
self._oo_slug: str = "File-"
self.ns_base = ns_base

def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None:
Expand Down Expand Up @@ -369,6 +372,110 @@ def map_raw_and_printconv_iri(self, n_exiftool_predicate: rdflib.URIRef) -> None
rdflib.Literal(int(v_raw.toPython())),
)
)
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Author":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.graph.add(
(
self.n_pdf_file_facet,
NS_DRAFTING.pdfAuthor,
rdflib.Literal(v_raw.toPython()),
)
)
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/CreateDate":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.graph.add(
(
self.n_pdf_file_facet,
NS_UCO_OBSERVABLE.pdfCreationDate,
rdflib.Literal(
v_raw.toPython().replace(" ", "T"), datatype=NS_XSD.dateTime
),
)
)
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Creator":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.graph.add(
(
self.n_pdf_file_facet,
NS_DRAFTING.pdfCreator,
rdflib.Literal(v_raw.toPython()),
)
)
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Linearized":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.graph.add(
(
self.n_pdf_file_facet,
NS_DRAFTING.pdfLinearized,
rdflib.Literal(v_raw.toPython()),
)
)
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/ModifyDate":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.graph.add(
(
self.n_pdf_file_facet,
NS_UCO_OBSERVABLE.pdfModDate,
rdflib.Literal(
v_raw.toPython().replace(" ", "T"), datatype=NS_XSD.dateTime
),
)
)
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/PDFVersion":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.graph.add(
(
self.n_pdf_file_facet,
NS_UCO_OBSERVABLE.version,
rdflib.Literal(v_raw.toPython()),
)
)
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/PageCount":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.graph.add(
(
self.n_pdf_file_facet,
NS_DRAFTING.pdfPageCount,
rdflib.Literal(v_raw.toPython()),
)
)
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Producer":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.graph.add(
(
self.n_pdf_file_facet,
NS_DRAFTING.pdfProducer,
rdflib.Literal(v_raw.toPython()),
)
)
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Subject":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.graph.add(
(
self.n_pdf_file_facet,
NS_DRAFTING.pdfSubject,
rdflib.Literal(v_raw.toPython()),
)
)
elif exiftool_iri == "http://ns.exiftool.org/PDF/PDF/1.0/Title":
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
if isinstance(v_raw, rdflib.Literal):
self.graph.add(
(
self.n_pdf_file_facet,
NS_DRAFTING.pdfTitle,
rdflib.Literal(v_raw.toPython()),
)
)
else:
# Somewhat in the name of information preservation, somewhat as a progress marker on converting data: Attach all remaining unconverted properties directly to the ObservableObject. Provide both values to assist with mapping decisions.
(v_raw, v_printconv) = self.pop_n_exiftool_predicate(n_exiftool_predicate)
Expand Down Expand Up @@ -423,16 +530,6 @@ def _load_xml_file_into_dict(
rdflib.URIRef("http://ns.exiftool.org/File/1.0/MIMEType")
)

# Determine slug by MIME type.
self.oo_slug = "File-" # The prefix "oo_" means generic observable object.
if self.mime_type == "image/jpeg":
self.oo_slug = "Picture-"
else:
_logger.warning("TODO - MIME type %r not yet implemented." % self.mime_type)

# Access observable object to instantiate it with the oo_slug value.
_ = self.n_observable_object

# Finish special case MIME type processing left undone by map_raw_and_printconv_iri.
if self.mime_type is not None:
self.graph.add(
Expand Down Expand Up @@ -514,6 +611,19 @@ def mime_type(self) -> typing.Optional[str]:
def mime_type(self, value: str) -> None:
    """
    Store the MIME type and assign a matching RDF type to the observable object.

    For MIME types with a known mapping, an ``rdf:type`` triple is added to
    ``self.graph`` with ``self.n_observable_object`` as the subject:

    * ``application/pdf`` -> ``NS_UCO_OBSERVABLE.PDFFile``
    * ``image/jpeg`` -> ``NS_UCO_OBSERVABLE.RasterPicture``

    Any other MIME type is stored, and a warning is logged that it is not
    yet implemented; no triple is added in that case.

    :param value: The MIME type string to record.
    :raises AssertionError: If ``value`` is not a ``str``.
    """
    assert isinstance(value, str)
    self._mime_type = value

    # Every branch tests the value that was just stored, instead of mixing
    # the argument with a read back through the property getter.
    if value == "application/pdf":
        self.graph.add(
            (self.n_observable_object, NS_RDF.type, NS_UCO_OBSERVABLE.PDFFile)
        )
    elif value == "image/jpeg":
        self.graph.add(
            (self.n_observable_object, NS_RDF.type, NS_UCO_OBSERVABLE.RasterPicture)
        )
    else:
        # Hand the argument to the logger so the message is only formatted
        # when the warning is actually emitted.
        _logger.warning("TODO - MIME type %r not yet implemented.", value)

@property
def n_camera_object(self) -> rdflib.URIRef:
Expand Down Expand Up @@ -851,6 +961,28 @@ def oo_slug(self, value: str) -> None:
assert isinstance(value, str)
self._oo_slug = value

@property
def n_pdf_file_facet(self) -> rdflib.URIRef:
    """
    Node for this file's PDFFileFacet; built on first access, then reused.

    On the first access the node IRI is chosen (through
    ``case_utils.inherent_uuid.get_facet_uriref`` when
    ``self.use_deterministic_uuids`` is set, otherwise from ``self.ns_base``
    with a ``local_uuid()`` suffix), and two triples are added to
    ``self.graph``: the node's ``rdf:type`` and the ``hasFacet`` link from
    the observable object.
    """
    # Already built: hand back the cached node without touching the graph.
    if self._n_pdf_file_facet is not None:
        return self._n_pdf_file_facet

    if not self.use_deterministic_uuids:
        n_facet = self.ns_base["PDFFileFacet-" + local_uuid()]
    else:
        n_facet = case_utils.inherent_uuid.get_facet_uriref(
            self.n_observable_object,
            NS_UCO_OBSERVABLE.PDFFileFacet,
            namespace=self.ns_base,
        )
    # Cache before writing triples, matching the order of the first-access path.
    self._n_pdf_file_facet = n_facet

    type_triple = (n_facet, NS_RDF.type, NS_UCO_OBSERVABLE.PDFFileFacet)
    self.graph.add(type_triple)
    link_triple = (self.n_observable_object, NS_UCO_CORE.hasFacet, n_facet)
    self.graph.add(link_triple)
    return n_facet

@property
def use_deterministic_uuids(self) -> bool:
"""
Expand All @@ -868,10 +1000,12 @@ def main() -> None:
NS_BASE = rdflib.Namespace(args.base_prefix)
out_graph = rdflib.Graph()

out_graph.namespace_manager.bind("drafting", NS_DRAFTING)
out_graph.namespace_manager.bind("exiftool-Composite", NS_EXIFTOOL_COMPOSITE)
out_graph.namespace_manager.bind("exiftool-et", NS_EXIFTOOL_ET)
out_graph.namespace_manager.bind("exiftool-ExifTool", NS_EXIFTOOL_EXIFTOOL)
out_graph.namespace_manager.bind("exiftool-System", NS_EXIFTOOL_SYSTEM)
out_graph.namespace_manager.bind("exiftool-PDF-PDF", NS_EXIFTOOL_PDF_PDF)
out_graph.namespace_manager.bind("exiftool-File", NS_EXIFTOOL_FILE)
out_graph.namespace_manager.bind("exiftool-GPS", NS_EXIFTOOL_GPS)
out_graph.namespace_manager.bind("exiftool-IFD0", NS_EXIFTOOL_IFD0)
Expand Down
23 changes: 17 additions & 6 deletions tests/govdocs1/Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
#!/usr/bin/make -f

# Portions of this file contributed by NIST are governed by the
# following statement:
#
# This software was developed at the National Institute of Standards
# and Technology by employees of the Federal Government in the course
# of their official duties. Pursuant to title 17 Section 105 of the
# United States Code this software is not subject to copyright
# protection and is in the public domain. NIST assumes no
# responsibility whatsoever for its use by other parties, and makes
# no guarantees, expressed or implied, about its quality,
# reliability, or any other characteristic.
# of their official duties. Pursuant to Title 17 Section 105 of the
# United States Code, this software is not subject to copyright
# protection within the United States. NIST assumes no responsibility
# whatsoever for its use by other parties, and makes no guarantees,
# expressed or implied, about its quality, reliability, or any other
# characteristic.
#
# We would appreciate acknowledgement if the software is used.

Expand All @@ -16,13 +19,21 @@ SHELL := /bin/bash
# Run the default goal of the Makefile in each listed sample directory.
all:
$(MAKE) \
--directory files/799/987
$(MAKE) \
--directory files/000/015

# Run the `check` goal in each listed sample directory.
check:
$(MAKE) \
--directory files/799/987 \
check
$(MAKE) \
--directory files/000/015 \
check

# Run the `clean` goal in each listed sample directory.
# The leading `@` keeps make from echoing the command line.
clean:
@$(MAKE) \
--directory files/799/987 \
clean
@$(MAKE) \
--directory files/000/015 \
clean
32 changes: 32 additions & 0 deletions tests/govdocs1/files/000/015/000015_printConv.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<?xml version='1.0' encoding='UTF-8'?>
<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>

<rdf:Description rdf:about='http://example.org/kb/govdocs1/000015.pdf'
xmlns:et='http://ns.exiftool.org/1.0/' et:toolkit='Image::ExifTool 12.76'
xmlns:ExifTool='http://ns.exiftool.org/ExifTool/1.0/'
xmlns:System='http://ns.exiftool.org/File/System/1.0/'
xmlns:File='http://ns.exiftool.org/File/1.0/'
xmlns:PDF='http://ns.exiftool.org/PDF/PDF/1.0/'>
<ExifTool:ExifToolVersion>12.76</ExifTool:ExifToolVersion>
<System:FileName>000015.pdf</System:FileName>
<System:Directory>.</System:Directory>
<System:FileSize>56 kB</System:FileSize>
<System:FileModifyDate>2024:06:13 17:32:40-04:00</System:FileModifyDate>
<System:FileAccessDate>2024:06:13 17:34:31-04:00</System:FileAccessDate>
<System:FileInodeChangeDate>2024:06:13 17:32:40-04:00</System:FileInodeChangeDate>
<System:FilePermissions>-rw-r--r--</System:FilePermissions>
<File:FileType>PDF</File:FileType>
<File:FileTypeExtension>pdf</File:FileTypeExtension>
<File:MIMEType>application/pdf</File:MIMEType>
<PDF:PDFVersion>1.5</PDF:PDFVersion>
<PDF:Linearized>No</PDF:Linearized>
<PDF:ModifyDate>2005:05:25 02:23:55</PDF:ModifyDate>
<PDF:Title>Document</PDF:Title>
<PDF:Creator>ACOMP.exe WinVer 1b43 jul 14 2003</PDF:Creator>
<PDF:Producer>Acrobat Distiller 4.0 for Windows</PDF:Producer>
<PDF:CreateDate>2005:05:25 02:23:55</PDF:CreateDate>
<PDF:Author>U.S. Government Printing Office</PDF:Author>
<PDF:Subject>Extracted Pages</PDF:Subject>
<PDF:PageCount>2</PDF:PageCount>
</rdf:Description>
</rdf:RDF>
32 changes: 32 additions & 0 deletions tests/govdocs1/files/000/015/000015_raw.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<?xml version='1.0' encoding='UTF-8'?>
<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>

<rdf:Description rdf:about='http://example.org/kb/govdocs1/000015.pdf'
xmlns:et='http://ns.exiftool.org/1.0/' et:toolkit='Image::ExifTool 12.76'
xmlns:ExifTool='http://ns.exiftool.org/ExifTool/1.0/'
xmlns:System='http://ns.exiftool.org/File/System/1.0/'
xmlns:File='http://ns.exiftool.org/File/1.0/'
xmlns:PDF='http://ns.exiftool.org/PDF/PDF/1.0/'>
<ExifTool:ExifToolVersion>12.76</ExifTool:ExifToolVersion>
<System:FileName>000015.pdf</System:FileName>
<System:Directory>.</System:Directory>
<System:FileSize>55964</System:FileSize>
<System:FileModifyDate>2024:06:13 17:32:40-04:00</System:FileModifyDate>
<System:FileAccessDate>2024:06:13 17:34:31-04:00</System:FileAccessDate>
<System:FileInodeChangeDate>2024:06:13 17:32:40-04:00</System:FileInodeChangeDate>
<System:FilePermissions>100644</System:FilePermissions>
<File:FileType>PDF</File:FileType>
<File:FileTypeExtension>PDF</File:FileTypeExtension>
<File:MIMEType>application/pdf</File:MIMEType>
<PDF:PDFVersion>1.5</PDF:PDFVersion>
<PDF:Linearized>false</PDF:Linearized>
<PDF:ModifyDate>2005:05:25 02:23:55</PDF:ModifyDate>
<PDF:Title>Document</PDF:Title>
<PDF:Creator>ACOMP.exe WinVer 1b43 jul 14 2003</PDF:Creator>
<PDF:Producer>Acrobat Distiller 4.0 for Windows</PDF:Producer>
<PDF:CreateDate>2005:05:25 02:23:55</PDF:CreateDate>
<PDF:Author>U.S. Government Printing Office</PDF:Author>
<PDF:Subject>Extracted Pages</PDF:Subject>
<PDF:PageCount>2</PDF:PageCount>
</rdf:Description>
</rdf:RDF>
Loading

0 comments on commit 7979044

Please sign in to comment.