diff --git a/LICENSE b/LICENSE
index 261eeb9..3e4a79b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -66,7 +66,7 @@
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
+ copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
diff --git a/MANIFEST.in b/MANIFEST.in
index 4927edd..0ae76ad 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,5 @@
include LICENSE
include README.md
recursive-include readux_ingest_ecds/templates *
+recursive-include readux_ingest_ecds/services *
prune test*
diff --git a/README.md b/README.md
index 8a9d903..6d2b81f 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@ python manage.py migrate readux_ingest_ecds
| IIIF_RELATED_LINK_MODEL | Model reference, eg. 'iiif.RelatedLink' |
| IIIF_CANVAS_MODEL | Model reference, eg. 'iiif.Canvas' |
| IIIF_COLLECTION_MODEL | Model reference, eg. 'iiif.Collection' |
+| IIIF_OCR_MODEL | Model reference, eg. 'iiif.OCR' |
| INGEST_TMP_DIR | Absolute path where files will be temporarily stored. |
| INGEST_PROCESSING_DIR | Absolute path where Lambda will look for images. |
| INGEST_OCR_DIR | Absolute path where OCR files will be preserved. |
diff --git a/readux_ingest_ecds/admin.py b/readux_ingest_ecds/admin.py
index 35d3af4..6a847e6 100644
--- a/readux_ingest_ecds/admin.py
+++ b/readux_ingest_ecds/admin.py
@@ -1,10 +1,7 @@
import os
import logging
from django.contrib import admin
-from django.urls import reverse
-from django.utils.html import format_html
from django.shortcuts import redirect
-from django_celery_results.models import TaskResult
from .models import Local
from .tasks import local_ingest_task_ecds
@@ -18,15 +15,15 @@ class LocalAdmin(admin.ModelAdmin):
def save_model(self, request, obj, form, change):
LOGGER.info(f'INGEST: Local ingest started by {request.user.username}')
obj.creator = request.user
- obj.process()
+ obj.prep()
super().save_model(request, obj, form, change)
-
- def response_add(self, request, obj, post_url_continue=None):
- obj.refresh_from_db()
if os.environ["DJANGO_ENV"] != 'test': # pragma: no cover
local_ingest_task_ecds.apply_async(args=[obj.id])
else:
local_ingest_task_ecds(obj.id)
+
+ def response_add(self, request, obj, post_url_continue=None):
+ obj.refresh_from_db()
LOGGER.info(f'INGEST: Local ingest - {obj.id} - added for {obj.manifest.pid}')
return redirect('/admin/manifests/manifest/{m}/change/'.format(m=obj.manifest.pk))
diff --git a/readux_ingest_ecds/helpers.py b/readux_ingest_ecds/helpers.py
index d45f0e3..2249663 100644
--- a/readux_ingest_ecds/helpers.py
+++ b/readux_ingest_ecds/helpers.py
@@ -10,6 +10,7 @@ def get_iiif_models():
'RelatedLink': apps.get_model(settings.IIIF_RELATED_LINK_MODEL),
'Canvas': apps.get_model(settings.IIIF_CANVAS_MODEL),
'Collection': apps.get_model(settings.IIIF_COLLECTION_MODEL),
+ 'OCR': apps.get_model(settings.IIIF_OCR_MODEL),
}
except AppRegistryNotReady:
return {
@@ -18,4 +19,5 @@ def get_iiif_models():
'RelatedLink': settings.IIIF_RELATED_LINK_MODEL,
'Canvas': settings.IIIF_CANVAS_MODEL,
'Collection': settings.IIIF_COLLECTION_MODEL,
+ 'OCR': settings.IIIF_OCR_MODEL,
}
diff --git a/readux_ingest_ecds/models.py b/readux_ingest_ecds/models.py
index dd1897c..bbf7d10 100644
--- a/readux_ingest_ecds/models.py
+++ b/readux_ingest_ecds/models.py
@@ -4,7 +4,9 @@
from django.core.files.storage import FileSystemStorage
from django.db import models
from django.conf import settings
-from .services import is_image, is_ocr, is_junk, metadata_from_file, create_manifest, move_image_file, move_ocr_file, canvas_dimensions, upload_trigger_file
+from .services.file_services import is_image, is_ocr, is_junk, move_image_file, move_ocr_file, canvas_dimensions, upload_trigger_file
+from .services.iiif_services import create_manifest
+from .services.metadata_services import metadata_from_file
from .helpers import get_iiif_models
Manifest = get_iiif_models()['Manifest']
@@ -74,12 +76,13 @@ def ocr_directory(self):
def trigger_file(self):
return os.path.join(settings.INGEST_TMP_DIR, f'{self.manifest.pid}.txt')
- def process(self):
+ def prep(self):
"""
Open metadata
Create manifest
Unzip bundle
"""
+ LOGGER.info(f'INGEST: Local ingest - preparing new local ingest')
os.makedirs(settings.INGEST_TMP_DIR, exist_ok=True)
os.makedirs(settings.INGEST_PROCESSING_DIR, exist_ok=True)
os.makedirs(settings.INGEST_OCR_DIR, exist_ok=True)
diff --git a/readux_ingest_ecds/services.py b/readux_ingest_ecds/services.py
deleted file mode 100644
index 98ad3ad..0000000
--- a/readux_ingest_ecds/services.py
+++ /dev/null
@@ -1,353 +0,0 @@
-""" Module of service classes and methods for ingest. """
-import itertools
-import os
-from shutil import move
-from PIL import Image
-from boto3 import resource
-from tablib.core import Dataset
-from mimetypes import guess_type
-from urllib.parse import unquote, urlparse
-
-from django.conf import settings
-
-from .helpers import get_iiif_models
-
-Manifest = get_iiif_models()['Manifest']
-RelatedLink = get_iiif_models()['RelatedLink']
-
-def clean_metadata(metadata):
- """Remove keys that do not align with Manifest fields.
-
- :param metadata:
- :type metadata: tablib.Dataset
- :return: Dictionary with keys matching Manifest fields
- :rtype: dict
- """
- metadata = {key.casefold().replace(' ', '_'): value for key, value in metadata.items()}
- fields = [f.name for f in get_iiif_models()['Manifest']._meta.get_fields()]
- invalid_keys = []
-
- for key in metadata.keys():
- if key != 'metadata' and isinstance(metadata[key], list):
- if isinstance(metadata[key][0], dict):
- for meta_key in metadata[key][0].keys():
- if 'value' in meta_key:
- metadata[key] = metadata[key][0][meta_key]
- else:
- metadata[key] = ', '.join(metadata[key])
- if key not in fields:
- invalid_keys.append(key)
-
- for invalid_key in invalid_keys:
- metadata.pop(invalid_key)
-
- return metadata
-
-def create_manifest(ingest):
- """
- Create or update a Manifest from supplied metadata and images.
- :return: New or updated Manifest with supplied `pid`
- :rtype: iiif.manifest.models.Manifest
- """
- Manifest = get_iiif_models()['Manifest']
- manifest = None
- # Make a copy of the metadata so we don't extract it over and over.
- try:
- if not bool(ingest.manifest) or ingest.manifest is None:
- ingest.open_metadata()
-
- metadata = dict(ingest.metadata)
- except TypeError:
- metadata = None
- if metadata:
- if 'pid' in metadata:
- manifest, created = Manifest.objects.get_or_create(pid=metadata['pid'].replace('_', '-'))
- else:
- manifest = Manifest.objects.create()
- for (key, value) in metadata.items():
- setattr(manifest, key, value)
- else:
- manifest = Manifest()
-
- manifest.image_server = ingest.image_server
-
- # This was giving me a 'django.core.exceptions.AppRegistryNotReady: Models aren't loaded yet' error.
- # Remote = apps.get_model('ingest.remote')
-
- # Ensure that manifest has an ID before updating the M2M relationship
- manifest.save()
- # if not isinstance(ingest, Remote):
- manifest.refresh_from_db()
- manifest.collections.set(ingest.collections.all())
- # Save again once relationship is set
- manifest.save()
-
- # if type(ingest, .models.Remote):
- # if isinstance(ingest, Remote):
- # RelatedLink(
- # manifest=manifest,
- # link=ingest.remote_url,
- # format='application/ld+json'
- # ).save()
-
- return manifest
-
-def extract_image_server(canvas):
- """Determines the IIIF image server URL for a given IIIF Canvas
-
- :param canvas: IIIF Canvas
- :type canvas: dict
- :return: IIIF image server URL
- :rtype: str
- """
- url = urlparse(canvas['images'][0]['resource']['service']['@id'])
- parts = url.path.split('/')
- parts.pop()
- base_path = '/'.join(parts)
- host = url.hostname
- if url.port is not None:
- host = '{h}:{p}'.format(h=url.hostname, p=url.port)
- return '{s}://{h}{p}'.format(s=url.scheme, h=host, p=base_path)
-
-def parse_iiif_v2_manifest(data):
- """Parse IIIF Manifest based on v2.1.1 or the presentation API.
- https://iiif.io/api/presentation/2.1
-
- :param data: IIIF Presentation v2.1.1 manifest
- :type data: dict
- :return: Extracted metadata
- :rtype: dict
- """
- properties = {}
- manifest_data = []
-
- if 'metadata' in data:
- manifest_data.append({ 'metadata': data['metadata'] })
-
- for iiif_metadata in [{prop['label']: prop['value']} for prop in data['metadata']]:
- properties.update(iiif_metadata)
-
- # Sometimes, the label appears as a list.
- if 'label' in data.keys() and isinstance(data['label'], list):
- data['label'] = ' '.join(data['label'])
-
- manifest_data.extend([{prop: data[prop]} for prop in data if isinstance(data[prop], str)])
-
- for datum in manifest_data:
- properties.update(datum)
-
- uri = urlparse(data['@id'])
-
- if not uri.query:
- properties['pid'] = uri.path.split('/')[-2]
- else:
- properties['pid'] = uri.query
-
- if 'description' in data.keys():
- if isinstance(data['description'], list):
- if isinstance(data['description'][0], dict):
- en = [lang['@value'] for lang in data['description'] if lang['@language'] == 'en']
- properties['summary'] = data['description'][0]['@value'] if not en else en[0]
- else:
- properties['summary'] = data['description'][0]
- else:
- properties['summary'] = data['description']
-
- if 'logo' in properties:
- properties['logo_url'] = properties['logo']
- properties.pop('logo')
-
- manifest_metadata = clean_metadata(properties)
-
- return manifest_metadata
-
-def parse_iiif_v2_canvas(canvas):
- """ """
- canvas_id = canvas['@id'].split('/')
- pid = canvas_id[-1] if canvas_id[-1] != 'canvas' else canvas_id[-2]
-
- service = urlparse(canvas['images'][0]['resource']['service']['@id'])
- resource = unquote(service.path.split('/').pop())
-
- summary = canvas['description'] if 'description' in canvas.keys() else ''
- label = canvas['label'] if 'label' in canvas.keys() else ''
- return {
- 'pid': pid,
- 'height': canvas['height'],
- 'width': canvas['width'],
- 'summary': summary,
- 'label': label,
- 'resource': resource
- }
-
-def get_metadata_from(files):
- """
- Find metadata file in uploaded files.
- :return: If metadata file exists, returns the values. If no file, returns None.
- :rtype: list or None
- """
- metadata = None
- for file in files:
- if metadata is not None:
- continue
- if 'zip' in guess_type(file.name)[0]:
- continue
- if 'metadata' in file.name.casefold():
- stream = file.read()
- if 'csv' in guess_type(file.name)[0] or 'tab-separated' in guess_type(file.name)[0]:
- metadata = Dataset().load(stream.decode('utf-8-sig'), format='csv').dict
- else:
- metadata = Dataset().load(stream).dict
- return metadata
-
-def metadata_from_file(metadata_file):
- format = metadata_file_format(metadata_file)
- if format is None:
- return
-
- metadata = None
-
- if format == 'excel':
- with open(metadata_file, 'rb') as fh:
- metadata = Dataset().load(fh.read(), format=metadata_file.split('.')[-1])
- else:
- with open(metadata_file, 'r', encoding="utf-8-sig") as fh:
- metadata = Dataset().load(fh.read(), format=format)
-
- if metadata is not None:
- metadata = clean_metadata(metadata.dict[0])
-
- return metadata
-
-def get_associated_meta(all_metadata, file):
- """
- Associate metadata with filename.
- :return: If a matching filename is found, returns the row as dict,
- with generated pid. Otherwise, returns {}.
- :rtype: dict
- """
- file_meta = {}
- extless_filename = file.name[0:file.name.rindex('.')]
- for meta_dict in all_metadata:
- for key, val in meta_dict.items():
- if key.casefold() == 'filename':
- metadata_found_filename = val
- # Match filename column, case-sensitive, against filename
- if metadata_found_filename and metadata_found_filename in (extless_filename, file.name):
- file_meta = meta_dict
- return file_meta
-
-def lowercase_first_line(iterator):
- """Lowercase the first line of a text file (such as the header row of a CSV)"""
- return itertools.chain(
- # ignore unicode characters, set lowercase, and strip whitespace
- [next(iterator).encode('ascii', 'ignore').decode().casefold().strip()], iterator
- )
-
-def is_image(file_path):
- """Check if file is expected type for image files
-
- :param file_path: Name of file to check
- :type file_path: str
- :return: Bool if file type is an image.
- :rtype: bool
- """
- return file_path is not None and 'images' in file_path and 'image' in guess_type(file_path)[0]
-
-def is_ocr(file_path):
- """Check if file is expected type for OCR files
-
- :param file_path: Name of file to check
- :type file_path: str
- :return: Bool if file type matches OCR file types.
- :rtype: bool
- """
- ocr_file_types = ['text', 'xml','json','html', 'hocr', 'tsv']
- return file_path is not None and 'ocr' in file_path and any(file_path.endswith(ocr_type) for ocr_type in ocr_file_types)
-
-def metadata_file_format(file_path):
- """Get format used to read the metadata file
-
- :param file_path: Name of metadata file
- :type file_path: str
- :return: Format of metadata file, csv, tsv, excel, or None
- :rtype: str, None
- """
- if file_path is None:
- return None
-
- file_type = guess_type(file_path)[0]
-
- if 'csv' in file_type:
- return 'csv'
- elif 'tab-separated' in file_type:
- return 'tsv'
- elif 'officedocument' in file_type:
- return 'excel'
-
- return None
-
-def is_junk(file_path):
- """Check if a file should be considered junk
-
- :param file_path: File name to check
- :type file_path: str
- :return: True if file name starts with special char
- :rtype: bol
- """
- return file_path.startswith('.') or file_path.startswith('~') or file_path.startswith('__') or file_path.endswith('/') or file_path == ''
-
-def move_image_file(ingest, file_path):
- """ Move files to directory where they processed.
- Add the Manifest pid to the file name if not already there.
-
- :param ingest: Ingest object
- :type ingest: _type_
- :param file_path: Absolute path of tmp file
- :type file_path: str
- :return: File name file to be processed
- :rtype: str
- """
- base_name = os.path.basename(file_path)
- if ingest.manifest.pid not in base_name:
- base_name = f'{ingest.manifest.pid}_{base_name}'
- move(file_path, os.path.join(settings.INGEST_PROCESSING_DIR, base_name))
- return base_name
-
-def move_ocr_file(ingest, file_path):
- """ Move OCR file to where it belongs.
-
- :param ingest: Ingest object
- :type ingest: _type_
- :param file_path: Absolute path of tmp file
- :type file_path: str
- """
- base_name = os.path.basename(file_path)
- if ingest.manifest.pid not in base_name:
- base_name = f'{ingest.manifest.pid}_{base_name}'
- move(file_path, os.path.join(ingest.ocr_directory, base_name))
-
-def upload_trigger_file(trigger_file):
- """
- Upload trigger file to S3. The file contains a list of images being ingested.
- The file will be picked up by an AWS lambda function and the images will be
- converted to ptiffs.
-
- :param trigger_file: Absolute path to trigger file.
- :type trigger_file: str
- """
- s3 = resource('s3')
- s3.Bucket(settings.INGEST_TRIGGER_BUCKET).upload_file(trigger_file, os.path.basename(trigger_file))
-
-def canvas_dimensions(image_name):
- """Get canvas dimensions
-
- :param image_name: File name without extension of image file.
- :type image_name: str
- :return: 2-tuple containing width and height (in pixels)
- :rtype: tuple
- """
- original_image = [img for img in os.listdir(settings.INGEST_PROCESSING_DIR) if img.startswith(image_name)]
- if len(original_image) > 0:
- return Image.open(os.path.join(settings.INGEST_PROCESSING_DIR, original_image[0])).size
- return (0,0)
diff --git a/readux_ingest_ecds/services/__init__.py b/readux_ingest_ecds/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/readux_ingest_ecds/services/file_services.py b/readux_ingest_ecds/services/file_services.py
new file mode 100644
index 0000000..8618538
--- /dev/null
+++ b/readux_ingest_ecds/services/file_services.py
@@ -0,0 +1,99 @@
+""" Module of service methods for ingest files. """
+import os
+from shutil import move
+from PIL import Image
+from boto3 import resource
+from mimetypes import guess_type
+
+from django.conf import settings
+
+from readux_ingest_ecds.helpers import get_iiif_models
+
+Manifest = get_iiif_models()['Manifest']
+RelatedLink = get_iiif_models()['RelatedLink']
+
+def is_image(file_path):
+ """Check if file is expected type for image files
+
+ :param file_path: Name of file to check
+ :type file_path: str
+ :return: Bool if file type is an image.
+ :rtype: bool
+ """
+ return file_path is not None and 'images' in file_path and 'image' in guess_type(file_path)[0]
+
+def is_ocr(file_path):
+ """Check if file is expected type for OCR files
+
+ :param file_path: Name of file to check
+ :type file_path: str
+ :return: Bool if file type matches OCR file types.
+ :rtype: bool
+ """
+ ocr_file_types = ['text', 'xml','json','html', 'hocr', 'tsv']
+ return file_path is not None and 'ocr' in file_path and any(file_path.endswith(ocr_type) for ocr_type in ocr_file_types)
+
+def is_junk(file_path):
+ """Check if a file should be considered junk
+
+ :param file_path: File name to check
+ :type file_path: str
+ :return: True if file name starts with special char
+ :rtype: bool
+ """
+ return file_path.startswith('.') or file_path.startswith('~') or file_path.startswith('__') or file_path.endswith('/') or file_path == ''
+
+def move_image_file(ingest, file_path):
+ """ Move files to the directory where they are processed.
+ Add the Manifest pid to the file name if not already there.
+
+ :param ingest: Ingest object
+ :type ingest: _type_
+ :param file_path: Absolute path of tmp file
+ :type file_path: str
+ :return: File name file to be processed
+ :rtype: str
+ """
+ base_name = os.path.basename(file_path)
+ if ingest.manifest.pid not in base_name:
+ base_name = f'{ingest.manifest.pid}_{base_name}'
+ move(file_path, os.path.join(settings.INGEST_PROCESSING_DIR, base_name))
+ return base_name
+
+def move_ocr_file(ingest, file_path):
+ """ Move OCR file to where it belongs.
+
+ :param ingest: Ingest object
+ :type ingest: _type_
+ :param file_path: Absolute path of tmp file
+ :type file_path: str
+ """
+ base_name = os.path.basename(file_path)
+ if ingest.manifest.pid not in base_name:
+ base_name = f'{ingest.manifest.pid}_{base_name}'
+ move(file_path, os.path.join(ingest.ocr_directory, base_name))
+
+def upload_trigger_file(trigger_file):
+ """
+ Upload trigger file to S3. The file contains a list of images being ingested.
+ The file will be picked up by an AWS lambda function and the images will be
+ converted to ptiffs.
+
+ :param trigger_file: Absolute path to trigger file.
+ :type trigger_file: str
+ """
+ s3 = resource('s3')
+ s3.Bucket(settings.INGEST_TRIGGER_BUCKET).upload_file(trigger_file, os.path.basename(trigger_file))
+
+def canvas_dimensions(image_name):
+ """Get canvas dimensions
+
+ :param image_name: File name without extension of image file.
+ :type image_name: str
+ :return: 2-tuple containing width and height (in pixels)
+ :rtype: tuple
+ """
+ original_image = [img for img in os.listdir(settings.INGEST_PROCESSING_DIR) if img.startswith(image_name)]
+ if len(original_image) > 0:
+ return Image.open(os.path.join(settings.INGEST_PROCESSING_DIR, original_image[0])).size
+ return (0,0)
diff --git a/readux_ingest_ecds/services/iiif_services.py b/readux_ingest_ecds/services/iiif_services.py
new file mode 100644
index 0000000..70aa6f1
--- /dev/null
+++ b/readux_ingest_ecds/services/iiif_services.py
@@ -0,0 +1,43 @@
+""" Module of service methods for IIIF objects. """
+from readux_ingest_ecds.helpers import get_iiif_models
+
+Manifest = get_iiif_models()['Manifest']
+RelatedLink = get_iiif_models()['RelatedLink']
+OCR = get_iiif_models()['OCR']
+
+def create_manifest(ingest):
+ """
+ Create or update a Manifest from supplied metadata and images.
+ :return: New or updated Manifest with supplied `pid`
+ :rtype: iiif.manifest.models.Manifest
+ """
+ Manifest = get_iiif_models()['Manifest']
+ manifest = None
+ # Make a copy of the metadata so we don't extract it over and over.
+ try:
+ if not bool(ingest.manifest) or ingest.manifest is None:
+ ingest.open_metadata()
+
+ metadata = dict(ingest.metadata)
+ except TypeError:
+ metadata = None
+ if metadata:
+ if 'pid' in metadata:
+ manifest, created = Manifest.objects.get_or_create(pid=metadata['pid'].replace('_', '-'))
+ else:
+ manifest = Manifest.objects.create()
+ for (key, value) in metadata.items():
+ setattr(manifest, key, value)
+ else:
+ manifest = Manifest()
+
+ manifest.image_server = ingest.image_server
+
+ # Ensure that manifest has an ID before updating the M2M relationship
+ manifest.save()
+ manifest.refresh_from_db()
+ manifest.collections.set(ingest.collections.all())
+ # Save again once relationship is set
+ manifest.save()
+
+ return manifest
diff --git a/readux_ingest_ecds/services/metadata_services.py b/readux_ingest_ecds/services/metadata_services.py
new file mode 100644
index 0000000..9b73398
--- /dev/null
+++ b/readux_ingest_ecds/services/metadata_services.py
@@ -0,0 +1,96 @@
+""" Module of service methods for ingest metadata. """
+from readux_ingest_ecds.helpers import get_iiif_models
+from mimetypes import guess_type
+from tablib.core import Dataset
+
+Manifest = get_iiif_models()['Manifest']
+RelatedLink = get_iiif_models()['RelatedLink']
+
+def clean_metadata(metadata):
+ """Remove keys that do not align with Manifest fields.
+
+ :param metadata:
+ :type metadata: tablib.Dataset
+ :return: Dictionary with keys matching Manifest fields
+ :rtype: dict
+ """
+ metadata = {key.casefold().replace(' ', '_'): value for key, value in metadata.items()}
+ fields = [f.name for f in get_iiif_models()['Manifest']._meta.get_fields()]
+ invalid_keys = []
+
+ for key in metadata.keys():
+ if key != 'metadata' and isinstance(metadata[key], list):
+ if isinstance(metadata[key][0], dict):
+ for meta_key in metadata[key][0].keys():
+ if 'value' in meta_key:
+ metadata[key] = metadata[key][0][meta_key]
+ else:
+ metadata[key] = ', '.join(metadata[key])
+ if key not in fields:
+ invalid_keys.append(key)
+
+ for invalid_key in invalid_keys:
+ metadata.pop(invalid_key)
+
+ return metadata
+
+def get_metadata_from(files):
+ """
+ Find metadata file in uploaded files.
+ :return: If metadata file exists, returns the values. If no file, returns None.
+ :rtype: list or None
+ """
+ metadata = None
+ for file in files:
+ if metadata is not None:
+ continue
+ if 'zip' in guess_type(file.name)[0]:
+ continue
+ if 'metadata' in file.name.casefold():
+ stream = file.read()
+ if 'csv' in guess_type(file.name)[0] or 'tab-separated' in guess_type(file.name)[0]:
+ metadata = Dataset().load(stream.decode('utf-8-sig'), format='csv').dict
+ else:
+ metadata = Dataset().load(stream).dict
+ return metadata
+
+def metadata_from_file(metadata_file):
+ format = metadata_file_format(metadata_file)
+ if format is None:
+ return
+
+ metadata = None
+
+ if format == 'excel':
+ with open(metadata_file, 'rb') as fh:
+ metadata = Dataset().load(fh.read(), format=metadata_file.split('.')[-1])
+ else:
+ with open(metadata_file, 'r', encoding="utf-8-sig") as fh:
+ metadata = Dataset().load(fh.read(), format=format)
+
+ if metadata is not None:
+ metadata = clean_metadata(metadata.dict[0])
+
+ return metadata
+
+def metadata_file_format(file_path):
+ """Get format used to read the metadata file
+
+ :param file_path: Name of metadata file
+ :type file_path: str
+ :return: Format of metadata file, csv, tsv, excel, or None
+ :rtype: str, None
+ """
+ if file_path is None:
+ return None
+
+ file_type = guess_type(file_path)[0]
+
+ if 'csv' in file_type:
+ return 'csv'
+ elif 'tab-separated' in file_type:
+ return 'tsv'
+ elif 'officedocument' in file_type:
+ return 'excel'
+
+ return None
diff --git a/readux_ingest_ecds/services/ocr_services.py b/readux_ingest_ecds/services/ocr_services.py
new file mode 100644
index 0000000..296da29
--- /dev/null
+++ b/readux_ingest_ecds/services/ocr_services.py
@@ -0,0 +1,476 @@
+import httpretty
+import json
+import csv
+import re
+import tempfile
+from os import environ, path, unlink, remove
+from io import BytesIO
+import logging
+from hocr_spec import HocrValidator
+from lxml import etree
+from django.conf import settings
+from django.core.serializers import deserialize
+from readux_ingest_ecds.helpers import get_iiif_models
+from .services import fetch_url  # FIXME: no sibling 'services' module exists in this package and fetch_url is not defined in the removed services.py — verify the intended import target
+
+LOGGER = logging.getLogger(__name__)
+OCR = get_iiif_models()['OCR']
+
+class IncludeQuotesDialect(csv.Dialect): # pylint: disable=too-few-public-methods
+ """Subclass of csv.Dialect to include the quote marks in OCR content."""
+ # include the quote marks in content
+ lineterminator = '\n'
+ delimiter = '\t'
+ quoting = csv.QUOTE_NONE # perform no special processing of quote characters
+
+class HocrValidationError(Exception):
+ """Exception for hOCR validation errors."""
+ pass # pylint: disable=unnecessary-pass
+
+def get_ocr(canvas):
+ """Function to determine method for fetching OCR for a canvas.
+
+ :param canvas: Canvas object
+ :type canvas: apps.iiif.canvases.models.Canvas
+ :return: List of dicts of parsed OCR data.
+ :rtype: list
+ """
+ if canvas.default_ocr == "line":
+ result = fetch_tei_ocr(canvas)
+ return parse_tei_ocr(result)
+
+ result = fetch_positional_ocr(canvas)
+ return add_positional_ocr(canvas, result)
+
+def fetch_tei_ocr(canvas):
+ """Function to fetch TEI OCR data for a given canvas.
+
+ :param canvas: Canvas object
+ :type canvas: apps.iiif.canvases.models.Canvas
+ :return: Positional OCR data
+ :rtype: requests.models.Response
+ """
+ if 'archivelab' in canvas.manifest.image_server.server_base:
+ return None
+ url = "{p}{c}/datastreams/tei/content".format(
+ p=settings.DATASTREAM_PREFIX,
+ c=canvas.pid.replace('fedora:', '')
+ )
+
+ return fetch_url(url, data_format='text/plain')
+
+def fetch_positional_ocr(canvas):
+ """Function to get OCR for a canvas depending on the image's source.
+
+ :param canvas: Canvas object
+ :type canvas: apps.iiif.canvases.models.Canvas
+ :return: Positional OCR data
+ :rtype: requests.models.Response
+ """
+ if 'archivelab' in canvas.manifest.image_server.server_base:
+ if '$' in canvas.pid:
+ pid = str(int(canvas.pid.split('$')[-1]) - canvas.ocr_offset)
+ else:
+ pid = canvas.pid
+
+ url = f"https://api.archivelab.org/books/{canvas.manifest.pid}/pages/{pid}/ocr?mode=words"
+
+ if environ['DJANGO_ENV'] == 'test':
+ fake_ocr = open(path.join(settings.FIXTURE_DIR, 'ocr_words.json'))
+ words = fake_ocr.read()
+ httpretty.enable()
+ httpretty.register_uri(httpretty.GET, url, body=words)
+
+ return fetch_url(url)
+
+ if 'images.readux.ecds.emory' in canvas.manifest.image_server.server_base:
+ # Fake TSV data for testing.
+ if environ['DJANGO_ENV'] == 'test':
+ fake_tsv = open(path.join(settings.FIXTURE_DIR, 'sample.tsv'))
+ tsv = fake_tsv.read()
+ url = "https://raw.githubusercontent.com/ecds/ocr-bucket/master/{m}/boo.tsv".format(
+ m=canvas.manifest.pid
+ )
+ httpretty.enable()
+ httpretty.register_uri(httpretty.GET, url, body=tsv)
+
+ if canvas.ocr_file_path is None:
+ return fetch_url(
+ "https://raw.githubusercontent.com/ecds/ocr-bucket/master/{m}/{p}.tsv".format(
+ m=canvas.manifest.pid,
+ p=canvas.pid.split('_')[-1]
+ .replace('.jp2', '')
+ .replace('.jpg', '')
+ .replace('.tif', '')
+ ),
+ data_format='text'
+ )
+
+ url = "{p}{c}{s}".format(
+ p=settings.DATASTREAM_PREFIX,
+ c=canvas.pid.replace('fedora:', ''),
+ s=settings.DATASTREAM_SUFFIX
+ )
+
+ if (
+ environ['DJANGO_ENV'] == 'test'
+ and 'images.readux.ecds.emory' not in canvas.manifest.image_server.server_base
+ and canvas.ocr_file_path is None
+ ):
+ fake_json = open(path.join(settings.FIXTURE_DIR, 'ocr_words.json'))
+ words = fake_json.read()
+ httpretty.enable(allow_net_connect=True)
+ httpretty.register_uri(httpretty.GET, url, body=words)
+
+ if canvas.ocr_file_path is not None:
+ if canvas.image_server.storage_service == 's3':
+ return canvas.image_server.bucket.Object(canvas.ocr_file_path).get()['Body'].read()
+
+ return fetch_url(url, data_format='text/plain')
+
+def parse_alto_ocr(result):
+ """Function to parse fetched ALTO OCR data for a given canvas.
+
+ :param result: Fetched ALTO OCR data
+ :type result: requests.models.Response
+ :return: Parsed OCR data
+ :rtype: list
+ """
+ if result is None:
+ return None
+ ocr = []
+ unvalidated_root = etree.fromstring(result)
+ if 'ns-v2' in unvalidated_root.tag:
+ schema_file = 'xml_schema/alto-2-1.xsd'
+ elif 'ns-v3' in unvalidated_root.tag:
+ schema_file = 'xml_schema/alto-3-1.xsd'
+ elif 'ns-v4' in unvalidated_root.tag:
+ schema_file = 'xml_schema/alto-4-2.xsd'
+ else:
+ schema_file = 'xml_schema/alto-1-4.xsd'
+ parser = etree.XMLParser(schema = etree.XMLSchema(file=schema_file))
+ # The following will raise etree.XMLSyntaxError if invalid
+ root = etree.fromstring(result, parser=parser)
+ strings = root.findall('.//String')
+ if not strings:
+ strings = root.findall('.//{*}String')
+ for string in strings:
+ attrib = {k.lower(): v for k, v in string.attrib.items()}
+ ocr.append({
+ 'content': attrib['content'],
+ 'h': int(attrib['height']),
+ 'w': int(attrib['width']),
+ 'x': int(attrib['hpos']),
+ 'y': int(attrib['vpos'])
+ })
+ if ocr:
+ return ocr
+ return None
+
+def parse_hocr_ocr(result):
+ """Function to parse fetched hOCR data for a given canvas.
+
+ :param result: Fetched hOCR data
+ :type result: requests.models.Response
+ :return: Parsed OCR data
+ :rtype: list
+ """
+ if isinstance(result, bytes):
+ as_string = result.decode('utf-8')
+ else:
+ as_string = str(result)
+ # Regex to ignore x_size, x_ascenders, x_descenders. this is a known issue with
+ # tesseract produced hOCR: https://github.com/tesseract-ocr/tesseract/issues/3303
+ result_without_invalid = re.sub(
+ r'([ ;]+)(x_size [0-9\.\-;]+)|( x_descenders [0-9\.\-;]+)|( x_ascenders [0-9\.\-;]+)',
+ repl='', string=as_string
+ )
+ file_like_hocr = BytesIO(result_without_invalid.encode('utf-8'))
+ with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+ file_like_hocr.seek(0)
+ tmp_file.write(file_like_hocr.read())
+ tmp_file.flush()
+ temp_file_name = tmp_file.name
+ validator = HocrValidator(profile='relaxed')
+ report = validator.validate(source=temp_file_name)
+ is_valid = report.format('bool')
+ if not is_valid:
+ report_text = report.format('text')
+ unlink(temp_file_name)
+ raise HocrValidationError(str(report_text))
+ unlink(temp_file_name)
+ ocr = []
+ file_like_hocr.seek(0)
+ tree = etree.parse(file_like_hocr)
+ words = tree.findall(".//span[@class]")
+ if not words:
+ words = tree.findall(".//{*}span[@class]")
+ for word in words:
+ if word.attrib['class'] == 'ocrx_word':
+ all_attrs = word.attrib['title'].split(';')
+ bbox = next((attrib for attrib in all_attrs if 'bbox' in attrib), '')
+ # Splitting 'bbox x0 y0 x1 y1'
+ bbox_attrs = bbox.split(' ')
+ if len(bbox_attrs) == 5:
+ ocr.append({
+ 'content': word.text,
+ 'h': int(bbox_attrs[4]) - int(bbox_attrs[2]),
+ 'w': int(bbox_attrs[3]) - int(bbox_attrs[1]),
+ 'x': int(bbox_attrs[1]),
+ 'y': int(bbox_attrs[2])
+ })
+ if ocr:
+ return ocr
+ return None
+
+def parse_dict_ocr(result):
+ """Function to parse dict or JSON OCR data.
+
+ :param result: Fetched dict OCR data
+ :type result: requests.models.Response
+ :return: Parsed OCR data
+ :rtype: list
+ """
+ ocr = []
+ if isinstance(result, bytes):
+ as_string = result.decode('utf-8')
+ as_dict = json.loads(as_string)
+ elif isinstance(result, str):
+ as_dict = json.loads(result)
+ else:
+ as_dict = result
+ if 'ocr' in as_dict and as_dict['ocr'] is not None:
+ for index, word in enumerate(as_dict['ocr']): # pylint: disable=unused-variable
+ if len(word) > 0:
+ for w in word:
+ ocr.append({
+ 'content': w[0],
+ 'w': (w[1][2] - w[1][0]),
+ 'h': (w[1][1] - w[1][3]),
+ 'x': w[1][0],
+ 'y': w[1][3],
+ })
+ if ocr:
+ return ocr
+ return None
+
def parse_tei_ocr(result):
    """Parse fetched TEI OCR data for a given canvas.

    :param result: Fetched TEI OCR data
    :type result: requests.models.Response
    :return: Parsed OCR data, or None when nothing was found
    :rtype: list or None
    """
    if result is None:
        return None
    # Validate against the TEI schema while parsing; etree raises
    # XMLSyntaxError when the document is invalid.
    tei_parser = etree.XMLParser(schema=etree.XMLSchema(file='xml_schema/tei_all.xsd'))
    document = etree.fromstring(result, parser=tei_parser)
    # The surface element is the first child of the last top-level node.
    surface = document[-1][0]
    parsed = []
    for zone in surface:
        if 'zone' not in zone.tag:
            continue
        for text_line in zone:
            parsed.append({
                'content': text_line[-1].text,
                'h': int(text_line.get('lry')) - int(text_line.get('uly')),
                'w': int(text_line.get('lrx')) - int(text_line.get('ulx')),
                'x': int(text_line.get('ulx')),
                'y': int(text_line.get('uly'))
            })
    return parsed if parsed else None
+
def parse_tsv_ocr(result):
    """Parse fetched TSV OCR data for a given canvas.

    :param result: Fetched TSV OCR data
    :type result: bytes or str
    :return: Parsed OCR data, or None when nothing was found
    :rtype: list or None
    """
    if isinstance(result, bytes):
        rows = result.decode('utf-8').splitlines()
    else:
        rows = str(result).split('\n')

    # The TSV sometimes carries stray tabs at the start and end of a row.
    # Strip them, but restore the (possibly empty) "content" column the
    # strip may have eaten: a well-formed row has four tab separators.
    for index, row in enumerate(rows):
        cleaned = row.strip()
        if cleaned.count('\t') == 3:
            cleaned = ' \t' + cleaned
        rows[index] = cleaned

    reader = csv.DictReader(rows, dialect=IncludeQuotesDialect)

    words = []
    for entry in reader:
        words.append({
            'content': entry['content'],
            'w': int(entry['w']),
            'h': int(entry['h']),
            'x': int(entry['x']),
            'y': int(entry['y']),
        })
    return words if words else None
+
def parse_fedora_ocr(result):
    """Parse fetched Fedora OCR data for a given canvas.

    Each row is CRLF-terminated and holds five tab-separated fields:
    ``x`` ``y`` ``w`` ``h`` ``content``. Rows with any other field
    count are skipped.

    :param result: Fetched Fedora OCR data
    :type result: bytes
    :return: List of word dicts with content/w/h/x/y keys (possibly empty)
    :rtype: list
    """
    ocr = []
    if isinstance(result, bytes):
        # What comes back from Fedora is 8-bit bytes, possibly BOM-prefixed,
        # so decode with UTF-8-sig.
        for row in result.decode('UTF-8-sig').strip().split('\r\n'):
            fields = row.split('\t')  # split once instead of five times
            if len(fields) == 5:
                x, y, w, h, content = fields
                ocr.append({
                    'content': content,
                    'w': int(w),
                    'h': int(h),
                    'x': int(x),
                    'y': int(y)
                })
    return ocr
+
def parse_xml_ocr(result):
    """Determine the flavor of XML OCR (ALTO, TEI, or hOCR) and parse it.

    :param result: Fetched XML OCR data
    :type result: str or bytes
    :return: Parsed OCR data, or None when the flavor is unrecognized
    :rtype: list or None
    """
    root = etree.fromstring(result)
    # The root may have no descendants at all; guard before touching .tag,
    # which previously raised AttributeError on childless documents.
    first_descendant = root.find('.//*')
    if (
        re.match(r'{[0-9A-Za-z.:/#-]+}alto|alto', root.tag)
        or (
            first_descendant is not None
            and 'www.loc.gov/standards/alto' in first_descendant.tag
        )
    ):
        return parse_alto_ocr(result)
    if root.find('.//teiHeader') is not None or root.find('.//{*}teiHeader') is not None:
        return parse_tei_ocr(result)
    if root.find('.//div') is not None or root.find('.//{*}div') is not None:
        # Fallback to hOCR if it looks like XHTML
        return parse_hocr_ocr(result)
    return None
+
def add_ocr_annotations(canvas, ocr):
    """Create and save an OCR annotation on the canvas for each parsed word.

    :param canvas: Canvas the annotations belong to
    :param ocr: Parsed OCR data (list of dicts with content/x/y/w/h keys)
    """
    order = 1
    for word in ocr:
        # Guard against a header row that slipped through the parsers.
        if word['x'] == 'x':
            continue

        # Normalize missing/blank content to a single space. The word dict
        # is mutated in place, matching the established contract.
        missing_content = (
            word == ''
            or 'content' not in word
            or not word['content']
            or word['content'].isspace()
        )
        if missing_content:
            word['content'] = ' '

        annotation = OCR()
        annotation.canvas = canvas
        annotation.x = word['x']
        annotation.y = word['y']
        annotation.w = word['w']
        annotation.h = word['h']
        annotation.resource_type = annotation.OCR
        annotation.content = word['content']
        annotation.order = order
        annotation.save()
        order += 1
+
def add_oa_annotations(annotation_list_url):
    """Fetch an OA (Open Annotation) annotation list and persist each resource.

    :param annotation_list_url: URL of the annotation list to fetch
    :type annotation_list_url: str
    """
    # fetch_url defaults to JSON, so `data` is the decoded annotation list.
    data = fetch_url(annotation_list_url)
    for oa_annotation in data['resources']:
        # NOTE(review): assumes `deserialize` returns a saveable model
        # instance — confirm against the serializer registration.
        anno = deserialize('annotation', oa_annotation)
        anno.save()
+
def add_positional_ocr(canvas, result):
    """Disambiguate and parse fetched OCR data for a canvas.

    Dispatches on the canvas's OCR file extension when one is known;
    otherwise sniffs the payload (JSON, plain TSV, or Fedora-style TSV
    whose BOM makes UTF-8 and UTF-8-sig decodes differ).

    :param canvas: Canvas object
    :type canvas: apps.iiif.canvases.models.Canvas
    :param result: Previously fetched OCR data
    :type result: bytes or str or dict or None
    :return: List of dicts of parsed OCR data, or None when empty/unrecognized
    :rtype: list or None
    """
    if result is None:
        return None
    # Initialize so an unrecognized format falls through to `return None`
    # instead of raising UnboundLocalError.
    ocr = None
    if canvas.ocr_file_path is None:
        if isinstance(result, dict) or is_json(result):
            ocr = parse_dict_ocr(result)
        elif is_tsv(result) and isinstance(result, bytes):
            # A BOM-prefixed payload decodes differently with UTF-8-sig,
            # which marks it as Fedora-style OCR.
            if result.decode('utf-8') == result.decode('UTF-8-sig'):
                ocr = parse_tsv_ocr(result)
            else:
                ocr = parse_fedora_ocr(result)
        elif is_tsv(result):
            ocr = parse_tsv_ocr(result)
    elif canvas.ocr_file_path.endswith('.json'):
        ocr = parse_dict_ocr(result)
    elif canvas.ocr_file_path.endswith(('.tsv', '.tab')):
        ocr = parse_tsv_ocr(result)
    elif canvas.ocr_file_path.endswith('.xml'):
        ocr = parse_xml_ocr(result)
    elif canvas.ocr_file_path.endswith('.hocr'):
        ocr = parse_hocr_ocr(result)
    if ocr:
        return ocr
    return None
+
def is_json(to_test):
    """Test whether data is shaped like JSON.

    :param to_test: String or bytes to sniff
    :type to_test: str or bytes
    :return: True if parseable as JSON, False if not
    :rtype: bool
    """
    try:
        # Decode inside the try: bytes that are not valid UTF-8 previously
        # escaped as UnicodeDecodeError (a ValueError subclass) — they
        # cannot be JSON, so report False instead of raising.
        if isinstance(to_test, bytes):
            as_str = to_test.decode('utf-8')
        else:
            as_str = str(to_test)
        json.loads(as_str)
    except ValueError:
        return False
    return True
+
def is_tsv(to_test):
    """Test whether data is shaped like a TSV.

    :param to_test: String or bytes to sniff
    :type to_test: str or bytes
    :return: True if shaped like a TSV, False if not
    :rtype: bool
    """
    if isinstance(to_test, bytes):
        text = to_test.decode('utf-8')
        rows = text.splitlines()
    else:
        text = str(to_test)
        rows = text.split('\n')
    # TSV-shaped means at least two lines and at least one tab anywhere.
    return len(rows) > 1 and '\t' in text
diff --git a/readux_ingest_ecds/services/services.py b/readux_ingest_ecds/services/services.py
new file mode 100644
index 0000000..c15f966
--- /dev/null
+++ b/readux_ingest_ecds/services/services.py
@@ -0,0 +1,40 @@
+""" Utility functions for fetching remote data. """
+import json
+import logging
+import requests
+
+logger = logging.getLogger(__name__)
+logging.getLogger("urllib3").setLevel(logging.ERROR)
+
def fetch_url(url, timeout=30, data_format='json', verbosity=1):
    """Fetch remote data from ``url``.

    :param url: URL to fetch
    :type url: str
    :param timeout: Seconds before the request times out, defaults to 30
    :type timeout: int
    :param data_format: 'json' or 'text'; anything else yields raw bytes
    :type data_format: str
    :param verbosity: Warnings are logged only when verbosity > 2
    :type verbosity: int
    :return: Decoded JSON, text, raw bytes, or None on any failure
    """
    data = None
    try:
        resp = requests.get(url, timeout=timeout, verify=True)
    except requests.exceptions.Timeout:
        if verbosity > 2:
            logger.warning('Connection timeout for {}'.format(url))
        return data
    except Exception as err:  # pylint: disable=broad-except
        # Best-effort fetch: any other connection failure yields None.
        if verbosity > 2:
            logger.warning('Connection failed for {}. ({})'.format(url, str(err)))
        return data

    if resp.status_code != 200:
        if verbosity > 2:
            logger.warning('Connection failed status {}. ({})'.format(url, resp.status_code))
        return data

    if data_format == 'json':
        try:
            data = resp.json()
        except json.decoder.JSONDecodeError:
            # Success status but unparseable body; leave data as None.
            if verbosity > 2:
                logger.warning('Server sent success status with bad content {}'.format(url))
        return data

    if data_format == 'text':
        data = resp.text
    else:
        data = resp.content
    return data
diff --git a/readux_ingest_ecds/tasks.py b/readux_ingest_ecds/tasks.py
index 43e194b..c5b5267 100644
--- a/readux_ingest_ecds/tasks.py
+++ b/readux_ingest_ecds/tasks.py
@@ -1,18 +1,18 @@
# pylint: disable = unused-argument
""" Common tasks for ingest. """
+import os
from celery import Celery
from django.apps import apps
from django.conf import settings
from .helpers import get_iiif_models
+from .services.ocr_services import get_ocr, add_ocr_annotations
# Use `apps.get_model` to avoid circular import error. Because the parameters used to
# create a background task have to be serializable, we can't just pass in the model object.
Local = apps.get_model('readux_ingest_ecds.local') # pylint: disable = invalid-name
-# Remote = apps.get_model('ingest.remote')
-# S3Ingest = apps.get_model('ingest.S3Ingest')
-Manifest = get_iiif_models()['Manifest']
+Manifest = get_iiif_models()['Manifest']
Canvas = get_iiif_models()['Canvas']
app = Celery('readux_ingest_ecds', result_extended=True)
@@ -29,3 +29,19 @@ def local_ingest_task_ecds(ingest_id):
"""
local_ingest = Local.objects.get(pk=ingest_id)
local_ingest.ingest()
+ if os.environ["DJANGO_ENV"] != 'test': # pragma: no cover
+ add_ocr_task.delay(local_ingest.manifest.pk)
+ else:
+ add_ocr_task(local_ingest.manifest.pk)
+
+
@app.task(name='adding_ocr_to_canvas', autoretry_for=(Manifest.DoesNotExist,), retry_backoff=5)
def add_ocr_task(manifest_id, *args, **kwargs):
    """Parse and add OCR annotations to every canvas in a manifest.

    Auto-retries with backoff on ``Manifest.DoesNotExist`` — presumably the
    manifest row may not yet be committed when the task first fires; confirm
    against the ingest flow.

    :param manifest_id: Primary key of the Manifest to process
    """
    manifest = Manifest.objects.get(pk=manifest_id)
    for canvas in manifest.canvas_set.all():
        ocr = get_ocr(canvas)

        if ocr is not None:
            add_ocr_annotations(canvas, ocr)
        # Saved unconditionally so the canvas is reindexed either way.
        canvas.save() # trigger reindex
diff --git a/setup.cfg b/setup.cfg
index 7faa55e..939072d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,3 +36,4 @@ install_requires =
django-celery-results~=2.4.0
boto3
Pillow==9.4.0 # wagtail 4.2.4 depends on Pillow<10.0.0 and >=4.0.0
+ requests>=1.3.1
diff --git a/test_app/fixtures/00000002.tsv b/test_app/fixtures/00000002.tsv
new file mode 100644
index 0000000..791b3a0
--- /dev/null
+++ b/test_app/fixtures/00000002.tsv
@@ -0,0 +1,11 @@
+content x y w h
+Manuscript 939 561 745 247
+, 1698 577 63 232
+Archives 1787 578 554 243
+and 969 739 213 235
+Rare 1242 754 310 240
+Book 1608 775 300 239
+Library 1997 795 450 249
+F 1516 1182 22 90
+EMORY 829 2728 560 161
+UNIVERSITY 1427 2748 971 173
diff --git a/test_app/fixtures/__init__.py b/test_app/fixtures/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test_app/fixtures/alto.xml b/test_app/fixtures/alto.xml
new file mode 100755
index 0000000..482eddd
--- /dev/null
+++ b/test_app/fixtures/alto.xml
@@ -0,0 +1,41 @@
+
+
+ + MAGNA + CAMPI + MARTII + β + + + ICHNOGRAPHIA + DESCRIPTA + SV + NT + +
++ + MAGNA + CAMPI + MARTII + β + + + ICHNOGRAPHIA + DESCRIPTA + SV + NT + +
+Abbyy file derived from OCR of