diff --git a/LICENSE b/LICENSE index 261eeb9..3e4a79b 100644 --- a/LICENSE +++ b/LICENSE @@ -66,7 +66,7 @@ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, + copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. diff --git a/MANIFEST.in b/MANIFEST.in index 4927edd..0ae76ad 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include LICENSE include README.md recursive-include readux_ingest_ecds/templates * +recursive-include readux_ingest_ecds/services * prune test* diff --git a/README.md b/README.md index 8a9d903..6d2b81f 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ python manage.py migrate readux_ingest_ecds | IIIF_RELATED_LINK_MODEL | Model reference, eg. 'iiif.RelatedLink' | | IIIF_CANVAS_MODEL | Model reference, eg. 'iiif.Canvas' | | IIIF_COLLECTION_MODEL | Model reference, eg. 'iiif.Collection' | +| IIIF_OCR_MODEL | Model reference, eg. 'iiif.OCR' | | INGEST_TMP_DIR | Absolute path where files will be temporarily stored. | | INGEST_PROCESSING_DIR | Absolute path where Lambda will look for images. | | INGEST_OCR_DIR | Absolute path where OCR files will be preserved. 
| diff --git a/readux_ingest_ecds/admin.py b/readux_ingest_ecds/admin.py index 35d3af4..6a847e6 100644 --- a/readux_ingest_ecds/admin.py +++ b/readux_ingest_ecds/admin.py @@ -1,10 +1,7 @@ import os import logging from django.contrib import admin -from django.urls import reverse -from django.utils.html import format_html from django.shortcuts import redirect -from django_celery_results.models import TaskResult from .models import Local from .tasks import local_ingest_task_ecds @@ -18,15 +15,15 @@ class LocalAdmin(admin.ModelAdmin): def save_model(self, request, obj, form, change): LOGGER.info(f'INGEST: Local ingest started by {request.user.username}') obj.creator = request.user - obj.process() + obj.prep() super().save_model(request, obj, form, change) - - def response_add(self, request, obj, post_url_continue=None): - obj.refresh_from_db() if os.environ["DJANGO_ENV"] != 'test': # pragma: no cover local_ingest_task_ecds.apply_async(args=[obj.id]) else: local_ingest_task_ecds(obj.id) + + def response_add(self, request, obj, post_url_continue=None): + obj.refresh_from_db() LOGGER.info(f'INGEST: Local ingest - {obj.id} - added for {obj.manifest.pid}') return redirect('/admin/manifests/manifest/{m}/change/'.format(m=obj.manifest.pk)) diff --git a/readux_ingest_ecds/helpers.py b/readux_ingest_ecds/helpers.py index d45f0e3..2249663 100644 --- a/readux_ingest_ecds/helpers.py +++ b/readux_ingest_ecds/helpers.py @@ -10,6 +10,7 @@ def get_iiif_models(): 'RelatedLink': apps.get_model(settings.IIIF_RELATED_LINK_MODEL), 'Canvas': apps.get_model(settings.IIIF_CANVAS_MODEL), 'Collection': apps.get_model(settings.IIIF_COLLECTION_MODEL), + 'OCR': apps.get_model(settings.IIIF_OCR_MODEL), } except AppRegistryNotReady: return { @@ -18,4 +19,5 @@ def get_iiif_models(): 'RelatedLink': settings.IIIF_RELATED_LINK_MODEL, 'Canvas': settings.IIIF_CANVAS_MODEL, 'Collection': settings.IIIF_COLLECTION_MODEL, + 'OCR': settings.IIIF_OCR_MODEL, } diff --git a/readux_ingest_ecds/models.py 
b/readux_ingest_ecds/models.py index dd1897c..bbf7d10 100644 --- a/readux_ingest_ecds/models.py +++ b/readux_ingest_ecds/models.py @@ -4,7 +4,9 @@ from django.core.files.storage import FileSystemStorage from django.db import models from django.conf import settings -from .services import is_image, is_ocr, is_junk, metadata_from_file, create_manifest, move_image_file, move_ocr_file, canvas_dimensions, upload_trigger_file +from .services.file_services import is_image, is_ocr, is_junk, move_image_file, move_ocr_file, canvas_dimensions, upload_trigger_file +from .services.iiif_services import create_manifest +from .services.metadata_services import metadata_from_file from .helpers import get_iiif_models Manifest = get_iiif_models()['Manifest'] @@ -74,12 +76,13 @@ def ocr_directory(self): def trigger_file(self): return os.path.join(settings.INGEST_TMP_DIR, f'{self.manifest.pid}.txt') - def process(self): + def prep(self): """ Open metadata Create manifest Unzip bundle """ + LOGGER.info(f'INGEST: Local ingest - preparing new local ingest') os.makedirs(settings.INGEST_TMP_DIR, exist_ok=True) os.makedirs(settings.INGEST_PROCESSING_DIR, exist_ok=True) os.makedirs(settings.INGEST_OCR_DIR, exist_ok=True) diff --git a/readux_ingest_ecds/services.py b/readux_ingest_ecds/services.py deleted file mode 100644 index 98ad3ad..0000000 --- a/readux_ingest_ecds/services.py +++ /dev/null @@ -1,353 +0,0 @@ -""" Module of service classes and methods for ingest. """ -import itertools -import os -from shutil import move -from PIL import Image -from boto3 import resource -from tablib.core import Dataset -from mimetypes import guess_type -from urllib.parse import unquote, urlparse - -from django.conf import settings - -from .helpers import get_iiif_models - -Manifest = get_iiif_models()['Manifest'] -RelatedLink = get_iiif_models()['RelatedLink'] - -def clean_metadata(metadata): - """Remove keys that do not align with Manifest fields. 
- - :param metadata: - :type metadata: tablib.Dataset - :return: Dictionary with keys matching Manifest fields - :rtype: dict - """ - metadata = {key.casefold().replace(' ', '_'): value for key, value in metadata.items()} - fields = [f.name for f in get_iiif_models()['Manifest']._meta.get_fields()] - invalid_keys = [] - - for key in metadata.keys(): - if key != 'metadata' and isinstance(metadata[key], list): - if isinstance(metadata[key][0], dict): - for meta_key in metadata[key][0].keys(): - if 'value' in meta_key: - metadata[key] = metadata[key][0][meta_key] - else: - metadata[key] = ', '.join(metadata[key]) - if key not in fields: - invalid_keys.append(key) - - for invalid_key in invalid_keys: - metadata.pop(invalid_key) - - return metadata - -def create_manifest(ingest): - """ - Create or update a Manifest from supplied metadata and images. - :return: New or updated Manifest with supplied `pid` - :rtype: iiif.manifest.models.Manifest - """ - Manifest = get_iiif_models()['Manifest'] - manifest = None - # Make a copy of the metadata so we don't extract it over and over. - try: - if not bool(ingest.manifest) or ingest.manifest is None: - ingest.open_metadata() - - metadata = dict(ingest.metadata) - except TypeError: - metadata = None - if metadata: - if 'pid' in metadata: - manifest, created = Manifest.objects.get_or_create(pid=metadata['pid'].replace('_', '-')) - else: - manifest = Manifest.objects.create() - for (key, value) in metadata.items(): - setattr(manifest, key, value) - else: - manifest = Manifest() - - manifest.image_server = ingest.image_server - - # This was giving me a 'django.core.exceptions.AppRegistryNotReady: Models aren't loaded yet' error. 
- # Remote = apps.get_model('ingest.remote') - - # Ensure that manifest has an ID before updating the M2M relationship - manifest.save() - # if not isinstance(ingest, Remote): - manifest.refresh_from_db() - manifest.collections.set(ingest.collections.all()) - # Save again once relationship is set - manifest.save() - - # if type(ingest, .models.Remote): - # if isinstance(ingest, Remote): - # RelatedLink( - # manifest=manifest, - # link=ingest.remote_url, - # format='application/ld+json' - # ).save() - - return manifest - -def extract_image_server(canvas): - """Determines the IIIF image server URL for a given IIIF Canvas - - :param canvas: IIIF Canvas - :type canvas: dict - :return: IIIF image server URL - :rtype: str - """ - url = urlparse(canvas['images'][0]['resource']['service']['@id']) - parts = url.path.split('/') - parts.pop() - base_path = '/'.join(parts) - host = url.hostname - if url.port is not None: - host = '{h}:{p}'.format(h=url.hostname, p=url.port) - return '{s}://{h}{p}'.format(s=url.scheme, h=host, p=base_path) - -def parse_iiif_v2_manifest(data): - """Parse IIIF Manifest based on v2.1.1 or the presentation API. - https://iiif.io/api/presentation/2.1 - - :param data: IIIF Presentation v2.1.1 manifest - :type data: dict - :return: Extracted metadata - :rtype: dict - """ - properties = {} - manifest_data = [] - - if 'metadata' in data: - manifest_data.append({ 'metadata': data['metadata'] }) - - for iiif_metadata in [{prop['label']: prop['value']} for prop in data['metadata']]: - properties.update(iiif_metadata) - - # Sometimes, the label appears as a list. 
- if 'label' in data.keys() and isinstance(data['label'], list): - data['label'] = ' '.join(data['label']) - - manifest_data.extend([{prop: data[prop]} for prop in data if isinstance(data[prop], str)]) - - for datum in manifest_data: - properties.update(datum) - - uri = urlparse(data['@id']) - - if not uri.query: - properties['pid'] = uri.path.split('/')[-2] - else: - properties['pid'] = uri.query - - if 'description' in data.keys(): - if isinstance(data['description'], list): - if isinstance(data['description'][0], dict): - en = [lang['@value'] for lang in data['description'] if lang['@language'] == 'en'] - properties['summary'] = data['description'][0]['@value'] if not en else en[0] - else: - properties['summary'] = data['description'][0] - else: - properties['summary'] = data['description'] - - if 'logo' in properties: - properties['logo_url'] = properties['logo'] - properties.pop('logo') - - manifest_metadata = clean_metadata(properties) - - return manifest_metadata - -def parse_iiif_v2_canvas(canvas): - """ """ - canvas_id = canvas['@id'].split('/') - pid = canvas_id[-1] if canvas_id[-1] != 'canvas' else canvas_id[-2] - - service = urlparse(canvas['images'][0]['resource']['service']['@id']) - resource = unquote(service.path.split('/').pop()) - - summary = canvas['description'] if 'description' in canvas.keys() else '' - label = canvas['label'] if 'label' in canvas.keys() else '' - return { - 'pid': pid, - 'height': canvas['height'], - 'width': canvas['width'], - 'summary': summary, - 'label': label, - 'resource': resource - } - -def get_metadata_from(files): - """ - Find metadata file in uploaded files. - :return: If metadata file exists, returns the values. If no file, returns None. 
- :rtype: list or None - """ - metadata = None - for file in files: - if metadata is not None: - continue - if 'zip' in guess_type(file.name)[0]: - continue - if 'metadata' in file.name.casefold(): - stream = file.read() - if 'csv' in guess_type(file.name)[0] or 'tab-separated' in guess_type(file.name)[0]: - metadata = Dataset().load(stream.decode('utf-8-sig'), format='csv').dict - else: - metadata = Dataset().load(stream).dict - return metadata - -def metadata_from_file(metadata_file): - format = metadata_file_format(metadata_file) - if format is None: - return - - metadata = None - - if format == 'excel': - with open(metadata_file, 'rb') as fh: - metadata = Dataset().load(fh.read(), format=metadata_file.split('.')[-1]) - else: - with open(metadata_file, 'r', encoding="utf-8-sig") as fh: - metadata = Dataset().load(fh.read(), format=format) - - if metadata is not None: - metadata = clean_metadata(metadata.dict[0]) - - return metadata - -def get_associated_meta(all_metadata, file): - """ - Associate metadata with filename. - :return: If a matching filename is found, returns the row as dict, - with generated pid. Otherwise, returns {}. 
- :rtype: dict - """ - file_meta = {} - extless_filename = file.name[0:file.name.rindex('.')] - for meta_dict in all_metadata: - for key, val in meta_dict.items(): - if key.casefold() == 'filename': - metadata_found_filename = val - # Match filename column, case-sensitive, against filename - if metadata_found_filename and metadata_found_filename in (extless_filename, file.name): - file_meta = meta_dict - return file_meta - -def lowercase_first_line(iterator): - """Lowercase the first line of a text file (such as the header row of a CSV)""" - return itertools.chain( - # ignore unicode characters, set lowercase, and strip whitespace - [next(iterator).encode('ascii', 'ignore').decode().casefold().strip()], iterator - ) - -def is_image(file_path): - """Check if file is expected type for image files - - :param file_path: Name of file to check - :type file_path: str - :return: Bool if file type is an image. - :rtype: bool - """ - return file_path is not None and 'images' in file_path and 'image' in guess_type(file_path)[0] - -def is_ocr(file_path): - """Check if file is expected type for OCR files - - :param file_path: Name of file to check - :type file_path: str - :return: Bool if file type matches OCR file types. 
- :rtype: bool - """ - ocr_file_types = ['text', 'xml','json','html', 'hocr', 'tsv'] - return file_path is not None and 'ocr' in file_path and any(file_path.endswith(ocr_type) for ocr_type in ocr_file_types) - -def metadata_file_format(file_path): - """Get format used to read the metadata file - - :param file_path: Name of metadata file - :type file_path: str - :return: Format of metadata file, csv, tsv, excel, or None - :rtype: str, None - """ - if file_path is None: - return None - - file_type = guess_type(file_path)[0] - - if 'csv' in file_type: - return 'csv' - elif 'tab-separated' in file_type: - return 'tsv' - elif 'officedocument' in file_type: - return 'excel' - - return None - -def is_junk(file_path): - """Check if a file should be considered junk - - :param file_path: File name to check - :type file_path: str - :return: True if file name starts with special char - :rtype: bol - """ - return file_path.startswith('.') or file_path.startswith('~') or file_path.startswith('__') or file_path.endswith('/') or file_path == '' - -def move_image_file(ingest, file_path): - """ Move files to directory where they processed. - Add the Manifest pid to the file name if not already there. - - :param ingest: Ingest object - :type ingest: _type_ - :param file_path: Absolute path of tmp file - :type file_path: str - :return: File name file to be processed - :rtype: str - """ - base_name = os.path.basename(file_path) - if ingest.manifest.pid not in base_name: - base_name = f'{ingest.manifest.pid}_{base_name}' - move(file_path, os.path.join(settings.INGEST_PROCESSING_DIR, base_name)) - return base_name - -def move_ocr_file(ingest, file_path): - """ Move OCR file to where it belongs. 
- - :param ingest: Ingest object - :type ingest: _type_ - :param file_path: Absolute path of tmp file - :type file_path: str - """ - base_name = os.path.basename(file_path) - if ingest.manifest.pid not in base_name: - base_name = f'{ingest.manifest.pid}_{base_name}' - move(file_path, os.path.join(ingest.ocr_directory, base_name)) - -def upload_trigger_file(trigger_file): - """ - Upload trigger file to S3. The file contains a list of images being ingested. - The file will be picked up by an AWS lambda function and the images will be - converted to ptiffs. - - :param trigger_file: Absolute path to trigger file. - :type trigger_file: str - """ - s3 = resource('s3') - s3.Bucket(settings.INGEST_TRIGGER_BUCKET).upload_file(trigger_file, os.path.basename(trigger_file)) - -def canvas_dimensions(image_name): - """Get canvas dimensions - - :param image_name: File name without extension of image file. - :type image_name: str - :return: 2-tuple containing width and height (in pixels) - :rtype: tuple - """ - original_image = [img for img in os.listdir(settings.INGEST_PROCESSING_DIR) if img.startswith(image_name)] - if len(original_image) > 0: - return Image.open(os.path.join(settings.INGEST_PROCESSING_DIR, original_image[0])).size - return (0,0) diff --git a/readux_ingest_ecds/services/__init__.py b/readux_ingest_ecds/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/readux_ingest_ecds/services/file_services.py b/readux_ingest_ecds/services/file_services.py new file mode 100644 index 0000000..8618538 --- /dev/null +++ b/readux_ingest_ecds/services/file_services.py @@ -0,0 +1,99 @@ +""" Module of service methods for ingest files. 
""" +import os +from shutil import move +from PIL import Image +from boto3 import resource +from mimetypes import guess_type + +from django.conf import settings + +from readux_ingest_ecds.helpers import get_iiif_models + +Manifest = get_iiif_models()['Manifest'] +RelatedLink = get_iiif_models()['RelatedLink'] + +def is_image(file_path): + """Check if file is expected type for image files + + :param file_path: Name of file to check + :type file_path: str + :return: Bool if file type is an image. + :rtype: bool + """ + return file_path is not None and 'images' in file_path and 'image' in guess_type(file_path)[0] + +def is_ocr(file_path): + """Check if file is expected type for OCR files + + :param file_path: Name of file to check + :type file_path: str + :return: Bool if file type matches OCR file types. + :rtype: bool + """ + ocr_file_types = ['text', 'xml','json','html', 'hocr', 'tsv'] + return file_path is not None and 'ocr' in file_path and any(file_path.endswith(ocr_type) for ocr_type in ocr_file_types) + +def is_junk(file_path): + """Check if a file should be considered junk + + :param file_path: File name to check + :type file_path: str + :return: True if file name starts with special char + :rtype: bol + """ + return file_path.startswith('.') or file_path.startswith('~') or file_path.startswith('__') or file_path.endswith('/') or file_path == '' + +def move_image_file(ingest, file_path): + """ Move files to directory where they processed. + Add the Manifest pid to the file name if not already there. 
+ + :param ingest: Ingest object + :type ingest: _type_ + :param file_path: Absolute path of tmp file + :type file_path: str + :return: File name file to be processed + :rtype: str + """ + base_name = os.path.basename(file_path) + if ingest.manifest.pid not in base_name: + base_name = f'{ingest.manifest.pid}_{base_name}' + move(file_path, os.path.join(settings.INGEST_PROCESSING_DIR, base_name)) + return base_name + +def move_ocr_file(ingest, file_path): + """ Move OCR file to where it belongs. + + :param ingest: Ingest object + :type ingest: _type_ + :param file_path: Absolute path of tmp file + :type file_path: str + """ + base_name = os.path.basename(file_path) + if ingest.manifest.pid not in base_name: + base_name = f'{ingest.manifest.pid}_{base_name}' + move(file_path, os.path.join(ingest.ocr_directory, base_name)) + +def upload_trigger_file(trigger_file): + """ + Upload trigger file to S3. The file contains a list of images being ingested. + The file will be picked up by an AWS lambda function and the images will be + converted to ptiffs. + + :param trigger_file: Absolute path to trigger file. + :type trigger_file: str + """ + s3 = resource('s3') + s3.Bucket(settings.INGEST_TRIGGER_BUCKET).upload_file(trigger_file, os.path.basename(trigger_file)) + +def canvas_dimensions(image_name): + """Get canvas dimensions + + :param image_name: File name without extension of image file. 
+ :type image_name: str + :return: 2-tuple containing width and height (in pixels) + :rtype: tuple + """ + original_image = [img for img in os.listdir(settings.INGEST_PROCESSING_DIR) if img.startswith(image_name)] + if len(original_image) > 0: + return Image.open(os.path.join(settings.INGEST_PROCESSING_DIR, original_image[0])).size + return (0,0) diff --git a/readux_ingest_ecds/services/iiif_services.py b/readux_ingest_ecds/services/iiif_services.py new file mode 100644 index 0000000..70aa6f1 --- /dev/null +++ b/readux_ingest_ecds/services/iiif_services.py @@ -0,0 +1,43 @@ +""" Module of service methods for IIIF objects. """ +from readux_ingest_ecds.helpers import get_iiif_models + +Manifest = get_iiif_models()['Manifest'] +RelatedLink = get_iiif_models()['RelatedLink'] +OCR = get_iiif_models()['OCR'] + +def create_manifest(ingest): + """ + Create or update a Manifest from supplied metadata and images. + :return: New or updated Manifest with supplied `pid` + :rtype: iiif.manifest.models.Manifest + """ + Manifest = get_iiif_models()['Manifest'] + manifest = None + # Make a copy of the metadata so we don't extract it over and over. 
+ try: + if not bool(ingest.manifest) or ingest.manifest is None: + ingest.open_metadata() + + metadata = dict(ingest.metadata) + except TypeError: + metadata = None + if metadata: + if 'pid' in metadata: + manifest, created = Manifest.objects.get_or_create(pid=metadata['pid'].replace('_', '-')) + else: + manifest = Manifest.objects.create() + for (key, value) in metadata.items(): + setattr(manifest, key, value) + else: + manifest = Manifest() + + manifest.image_server = ingest.image_server + + # Ensure that manifest has an ID before updating the M2M relationship + manifest.save() + manifest.refresh_from_db() + manifest.collections.set(ingest.collections.all()) + # Save again once relationship is set + manifest.save() + + return manifest diff --git a/readux_ingest_ecds/services/metadata_services.py b/readux_ingest_ecds/services/metadata_services.py new file mode 100644 index 0000000..9b73398 --- /dev/null +++ b/readux_ingest_ecds/services/metadata_services.py @@ -0,0 +1,96 @@ +""" Module of service methods for ingest files. """ +from readux_ingest_ecds.helpers import get_iiif_models +from mimetypes import guess_type +from tablib.core import Dataset + +Manifest = get_iiif_models()['Manifest'] +RelatedLink = get_iiif_models()['RelatedLink'] + +def clean_metadata(metadata): + """Remove keys that do not align with Manifest fields. 
+ + :param metadata: + :type metadata: tablib.Dataset + :return: Dictionary with keys matching Manifest fields + :rtype: dict + """ + metadata = {key.casefold().replace(' ', '_'): value for key, value in metadata.items()} + fields = [f.name for f in get_iiif_models()['Manifest']._meta.get_fields()] + invalid_keys = [] + + for key in metadata.keys(): + if key != 'metadata' and isinstance(metadata[key], list): + if isinstance(metadata[key][0], dict): + for meta_key in metadata[key][0].keys(): + if 'value' in meta_key: + metadata[key] = metadata[key][0][meta_key] + else: + metadata[key] = ', '.join(metadata[key]) + if key not in fields: + invalid_keys.append(key) + + for invalid_key in invalid_keys: + metadata.pop(invalid_key) + + return metadata + +def get_metadata_from(files): + """ + Find metadata file in uploaded files. + :return: If metadata file exists, returns the values. If no file, returns None. + :rtype: list or None + """ + metadata = None + for file in files: + if metadata is not None: + continue + if 'zip' in guess_type(file.name)[0]: + continue + if 'metadata' in file.name.casefold(): + stream = file.read() + if 'csv' in guess_type(file.name)[0] or 'tab-separated' in guess_type(file.name)[0]: + metadata = Dataset().load(stream.decode('utf-8-sig'), format='csv').dict + else: + metadata = Dataset().load(stream).dict + return metadata + +def metadata_from_file(metadata_file): + format = metadata_file_format(metadata_file) + if format is None: + return + + metadata = None + + if format == 'excel': + with open(metadata_file, 'rb') as fh: + metadata = Dataset().load(fh.read(), format=metadata_file.split('.')[-1]) + else: + with open(metadata_file, 'r', encoding="utf-8-sig") as fh: + metadata = Dataset().load(fh.read(), format=format) + + if metadata is not None: + metadata = clean_metadata(metadata.dict[0]) + + return metadata + +def metadata_file_format(file_path): + """Get format used to read the metadata file + + :param file_path: Name of metadata file + 
:type file_path: str + :return: Format of metadata file, csv, tsv, excel, or None + :rtype: str, None + """ + if file_path is None: + return None + + file_type = guess_type(file_path)[0] + + if 'csv' in file_type: + return 'csv' + elif 'tab-separated' in file_type: + return 'tsv' + elif 'officedocument' in file_type: + return 'excel' + + return None diff --git a/readux_ingest_ecds/services/ocr_services.py b/readux_ingest_ecds/services/ocr_services.py new file mode 100644 index 0000000..296da29 --- /dev/null +++ b/readux_ingest_ecds/services/ocr_services.py @@ -0,0 +1,476 @@ +import httpretty +import json +import csv +import re +import tempfile +from os import environ, path, unlink, remove +from io import BytesIO +import logging +from hocr_spec import HocrValidator +from lxml import etree +from django.conf import settings +from django.core.serializers import deserialize +from readux_ingest_ecds.helpers import get_iiif_models +from .services import fetch_url + +LOGGER = logging.getLogger(__name__) +OCR = get_iiif_models()['OCR'] + +class IncludeQuotesDialect(csv.Dialect): # pylint: disable=too-few-public-methods + """Subclass of csv.Dialect to include the quote marks in OCR content.""" + # include the quote marks in content + lineterminator = '\n' + delimiter = '\t' + quoting = csv.QUOTE_NONE # perform no special processing of quote characters + +class HocrValidationError(Exception): + """Exception for hOCR validation errors.""" + pass # pylint: disable=unnecessary-pass + +def get_ocr(canvas): + """Function to determine method for fetching OCR for a canvas. + + :param canvas: Canvas object + :type canvas: apps.iiif.canvases.models.Canvas + :return: List of dicts of parsed OCR data. 
+ :rtype: list + """ + if canvas.default_ocr == "line": + result = fetch_tei_ocr(canvas) + return parse_tei_ocr(result) + + result = fetch_positional_ocr(canvas) + return add_positional_ocr(canvas, result) + +def fetch_tei_ocr(canvas): + """Function to fetch TEI OCR data for a given canvas. + + :param canvas: Canvas object + :type canvas: apps.iiif.canvases.models.Canvas + :return: Positional OCR data + :rtype: requests.models.Response + """ + if 'archivelab' in canvas.manifest.image_server.server_base: + return None + url = "{p}{c}/datastreams/tei/content".format( + p=settings.DATASTREAM_PREFIX, + c=canvas.pid.replace('fedora:', '') + ) + + return fetch_url(url, data_format='text/plain') + +def fetch_positional_ocr(canvas): + """Function to get OCR for a canvas depending on the image's source. + + :param canvas: Canvas object + :type canvas: apps.iiif.canvases.models.Canvas + :return: Positional OCR data + :rtype: requests.models.Response + """ + if 'archivelab' in canvas.manifest.image_server.server_base: + if '$' in canvas.pid: + pid = str(int(canvas.pid.split('$')[-1]) - canvas.ocr_offset) + else: + pid = canvas.pid + + url = f"https://api.archivelab.org/books/{canvas.manifest.pid}/pages/{pid}/ocr?mode=words" + + if environ['DJANGO_ENV'] == 'test': + fake_ocr = open(path.join(settings.FIXTURE_DIR, 'ocr_words.json')) + words = fake_ocr.read() + httpretty.enable() + httpretty.register_uri(httpretty.GET, url, body=words) + + return fetch_url(url) + + if 'images.readux.ecds.emory' in canvas.manifest.image_server.server_base: + # Fake TSV data for testing. 
+ if environ['DJANGO_ENV'] == 'test': + fake_tsv = open(path.join(settings.FIXTURE_DIR, 'sample.tsv')) + tsv = fake_tsv.read() + url = "https://raw.githubusercontent.com/ecds/ocr-bucket/master/{m}/boo.tsv".format( + m=canvas.manifest.pid + ) + httpretty.enable() + httpretty.register_uri(httpretty.GET, url, body=tsv) + + if canvas.ocr_file_path is None: + return fetch_url( + "https://raw.githubusercontent.com/ecds/ocr-bucket/master/{m}/{p}.tsv".format( + m=canvas.manifest.pid, + p=canvas.pid.split('_')[-1] + .replace('.jp2', '') + .replace('.jpg', '') + .replace('.tif', '') + ), + data_format='text' + ) + + url = "{p}{c}{s}".format( + p=settings.DATASTREAM_PREFIX, + c=canvas.pid.replace('fedora:', ''), + s=settings.DATASTREAM_SUFFIX + ) + + if ( + environ['DJANGO_ENV'] == 'test' + and 'images.readux.ecds.emory' not in canvas.manifest.image_server.server_base + and canvas.ocr_file_path is None + ): + fake_json = open(path.join(settings.FIXTURE_DIR, 'ocr_words.json')) + words = fake_json.read() + httpretty.enable(allow_net_connect=True) + httpretty.register_uri(httpretty.GET, url, body=words) + + if canvas.ocr_file_path is not None: + if canvas.image_server.storage_service == 's3': + return canvas.image_server.bucket.Object(canvas.ocr_file_path).get()['Body'].read() + + return fetch_url(url, data_format='text/plain') + +def parse_alto_ocr(result): + """Function to parse fetched ALTO OCR data for a given canvas. 
+ + :param result: Fetched ALTO OCR data + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ + if result is None: + return None + ocr = [] + unvalidated_root = etree.fromstring(result) + if 'ns-v2' in unvalidated_root.tag: + schema_file = 'xml_schema/alto-2-1.xsd' + elif 'ns-v3' in unvalidated_root.tag: + schema_file = 'xml_schema/alto-3-1.xsd' + elif 'ns-v4' in unvalidated_root.tag: + schema_file = 'xml_schema/alto-4-2.xsd' + else: + schema_file = 'xml_schema/alto-1-4.xsd' + parser = etree.XMLParser(schema = etree.XMLSchema(file=schema_file)) + # The following will raise etree.XMLSyntaxError if invalid + root = etree.fromstring(result, parser=parser) + strings = root.findall('.//String') + if not strings: + strings = root.findall('.//{*}String') + for string in strings: + attrib = {k.lower(): v for k, v in string.attrib.items()} + ocr.append({ + 'content': attrib['content'], + 'h': int(attrib['height']), + 'w': int(attrib['width']), + 'x': int(attrib['hpos']), + 'y': int(attrib['vpos']) + }) + if ocr: + return ocr + return None + +def parse_hocr_ocr(result): + """Function to parse fetched hOCR data for a given canvas. + + :param result: Fetched hOCR data + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ + if isinstance(result, bytes): + as_string = result.decode('utf-8') + else: + as_string = str(result) + # Regex to ignore x_size, x_ascenders, x_descenders. 
this is a known issue with + # tesseract produced hOCR: https://github.com/tesseract-ocr/tesseract/issues/3303 + result_without_invalid = re.sub( + r'([ ;]+)(x_size [0-9\.\-;]+)|( x_descenders [0-9\.\-;]+)|( x_ascenders [0-9\.\-;]+)', + repl='', string=as_string + ) + file_like_hocr = BytesIO(result_without_invalid.encode('utf-8')) + with tempfile.NamedTemporaryFile(delete=False) as tmp_file: + file_like_hocr.seek(0) + tmp_file.write(file_like_hocr.read()) + tmp_file.flush() + temp_file_name = tmp_file.name + validator = HocrValidator(profile='relaxed') + report = validator.validate(source=temp_file_name) + is_valid = report.format('bool') + if not is_valid: + report_text = report.format('text') + unlink(temp_file_name) + raise HocrValidationError(str(report_text)) + unlink(temp_file_name) + ocr = [] + file_like_hocr.seek(0) + tree = etree.parse(file_like_hocr) + words = tree.findall(".//span[@class]") + if not words: + words = tree.findall(".//{*}span[@class]") + for word in words: + if word.attrib['class'] == 'ocrx_word': + all_attrs = word.attrib['title'].split(';') + bbox = next((attrib for attrib in all_attrs if 'bbox' in attrib), '') + # Splitting 'bbox x0 y0 x1 y1' + bbox_attrs = bbox.split(' ') + if len(bbox_attrs) == 5: + ocr.append({ + 'content': word.text, + 'h': int(bbox_attrs[4]) - int(bbox_attrs[2]), + 'w': int(bbox_attrs[3]) - int(bbox_attrs[1]), + 'x': int(bbox_attrs[1]), + 'y': int(bbox_attrs[2]) + }) + if ocr: + return ocr + return None + +def parse_dict_ocr(result): + """Function to parse dict or JSON OCR data. 
+ + :param result: Fetched dict OCR data + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ + ocr = [] + if isinstance(result, bytes): + as_string = result.decode('utf-8') + as_dict = json.loads(as_string) + elif isinstance(result, str): + as_dict = json.loads(result) + else: + as_dict = result + if 'ocr' in as_dict and as_dict['ocr'] is not None: + for index, word in enumerate(as_dict['ocr']): # pylint: disable=unused-variable + if len(word) > 0: + for w in word: + ocr.append({ + 'content': w[0], + 'w': (w[1][2] - w[1][0]), + 'h': (w[1][1] - w[1][3]), + 'x': w[1][0], + 'y': w[1][3], + }) + if ocr: + return ocr + return None + +def parse_tei_ocr(result): + """Function to parse fetched TEI OCR data for a given canvas. + + :param result: Fetched TEI OCR data + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ + if result is None: + return None + ocr = [] + parser = etree.XMLParser(schema = etree.XMLSchema(file='xml_schema/tei_all.xsd')) + # The following will raise etree.XMLSyntaxError if invalid + surface = etree.fromstring(result, parser=parser)[-1][0] + for zones in surface: + if 'zone' in zones.tag: + for line in zones: + # if line[-1].text is None: + # continue + ocr.append({ + 'content': line[-1].text, + 'h': int(line.get('lry')) - int(line.get('uly')), + 'w': int(line.get('lrx')) - int(line.get('ulx')), + 'x': int(line.get('ulx')), + 'y': int(line.get('uly')) + }) + if ocr: + return ocr + return None + +def parse_tsv_ocr(result): + """Function to parse fetched TSV OCR data for a given canvas. + + :param result: Fetched TSV OCR data + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ + ocr = [] + if isinstance(result, bytes): + lines = result.decode('utf-8').splitlines() + else: + lines = str(result).split('\n') + + # Sometimes the TSV has some extra tabs at the beginning and the end. These have + # to be cleaned out. It gets complicated. 
+ for index, line in enumerate(lines): + # First we remove any leading column that is empty. + line = line.strip() + lines[index] = line + # It might be true that the "content" column is empty. However, we just + # removed it. So we have to add it back. + if lines[index].count('\t') == 3: + lines[index] = ' \t' + lines[index] + + reader = csv.DictReader(lines, dialect=IncludeQuotesDialect) + + for row in reader: + content = row['content'] + w = int(row['w']) + h = int(row['h']) + x = int(row['x']) + y = int(row['y']) + ocr.append({ + 'content': content, + 'w': w, + 'h': h, + 'x': x, + 'y': y, + }) + if ocr: + return ocr + return None + +def parse_fedora_ocr(result): + """Function to parse fetched Fedora OCR data for a given canvas. + + :param result: Fetched Fedora OCR data (bytes) + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ + ocr = [] + if isinstance(result, bytes): + # What comes back from fedora is 8-bit bytes + for _, word in enumerate(result.decode('UTF-8-sig').strip().split('\r\n')): + if len(word.split('\t')) == 5: + ocr.append({ + 'content': word.split('\t')[4], + 'w': int(word.split('\t')[2]), + 'h': int(word.split('\t')[3]), + 'x': int(word.split('\t')[0]), + 'y': int(word.split('\t')[1]) + }) + return ocr + +def parse_xml_ocr(result): + """Function to determine the flavor of XML OCR and then parse accordingly. 
+ + :param result: Fetched XML OCR data + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ + root = etree.fromstring(result) + if ( + re.match(r'{[0-9A-Za-z.:/#-]+}alto|alto', root.tag) + or 'www.loc.gov/standards/alto' in root.find('.//*').tag + ): + return parse_alto_ocr(result) + if root.find('.//teiHeader') is not None or root.find('.//{*}teiHeader') is not None: + return parse_tei_ocr(result) + if root.find('.//div') is not None or root.find('.//{*}div') is not None: + # Fallback to hOCR if it looks like XHTML + return parse_hocr_ocr(result) + return None + +def add_ocr_annotations(canvas, ocr): + word_order = 1 + for word in ocr: + # A quick check to make sure the header row didn't slip through. + if word['x'] == 'x': + continue + + # Set the content to a single space if it's missing. + if ( + word == '' or + 'content' not in word or + not word['content'] or + word['content'].isspace() + ): + word['content'] = ' ' + anno = OCR() + anno.canvas = canvas + anno.x = word['x'] + anno.y = word['y'] + anno.w = word['w'] + anno.h = word['h'] + anno.resource_type = anno.OCR + anno.content = word['content'] + anno.order = word_order + anno.save() + word_order += 1 + +def add_oa_annotations(annotation_list_url): + data = fetch_url(annotation_list_url) + for oa_annotation in data['resources']: + anno = deserialize('annotation', oa_annotation) + anno.save() + +def add_positional_ocr(canvas, result): + """Function to disambiguate and parse fetched OCR data for a canvas. + + :param canvas: Canvas object + :type canvas: apps.iiif.canvases.models.Canvas + :param result: Previously fetched OCR data + :type result: requests.models.Response + :return: List of dicts of parsed OCR data. 
+ :rtype: list + """ + if result is None: + return None + if canvas.ocr_file_path is None: + if isinstance(result, dict) or is_json(result): + ocr = parse_dict_ocr(result) + elif is_tsv(result) and isinstance(result, bytes): + if result.decode('utf-8') == result.decode('UTF-8-sig'): + ocr = parse_tsv_ocr(result) + else: + ocr = parse_fedora_ocr(result) + elif is_tsv(result): + ocr = parse_tsv_ocr(result) + elif canvas.ocr_file_path.endswith('.json'): + ocr = parse_dict_ocr(result) + elif canvas.ocr_file_path.endswith('.tsv') or canvas.ocr_file_path.endswith('.tab'): + ocr = parse_tsv_ocr(result) + elif canvas.ocr_file_path.endswith('.xml'): + ocr = parse_xml_ocr(result) + elif canvas.ocr_file_path.endswith('.hocr'): + ocr = parse_hocr_ocr(result) + if ocr: + return ocr + return None + +def is_json(to_test): + """Function to test if data is shaped like JSON. + + :param to_test: String or bytes + :type to_test: requests.models.Response + :return: True if shaped like JSON, False if not. + :rtype: bool + """ + if isinstance(to_test, bytes): + as_str = to_test.decode('utf-8') + else: + as_str = str(to_test) + try: + json.loads(as_str) + except ValueError: + return False + return True + +def is_tsv(to_test): + """Function to test if data is shaped like a TSV. + + :param to_test: String or bytes + :type to_test: requests.models.Response + :return: True if shaped like a TSV, False if not. + :rtype: bool + """ + if isinstance(to_test, bytes): + as_str = to_test.decode('utf-8') + as_list = as_str.splitlines() + else: + as_str = str(to_test) + as_list = as_str.split('\n') + if len(as_list) > 1: + if len(as_str.split('\t')) > 1: + return True + return False diff --git a/readux_ingest_ecds/services/services.py b/readux_ingest_ecds/services/services.py new file mode 100644 index 0000000..c15f966 --- /dev/null +++ b/readux_ingest_ecds/services/services.py @@ -0,0 +1,40 @@ +""" Utility functions for fetching remote data. 
""" +import json +import logging +import requests + +logger = logging.getLogger(__name__) +logging.getLogger("urllib3").setLevel(logging.ERROR) + +def fetch_url(url, timeout=30, data_format='json', verbosity=1): + """ Given a url, this function returns the data.""" + data = None + try: + resp = requests.get(url, timeout=timeout, verify=True) + except requests.exceptions.Timeout as err: + if verbosity > 2: + logger.warning('Connection timeoutout for {}'.format(url)) + return data + except Exception as err: + if verbosity > 2: + logger.warning('Connection failed for {}. ({})'.format(url, str(err))) + return data + + if resp.status_code != 200: + if verbosity > 2: + logger.warning('Connection failed status {}. ({})'.format(url, resp.status_code)) + return data + + if data_format == 'json': + try: + data = resp.json() + except json.decoder.JSONDecodeError as err: + if verbosity > 2: + logger.warning('Server send success status with bad content {}'.format(url)) + return data + + if data_format == 'text': + data = resp.text + else: + data = resp.content + return data diff --git a/readux_ingest_ecds/tasks.py b/readux_ingest_ecds/tasks.py index 43e194b..c5b5267 100644 --- a/readux_ingest_ecds/tasks.py +++ b/readux_ingest_ecds/tasks.py @@ -1,18 +1,18 @@ # pylint: disable = unused-argument """ Common tasks for ingest. """ +import os from celery import Celery from django.apps import apps from django.conf import settings from .helpers import get_iiif_models +from .services.ocr_services import get_ocr, add_ocr_annotations # Use `apps.get_model` to avoid circular import error. Because the parameters used to # create a background task have to be serializable, we can't just pass in the model object. 
Local = apps.get_model('readux_ingest_ecds.local') # pylint: disable = invalid-name -# Remote = apps.get_model('ingest.remote') -# S3Ingest = apps.get_model('ingest.S3Ingest') -Manifest = get_iiif_models()['Manifest'] +Manifest = get_iiif_models()['Manifest'] Canvas = get_iiif_models()['Canvas'] app = Celery('readux_ingest_ecds', result_extended=True) @@ -29,3 +29,19 @@ def local_ingest_task_ecds(ingest_id): """ local_ingest = Local.objects.get(pk=ingest_id) local_ingest.ingest() + if os.environ["DJANGO_ENV"] != 'test': # pragma: no cover + add_ocr_task.delay(local_ingest.manifest.pk) + else: + add_ocr_task(local_ingest.manifest.pk) + + +@app.task(name='adding_ocr_to_canvas', autoretry_for=(Manifest.DoesNotExist,), retry_backoff=5) +def add_ocr_task(manifest_id, *args, **kwargs): + """Function for parsing and adding OCR.""" + manifest = Manifest.objects.get(pk=manifest_id) + for canvas in manifest.canvas_set.all(): + ocr = get_ocr(canvas) + + if ocr is not None: + add_ocr_annotations(canvas, ocr) + canvas.save() # trigger reindex diff --git a/setup.cfg b/setup.cfg index 7faa55e..939072d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,3 +36,4 @@ install_requires = django-celery-results~=2.4.0 boto3 Pillow==9.4.0 # wagtail 4.2.4 depends on Pillow<10.0.0 and >=4.0.0 + requests>=1.3.1 diff --git a/test_app/fixtures/00000002.tsv b/test_app/fixtures/00000002.tsv new file mode 100644 index 0000000..791b3a0 --- /dev/null +++ b/test_app/fixtures/00000002.tsv @@ -0,0 +1,11 @@ +content x y w h +Manuscript 939 561 745 247 +, 1698 577 63 232 +Archives 1787 578 554 243 +and 969 739 213 235 +Rare 1242 754 310 240 +Book 1608 775 300 239 +Library 1997 795 450 249 +F 1516 1182 22 90 +EMORY 829 2728 560 161 +UNIVERSITY 1427 2748 971 173 diff --git a/test_app/fixtures/__init__.py b/test_app/fixtures/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test_app/fixtures/alto.xml b/test_app/fixtures/alto.xml new file mode 100755 index 0000000..482eddd --- /dev/null +++ 
b/test_app/fixtures/alto.xml @@ -0,0 +1,41 @@ + + + + pixel + + ./P100.tif + + + + + tesseract 4.0.0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test_app/fixtures/bad_hocr.hocr b/test_app/fixtures/bad_hocr.hocr new file mode 100755 index 0000000..995ee0d --- /dev/null +++ b/test_app/fixtures/bad_hocr.hocr @@ -0,0 +1,33 @@ + + + + + + + + + + +
+
+

+ + MAGNA + CAMPI + MARTII + β€” + + + ICHNOGRAPHIA + DESCRIPTA + SV + NT + +

+
+
+
+
+ + diff --git a/test_app/fixtures/bad_tei.xml b/test_app/fixtures/bad_tei.xml new file mode 100644 index 0000000..ea1594d --- /dev/null +++ b/test_app/fixtures/bad_tei.xml @@ -0,0 +1,7 @@ + + + + Nice marmont + + + \ No newline at end of file diff --git a/test_app/fixtures/canvases.json b/test_app/fixtures/canvases.json new file mode 100644 index 0000000..29c28f2 --- /dev/null +++ b/test_app/fixtures/canvases.json @@ -0,0 +1,29 @@ +[{ + "model": "canvases.canvas", + "pk": "7261fae2-a24e-4a1c-9743-516f6c4ea0c9", + "fields": { + "label": "", + "pid": "fedora:emory:5622", + "resource": "5622", + "summary": null, + "manifest": "464d82f6-6ae5-4503-9afc-8e3cdd92a3f1", + "position": 6, + "height": 3608, + "width": 1976 + } +}, +{ + "model": "canvases.canvas", + "pk": "a7f1bd69-766c-4dd4-ab66-f4051fdd4cff", + "fields": { + "label": "", + "pid": "15210893.5622.emory.edu$95", + "resource": "15210893", + "summary": null, + "manifest": "464d82f6-6ae5-4503-9afc-8e3cdd92a3f1", + "position": 96, + "height": 1976, + "width": 3608, + "is_starting_page": true + } +}] \ No newline at end of file diff --git a/test_app/fixtures/hocr.hocr b/test_app/fixtures/hocr.hocr new file mode 100755 index 0000000..d04d8fb --- /dev/null +++ b/test_app/fixtures/hocr.hocr @@ -0,0 +1,33 @@ + + + + + + + + + + +
+
+

+ + MAGNA + CAMPI + MARTII + β€” + + + ICHNOGRAPHIA + DESCRIPTA + SV + NT + +

+
+
+
+
+ + diff --git a/test_app/fixtures/hops.xml b/test_app/fixtures/hops.xml new file mode 100644 index 0000000..e5d5b8f --- /dev/null +++ b/test_app/fixtures/hops.xml @@ -0,0 +1,105 @@ + + + + + Cascade + 1 + US + 5.50 + 0.0000000 + Boil + + Use For: American ales and lagers +Aroma: Strong spicy, floral, grapefriut character +Substitutes: Centennial +Examples: Sierra Nevade Pale Ale, Anchor Liberty Ale +A hops with Northern Brewers Heritage + Both +
Pellet
+ 6.00 + 50.0 + 0.00 oz + 0.00 oz + - +
+ + Galena + 1 + US + 13.00 + 0.0000000 + Boil + + Use for: General bittering hops for all beers +Aroma: Strong, clean, balanced bittering +Substitute: Eroica, Northern Brewer, Cluster, Chinook +Examples: Catamount Porter + Bittering +
Pellet
+ 7.50 + 15.0 + 0.00 oz + 0.00 oz + - +
+ + Goldings, B.C. + 1 + Canada + 5.00 + 0.0000000 + Boil + + Used for: Bittering and finishing British ales, bitters, porters and stouts. +Aroma: Spicy, floral, rounded mild aroma. +Substitutes: East Kent Goldings, Fuggles + Aroma +
Pellet
+ 3.20 + 40.0 + 0.00 oz + 0.00 oz + - +
+ + Northern Brewer + 1 + Germany + 8.50 + 0.0000000 + Boil + + Also called Hallertauer Northern Brewers +Use for: Bittering and finishing both ales and lagers of all kinds +Aroma: Fine, dry, clean bittering hop. Unique flavor. +Substitute: Hallertauer Mittelfrueh, Hallertauer +Examples: Anchor Steam, Old Peculiar, + Both +
Pellet
+ 4.00 + 35.0 + 0.00 oz + 0.00 oz + - +
+ + Tettnang + 1 + Germany + 4.50 + 0.0000000 + Boil + + Use for: German ales, lagers and wheat beer +Aroma: Noble, mild, fine, slightly spicy +Substitutes: Saaz, Spalt +Examples: Sam Adams Octoberfest, Anderson Valley ESB + Aroma +
Pellet
+ 3.50 + 40.0 + 0.00 oz + 0.00 oz + - +
+
diff --git a/test_app/fixtures/info.json b/test_app/fixtures/info.json new file mode 100644 index 0000000..53d8bf1 --- /dev/null +++ b/test_app/fixtures/info.json @@ -0,0 +1,43 @@ +{ + "@context": "http://iiif.io/api/image/2/context.json", + "@id": "https://images.readux.ecds.emory.edu:8443/cantaloupe/iiif/2/osh-formal-mir_OSH-cover1.jpg", + "protocol": "http://iiif.io/api/image", + "width": 3000, + "height": 3000, + "sizes": [{ + "width": 122, + "height": 106 + }, + { + "width": 245, + "height": 212 + }, + { + "width": 490, + "height": 423 + }, + { + "width": 979, + "height": 847 + }, + { + "width": 1958, + "height": 1694 + }, + { + "width": 3916, + "height": 3387 + } + ], + "tiles": [{ + "width": 979, + "height": 847, + "scaleFactors": [1, 2, 4, 8, 16, 32] + }], + "profile": ["http://iiif.io/api/image/2/level2.json", { + "formats": ["jpg", "tif", "gif", "png"], + "maxArea": 400000000, + "qualities": ["bitonal", "default", "gray", "color"], + "supports": ["regionByPx", "sizeByW", "sizeByWhListed", "cors", "regionSquare", "sizeByDistortedWh", "sizeAboveFull", "canonicalLinkHeader", "sizeByConfinedWh", "sizeByPct", "jsonldMediaType", "regionByPct", "rotationArbitrary", "sizeByH", "baseUriRedirect", "rotationBy90s", "profileLinkHeader", "sizeByForcedWh", "sizeByWh", "mirroring"] + }] +} \ No newline at end of file diff --git a/test_app/fixtures/ocr_words.json b/test_app/fixtures/ocr_words.json new file mode 100644 index 0000000..1617976 --- /dev/null +++ b/test_app/fixtures/ocr_words.json @@ -0,0 +1,4714 @@ +{ + "ocr": [ + [ + [ + "Dope", + [ + 1146, + 950, + 1168, + 928, + 0 + ] + ], + [ + "the", + [ + 365, + 377, + 426, + 347, + 376 + ] + ], + [ + "intensest", + [ + 448, + 378, + 630, + 345, + 376 + ] + ], + [ + "abhorrence", + [ + 650, + 381, + 879, + 347, + 379 + ] + ], + [ + "and", + [ + 899, + 382, + 969, + 350, + 381 + ] + ], + [ + "invites", + [ + 992, + 384, + 1126, + 350, + 383 + ] + ], + [ + "mankind", + [ + 1159, + 385, + 1333, + 353, + 384 + ] + ], + [ + 
"to", + [ + 1349, + 384, + 1384, + 355, + 384 + ] + ], + [ + "kill", + [ + 1405, + 382, + 1462, + 351, + 381 + ] + ], + [ + "the", + [ + 1495, + 380, + 1556, + 349, + 379 + ] + ], + [ + "crim-", + [ + 1578, + 378, + 1679, + 353, + 375 + ] + ] + ], + [ + [ + "inal", + [ + 361, + 433, + 429, + 402, + 433 + ] + ], + [ + "on", + [ + 452, + 433, + 496, + 410, + 433 + ] + ], + [ + "sight.", + [ + 518, + 442, + 626, + 401, + 433 + ] + ], + [ + "This", + [ + 682, + 436, + 771, + 402, + 435 + ] + ], + [ + "charge", + [ + 790, + 446, + 924, + 405, + 436 + ] + ], + [ + "thus", + [ + 944, + 440, + 1031, + 408, + 439 + ] + ], + [ + "brought", + [ + 1051, + 442, + 1208, + 408, + 441 + ] + ], + [ + "against", + [ + 1226, + 442, + 1370, + 409, + 440 + ] + ], + [ + "the", + [ + 1383, + 438, + 1445, + 408, + 437 + ] + ], + [ + "negro,", + [ + 1465, + 443, + 1587, + 411, + 433 + ] + ], + [ + "and", + [ + 1610, + 430, + 1680, + 398, + 429 + ] + ] + ], + [ + [ + "as", + [ + 362, + 490, + 401, + 468, + 490 + ] + ], + [ + "constantly", + [ + 421, + 501, + 634, + 458, + 489 + ] + ], + [ + "reiterated", + [ + 663, + 493, + 858, + 459, + 491 + ] + ], + [ + "by", + [ + 886, + 504, + 936, + 463, + 494 + ] + ], + [ + "his", + [ + 965, + 496, + 1020, + 464, + 495 + ] + ], + [ + "enemies,", + [ + 1048, + 503, + 1225, + 466, + 497 + ] + ], + [ + "is", + [ + 1248, + 497, + 1275, + 466, + 497 + ] + ], + [ + "not", + [ + 1298, + 496, + 1358, + 468, + 494 + ] + ], + [ + "merely", + [ + 1379, + 499, + 1517, + 460, + 491 + ] + ], + [ + "aga'.ns", + [ + 1540, + 497, + 1665, + 463, + 485 + ] + ], + [ + ":", + [ + 1674, + 476, + 1675, + 459, + 476 + ] + ], + [ + ":", + [ + 1678, + 483, + 1683, + 462, + 483 + ] + ] + ], + [ + [ + "the", + [ + 362, + 547, + 423, + 517, + 547 + ] + ], + [ + "individual", + [ + 445, + 547, + 642, + 514, + 546 + ] + ], + [ + "culprit,", + [ + 663, + 559, + 805, + 516, + 547 + ] + ], + [ + "as", + [ + 828, + 550, + 868, + 527, + 549 + ] + ], + [ + "would", + [ + 888, + 552, + 
1007, + 520, + 551 + ] + ], + [ + "be", + [ + 1038, + 553, + 1082, + 521, + 553 + ] + ], + [ + "in", + [ + 1103, + 554, + 1136, + 522, + 554 + ] + ], + [ + "the", + [ + 1159, + 555, + 1220, + 524, + 555 + ] + ], + [ + "case", + [ + 1241, + 553, + 1324, + 530, + 552 + ] + ], + [ + "with", + [ + 1354, + 549, + 1440, + 518, + 548 + ] + ], + [ + "an", + [ + 1462, + 546, + 1507, + 524, + 546 + ] + ], + [ + "indiv.-j-", + [ + 1530, + 544, + 1678, + 510, + 542 + ] + ] + ], + [ + [ + "ual", + [ + 362, + 604, + 418, + 572, + 603 + ] + ], + [ + "culprit", + [ + 439, + 614, + 568, + 571, + 603 + ] + ], + [ + "of", + [ + 587, + 604, + 624, + 571, + 604 + ] + ], + [ + "any", + [ + 642, + 615, + 718, + 581, + 604 + ] + ], + [ + "other", + [ + 750, + 606, + 855, + 574, + 605 + ] + ], + [ + "race,", + [ + 885, + 617, + 979, + 585, + 607 + ] + ], + [ + "but", + [ + 1003, + 610, + 1068, + 578, + 609 + ] + ], + [ + "it", + [ + 1099, + 612, + 1124, + 578, + 611 + ] + ], + [ + "is", + [ + 1156, + 612, + 1183, + 579, + 611 + ] + ], + [ + "in", + [ + 1215, + 610, + 1247, + 580, + 610 + ] + ], + [ + "a", + [ + 1279, + 609, + 1298, + 587, + 609 + ] + ], + [ + "large", + [ + 1330, + 616, + 1427, + 576, + 606 + ] + ], + [ + "measure", + [ + 1460, + 603, + 1629, + 574, + 600 + ] + ], + [ + "a", + [ + 1661, + 595, + 1681, + 574, + 595 + ] + ] + ], + [ + [ + "charge", + [ + 360, + 669, + 493, + 630, + 660 + ] + ], + [ + "against", + [ + 513, + 670, + 657, + 627, + 660 + ] + ], + [ + "the", + [ + 675, + 661, + 739, + 630, + 661 + ] + ], + [ + "colored", + [ + 759, + 663, + 902, + 630, + 662 + ] + ], + [ + "race", + [ + 924, + 665, + 1004, + 642, + 664 + ] + ], + [ + "as", + [ + 1025, + 667, + 1065, + 644, + 666 + ] + ], + [ + "such.", + [ + 1085, + 668, + 1189, + 637, + 667 + ] + ], + [ + "It", + [ + 1246, + 666, + 1271, + 634, + 666 + ] + ], + [ + "throws", + [ + 1290, + 665, + 1430, + 633, + 663 + ] + ], + [ + "over", + [ + 1456, + 661, + 1544, + 635, + 659 + ] + ], + [ + "every", + [ + 1564, 
+ 664, + 1681, + 630, + 655 + ] + ] + ], + [ + [ + ";olored", + [ + 366, + 718, + 501, + 685, + 717 + ] + ], + [ + "man", + [ + 523, + 717, + 607, + 694, + 717 + ] + ], + [ + "a", + [ + 628, + 717, + 647, + 695, + 717 + ] + ], + [ + "mantle", + [ + 669, + 719, + 807, + 686, + 718 + ] + ], + [ + "of", + [ + 828, + 719, + 864, + 687, + 719 + ] + ], + [ + "odium", + [ + 883, + 722, + 1002, + 688, + 721 + ] + ], + [ + "and", + [ + 1035, + 723, + 1106, + 691, + 722 + ] + ], + [ + "sets", + [ + 1137, + 725, + 1213, + 698, + 725 + ] + ], + [ + "upon", + [ + 1246, + 733, + 1340, + 699, + 722 + ] + ], + [ + "him", + [ + 1362, + 720, + 1434, + 688, + 719 + ] + ], + [ + "a", + [ + 1464, + 717, + 1482, + 696, + 717 + ] + ], + [ + "mark", + [ + 1504, + 716, + 1607, + 681, + 714 + ] + ], + [ + "tor", + [ + 1629, + 712, + 1679, + 683, + 711 + ] + ] + ], + [ + [ + "popular", + [ + 359, + 785, + 508, + 743, + 774 + ] + ], + [ + "hate,", + [ + 529, + 783, + 628, + 743, + 774 + ] + ], + [ + "more", + [ + 651, + 775, + 750, + 752, + 774 + ] + ], + [ + "distressing", + [ + 771, + 788, + 989, + 743, + 776 + ] + ], + [ + "than", + [ + 1008, + 780, + 1097, + 748, + 779 + ] + ], + [ + "the", + [ + 1118, + 781, + 1181, + 749, + 781 + ] + ], + [ + "mark", + [ + 1203, + 781, + 1305, + 747, + 780 + ] + ], + [ + "set", + [ + 1326, + 779, + 1383, + 748, + 778 + ] + ], + [ + "upon", + [ + 1402, + 788, + 1498, + 752, + 776 + ] + ], + [ + "the", + [ + 1518, + 774, + 1582, + 742, + 773 + ] + ], + [ + "first", + [ + 1603, + 772, + 1681, + 737, + 770 + ] + ] + ], + [ + [ + "murderer.", + [ + 359, + 832, + 557, + 799, + 831 + ] + ], + [ + "It", + [ + 613, + 832, + 640, + 799, + 831 + ] + ], + [ + "points", + [ + 667, + 842, + 789, + 799, + 831 + ] + ], + [ + "him", + [ + 817, + 833, + 889, + 801, + 833 + ] + ], + [ + "out", + [ + 922, + 835, + 985, + 806, + 834 + ] + ], + [ + "as", + [ + 1016, + 836, + 1056, + 814, + 835 + ] + ], + [ + "an", + [ + 1088, + 837, + 1133, + 814, + 836 + ] + ], + [ + 
"object", + [ + 1167, + 848, + 1289, + 806, + 837 + ] + ], + [ + "of", + [ + 1320, + 837, + 1356, + 802, + 836 + ] + ], + [ + "suspicion", + [ + 1391, + 845, + 1579, + 799, + 833 + ] + ], + [ + "and", + [ + 1612, + 829, + 1683, + 794, + 828 + ] + ] + ], + [ + [ + "avoidance.", + [ + 357, + 890, + 574, + 857, + 888 + ] + ], + [ + "Now", + [ + 629, + 889, + 720, + 856, + 888 + ] + ], + [ + "it", + [ + 749, + 889, + 774, + 857, + 888 + ] + ], + [ + "is", + [ + 802, + 890, + 830, + 857, + 889 + ] + ], + [ + "in", + [ + 862, + 891, + 896, + 858, + 890 + ] + ], + [ + "this", + [ + 926, + 892, + 999, + 860, + 891 + ] + ], + [ + "form", + [ + 1029, + 893, + 1118, + 860, + 892 + ] + ], + [ + "that", + [ + 1147, + 894, + 1228, + 863, + 894 + ] + ], + [ + "you", + [ + 1248, + 904, + 1322, + 872, + 894 + ] + ], + [ + "and", + [ + 1343, + 893, + 1414, + 860, + 893 + ] + ], + [ + "I,", + [ + 1438, + 900, + 1460, + 860, + 892 + ] + ], + [ + "and", + [ + 1483, + 891, + 1554, + 857, + 890 + ] + ], + [ + "all", + [ + 1575, + 888, + 1619, + 856, + 887 + ] + ], + [ + "of", + [ + 1649, + 886, + 1685, + 852, + 885 + ] + ] + ], + [ + [ + "us,", + [ + 358, + 952, + 411, + 925, + 947 + ] + ], + [ + "are", + [ + 434, + 947, + 495, + 923, + 946 + ] + ], + [ + "required", + [ + 517, + 956, + 683, + 914, + 946 + ] + ], + [ + "to", + [ + 705, + 946, + 741, + 916, + 946 + ] + ], + [ + "meet", + [ + 762, + 947, + 861, + 917, + 946 + ] + ], + [ + "it", + [ + 889, + 948, + 914, + 915, + 947 + ] + ], + [ + "and", + [ + 941, + 949, + 1011, + 917, + 948 + ] + ], + [ + "refute", + [ + 1042, + 950, + 1161, + 917, + 949 + ] + ], + [ + "it,", + [ + 1182, + 959, + 1219, + 918, + 950 + ] + ], + [ + "if", + [ + 1243, + 951, + 1267, + 919, + 950 + ] + ], + [ + "that", + [ + 1294, + 951, + 1376, + 919, + 950 + ] + ], + [ + "can", + [ + 1403, + 950, + 1471, + 926, + 949 + ] + ], + [ + "be", + [ + 1501, + 949, + 1546, + 917, + 948 + ] + ], + [ + "done.", + [ + 1574, + 946, + 1682, + 914, + 945 + ] + ] + ], + [ + 
[ + "In", + [ + 359, + 1004, + 394, + 972, + 1004 + ] + ], + [ + "the", + [ + 415, + 1004, + 478, + 972, + 1004 + ] + ], + [ + "opinion", + [ + 499, + 1015, + 643, + 971, + 1003 + ] + ], + [ + "of", + [ + 664, + 1005, + 701, + 972, + 1004 + ] + ], + [ + "some", + [ + 728, + 1004, + 830, + 981, + 1003 + ] + ], + [ + "of", + [ + 859, + 1004, + 895, + 971, + 1004 + ] + ], + [ + "us,", + [ + 922, + 1011, + 976, + 983, + 1004 + ] + ], + [ + "it", + [ + 1000, + 1006, + 1024, + 974, + 1006 + ] + ], + [ + "is", + [ + 1045, + 1007, + 1073, + 974, + 1006 + ] + ], + [ + "thought", + [ + 1092, + 1018, + 1252, + 976, + 1007 + ] + ], + [ + "that", + [ + 1279, + 1008, + 1360, + 976, + 1007 + ] + ], + [ + "it", + [ + 1380, + 1007, + 1405, + 975, + 1006 + ] + ], + [ + "were", + [ + 1424, + 1006, + 1523, + 983, + 1006 + ] + ], + [ + "well", + [ + 1544, + 1004, + 1628, + 971, + 1003 + ] + ], + [ + "to", + [ + 1650, + 1002, + 1685, + 973, + 1002 + ] + ] + ], + [ + [ + "say", + [ + 357, + 1072, + 424, + 1040, + 1062 + ] + ], + [ + "nothing", + [ + 447, + 1072, + 599, + 1029, + 1061 + ] + ], + [ + "about", + [ + 618, + 1062, + 731, + 1029, + 1061 + ] + ], + [ + "it,", + [ + 751, + 1070, + 788, + 1028, + 1061 + ] + ], + [ + "that", + [ + 811, + 1062, + 892, + 1029, + 1061 + ] + ], + [ + "the", + [ + 924, + 1062, + 986, + 1031, + 1062 + ] + ], + [ + "least", + [ + 1017, + 1064, + 1111, + 1031, + 1063 + ] + ], + [ + "said", + [ + 1143, + 1064, + 1219, + 1032, + 1064 + ] + ], + [ + "about", + [ + 1252, + 1064, + 1365, + 1032, + 1063 + ] + ], + [ + "it", + [ + 1397, + 1063, + 1422, + 1032, + 1063 + ] + ], + [ + "the", + [ + 1453, + 1063, + 1515, + 1032, + 1063 + ] + ], + [ + "better", + [ + 1549, + 1062, + 1669, + 1031, + 1061 + ] + ] + ], + [ + [ + "In", + [ + 358, + 1120, + 393, + 1087, + 1119 + ] + ], + [ + "this", + [ + 426, + 1120, + 499, + 1086, + 1119 + ] + ], + [ + "opinion", + [ + 530, + 1130, + 674, + 1086, + 1119 + ] + ], + [ + "I", + [ + 710, + 1118, + 717, + 1086, + 1118 + ] + 
], + [ + "do", + [ + 749, + 1119, + 794, + 1086, + 1119 + ] + ], + [ + "not", + [ + 827, + 1119, + 890, + 1089, + 1118 + ] + ], + [ + "concur", + [ + 921, + 1120, + 1055, + 1096, + 1119 + ] + ], + [ + "Taking", + [ + 1126, + 1128, + 1270, + 1088, + 1120 + ] + ], + [ + "this", + [ + 1305, + 1121, + 1378, + 1088, + 1120 + ] + ], + [ + "charge", + [ + 1414, + 1130, + 1549, + 1089, + 1119 + ] + ], + [ + "in", + [ + 1586, + 1118, + 1620, + 1086, + 1118 + ] + ], + [ + "its", + [ + 1644, + 1118, + 1687, + 1087, + 1117 + ] + ] + ], + [ + [ + "broad", + [ + 358, + 1177, + 467, + 1144, + 1177 + ] + ], + [ + "and", + [ + 488, + 1177, + 558, + 1145, + 1177 + ] + ], + [ + "comprehensive", + [ + 579, + 1187, + 886, + 1144, + 1176 + ] + ], + [ + "sense", + [ + 907, + 1177, + 1017, + 1153, + 1176 + ] + ], + [ + "in", + [ + 1046, + 1177, + 1080, + 1145, + 1177 + ] + ], + [ + "which", + [ + 1110, + 1178, + 1230, + 1145, + 1177 + ] + ], + [ + "it", + [ + 1261, + 1178, + 1285, + 1145, + 1177 + ] + ], + [ + "is", + [ + 1314, + 1177, + 1341, + 1145, + 1177 + ] + ], + [ + "presented,", + [ + 1370, + 1187, + 1583, + 1144, + 1176 + ] + ], + [ + "a:id", + [ + 1615, + 1175, + 1684, + 1143, + 1174 + ] + ] + ], + [ + [ + "as", + [ + 357, + 1235, + 396, + 1214, + 1234 + ] + ], + [ + "now", + [ + 424, + 1235, + 505, + 1212, + 1234 + ] + ], + [ + "stated,", + [ + 526, + 1243, + 660, + 1202, + 1235 + ] + ], + [ + "I", + [ + 685, + 1234, + 692, + 1202, + 1234 + ] + ], + [ + "feel", + [ + 714, + 1235, + 784, + 1201, + 1234 + ] + ], + [ + "that", + [ + 813, + 1235, + 894, + 1202, + 1234 + ] + ], + [ + "it", + [ + 915, + 1235, + 940, + 1201, + 1234 + ] + ], + [ + "ought", + [ + 960, + 1245, + 1074, + 1203, + 1234 + ] + ], + [ + "to", + [ + 1093, + 1235, + 1129, + 1205, + 1235 + ] + ], + [ + "be", + [ + 1151, + 1235, + 1195, + 1203, + 1235 + ] + ], + [ + "met,", + [ + 1216, + 1239, + 1303, + 1205, + 1235 + ] + ], + [ + "and", + [ + 1327, + 1234, + 1397, + 1202, + 1234 + ] + ], + [ + "as", + [ + 1419, + 
1234, + 1460, + 1212, + 1234 + ] + ], + [ + "a", + [ + 1491, + 1234, + 1511, + 1212, + 1234 + ] + ], + [ + "colored", + [ + 1543, + 1234, + 1687, + 1200, + 1233 + ] + ] + ], + [ + [ + "man,", + [ + 356, + 1301, + 454, + 1270, + 1293 + ] + ], + [ + "I", + [ + 487, + 1292, + 495, + 1259, + 1292 + ] + ], + [ + "am", + [ + 526, + 1293, + 583, + 1270, + 1292 + ] + ], + [ + "grateful", + [ + 613, + 1303, + 770, + 1259, + 1292 + ] + ], + [ + "for", + [ + 798, + 1293, + 852, + 1259, + 1292 + ] + ], + [ + "the", + [ + 883, + 1292, + 947, + 1260, + 1292 + ] + ], + [ + "opportunity", + [ + 977, + 1303, + 1216, + 1260, + 1291 + ] + ], + [ + "now", + [ + 1251, + 1293, + 1333, + 1269, + 1292 + ] + ], + [ + "afforded", + [ + 1364, + 1293, + 1529, + 1258, + 1292 + ] + ], + [ + "me", + [ + 1564, + 1292, + 1621, + 1269, + 1292 + ] + ], + [ + "to", + [ + 1649, + 1292, + 1686, + 1262, + 1291 + ] + ] + ], + [ + [ + "meet", + [ + 358, + 1351, + 455, + 1321, + 1350 + ] + ], + [ + "it.", + [ + 474, + 1351, + 512, + 1318, + 1351 + ] + ], + [ + "For", + [ + 567, + 1351, + 632, + 1317, + 1351 + ] + ], + [ + "I", + [ + 655, + 1350, + 662, + 1317, + 1350 + ] + ], + [ + "believe", + [ + 685, + 1351, + 828, + 1318, + 1350 + ] + ], + [ + "it", + [ + 849, + 1351, + 874, + 1317, + 1350 + ] + ], + [ + "can", + [ + 893, + 1350, + 961, + 1327, + 1350 + ] + ], + [ + "be", + [ + 984, + 1350, + 1028, + 1318, + 1350 + ] + ], + [ + "met", + [ + 1058, + 1350, + 1133, + 1320, + 1350 + ] + ], + [ + "and", + [ + 1161, + 1350, + 1233, + 1317, + 1350 + ] + ], + [ + "successfully", + [ + 1261, + 1361, + 1510, + 1316, + 1350 + ] + ], + [ + "met.", + [ + 1532, + 1350, + 1620, + 1321, + 1350 + ] + ], + [ + "I", + [ + 1677, + 1349, + 1685, + 1316, + 1349 + ] + ] + ], + [ + [ + "am", + [ + 358, + 1409, + 413, + 1387, + 1408 + ] + ], + [ + "of", + [ + 436, + 1409, + 472, + 1375, + 1408 + ] + ], + [ + "opinion", + [ + 491, + 1420, + 634, + 1375, + 1408 + ] + ], + [ + "that", + [ + 663, + 1409, + 745, + 1377, + 1408 + ] 
+ ], + [ + "a", + [ + 773, + 1409, + 792, + 1386, + 1409 + ] + ], + [ + "people", + [ + 821, + 1420, + 949, + 1376, + 1408 + ] + ], + [ + "too", + [ + 963, + 1409, + 1023, + 1379, + 1409 + ] + ], + [ + "spiritless", + [ + 1044, + 1419, + 1222, + 1375, + 1408 + ] + ], + [ + "to", + [ + 1242, + 1409, + 1279, + 1379, + 1409 + ] + ], + [ + "defend", + [ + 1300, + 1410, + 1434, + 1376, + 1408 + ] + ], + [ + "themselves", + [ + 1456, + 1409, + 1687, + 1376, + 1408 + ] + ] + ], + [ + [ + "are", + [ + 357, + 1467, + 416, + 1444, + 1466 + ] + ], + [ + "not", + [ + 439, + 1467, + 501, + 1436, + 1467 + ] + ], + [ + "worth", + [ + 519, + 1467, + 637, + 1435, + 1466 + ] + ], + [ + "defending.", + [ + 666, + 1477, + 877, + 1433, + 1467 + ] + ] + ], + [ + [ + "Without", + [ + 453, + 1525, + 620, + 1492, + 1524 + ] + ], + [ + "boasting,", + [ + 641, + 1536, + 823, + 1492, + 1524 + ] + ], + [ + "on", + [ + 846, + 1525, + 891, + 1502, + 1524 + ] + ], + [ + "this", + [ + 912, + 1525, + 985, + 1492, + 1524 + ] + ], + [ + "broad", + [ + 1007, + 1524, + 1116, + 1492, + 1524 + ] + ], + [ + "issue", + [ + 1139, + 1525, + 1236, + 1492, + 1524 + ] + ], + [ + "as", + [ + 1265, + 1525, + 1306, + 1502, + 1524 + ] + ], + [ + "now", + [ + 1336, + 1525, + 1418, + 1502, + 1524 + ] + ], + [ + "presented,", + [ + 1439, + 1536, + 1652, + 1492, + 1525 + ] + ], + [ + "1", + [ + 1677, + 1524, + 1686, + 1491, + 1524 + ] + ] + ], + [ + [ + "am", + [ + 359, + 1583, + 414, + 1559, + 1582 + ] + ], + [ + "ready", + [ + 438, + 1593, + 551, + 1549, + 1582 + ] + ], + [ + "to", + [ + 570, + 1583, + 607, + 1552, + 1583 + ] + ], + [ + "confront", + [ + 627, + 1583, + 798, + 1549, + 1582 + ] + ], + [ + "ex-Governor", + [ + 817, + 1582, + 1087, + 1548, + 1581 + ] + ], + [ + "Chamberlain,", + [ + 1117, + 1591, + 1400, + 1548, + 1582 + ] + ], + [ + "Bishop", + [ + 1434, + 1594, + 1568, + 1550, + 1583 + ] + ], + [ + "Fitz-", + [ + 1598, + 1585, + 1688, + 1550, + 1583 + ] + ] + ], + [ + [ + "gerald,", + [ + 360, + 1649, 
+ 495, + 1607, + 1640 + ] + ], + [ + "Bishop", + [ + 528, + 1652, + 662, + 1607, + 1640 + ] + ], + [ + "Haygood,", + [ + 704, + 1651, + 896, + 1607, + 1639 + ] + ], + [ + "and", + [ + 927, + 1639, + 998, + 1607, + 1639 + ] + ], + [ + "Miss", + [ + 1045, + 1639, + 1131, + 1606, + 1639 + ] + ], + [ + "Frances", + [ + 1174, + 1641, + 1333, + 1606, + 1639 + ] + ], + [ + "Willard", + [ + 1362, + 1642, + 1512, + 1608, + 1641 + ] + ], + [ + "and", + [ + 1541, + 1642, + 1612, + 1609, + 1641 + ] + ], + [ + "all", + [ + 1642, + 1641, + 1686, + 1609, + 1641 + ] + ] + ], + [ + [ + "others,", + [ + 359, + 1707, + 495, + 1667, + 1698 + ] + ], + [ + "singly", + [ + 518, + 1709, + 639, + 1665, + 1698 + ] + ], + [ + "or", + [ + 660, + 1699, + 697, + 1676, + 1698 + ] + ], + [ + "altogether,", + [ + 716, + 1707, + 936, + 1665, + 1697 + ] + ], + [ + "without", + [ + 960, + 1697, + 1120, + 1664, + 1696 + ] + ], + [ + "any", + [ + 1141, + 1707, + 1217, + 1674, + 1696 + ] + ], + [ + "doubt", + [ + 1238, + 1699, + 1351, + 1665, + 1698 + ] + ], + [ + "of", + [ + 1371, + 1699, + 1407, + 1666, + 1699 + ] + ], + [ + "the", + [ + 1425, + 1700, + 1489, + 1668, + 1699 + ] + ], + [ + "result.", + [ + 1511, + 1700, + 1638, + 1668, + 1699 + ] + ] + ], + [ + [ + "But", + [ + 457, + 1756, + 526, + 1723, + 1755 + ] + ], + [ + "I", + [ + 546, + 1756, + 555, + 1723, + 1756 + ] + ], + [ + "want", + [ + 576, + 1756, + 678, + 1726, + 1756 + ] + ], + [ + "to", + [ + 696, + 1756, + 734, + 1727, + 1756 + ] + ], + [ + "be", + [ + 750, + 1755, + 795, + 1724, + 1755 + ] + ], + [ + "understood", + [ + 818, + 1754, + 1041, + 1721, + 1753 + ] + ], + [ + "at", + [ + 1062, + 1754, + 1098, + 1724, + 1753 + ] + ], + [ + "the", + [ + 1118, + 1754, + 1180, + 1722, + 1754 + ] + ], + [ + "outset.", + [ + 1208, + 1757, + 1343, + 1726, + 1755 + ] + ], + [ + "I", + [ + 1401, + 1757, + 1408, + 1725, + 1757 + ] + ], + [ + "do", + [ + 1438, + 1758, + 1482, + 1725, + 1757 + ] + ], + [ + "not", + [ + 1517, + 1759, + 1580, + 1728, 
+ 1758 + ] + ], + [ + "pre-", + [ + 1609, + 1769, + 1687, + 1735, + 1758 + ] + ] + ], + [ + [ + "tend", + [ + 362, + 1814, + 449, + 1781, + 1813 + ] + ], + [ + "that", + [ + 468, + 1814, + 549, + 1782, + 1813 + ] + ], + [ + "negroes", + [ + 570, + 1824, + 726, + 1790, + 1813 + ] + ], + [ + "are", + [ + 747, + 1812, + 808, + 1789, + 1812 + ] + ], + [ + "saints", + [ + 829, + 1812, + 946, + 1779, + 1811 + ] + ], + [ + "or", + [ + 966, + 1811, + 1003, + 1788, + 1810 + ] + ], + [ + "angels.", + [ + 1019, + 1821, + 1161, + 1779, + 1810 + ] + ], + [ + "I", + [ + 1219, + 1812, + 1227, + 1780, + 1812 + ] + ], + [ + "do", + [ + 1243, + 1813, + 1286, + 1781, + 1813 + ] + ], + [ + "not", + [ + 1305, + 1815, + 1367, + 1785, + 1814 + ] + ], + [ + "deny", + [ + 1381, + 1826, + 1481, + 1782, + 1815 + ] + ], + [ + "that", + [ + 1493, + 1817, + 1575, + 1784, + 1816 + ] + ], + [ + "they", + [ + 1593, + 1821, + 1687, + 1784, + 1816 + ] + ] + ], + [ + [ + "are", + [ + 361, + 1871, + 421, + 1849, + 1871 + ] + ], + [ + "capable", + [ + 449, + 1882, + 602, + 1839, + 1871 + ] + ], + [ + "of", + [ + 630, + 1871, + 666, + 1837, + 1871 + ] + ], + [ + "committing", + [ + 693, + 1878, + 922, + 1836, + 1869 + ] + ], + [ + "the", + [ + 949, + 1868, + 1012, + 1836, + 1868 + ] + ], + [ + "crime", + [ + 1050, + 1869, + 1160, + 1836, + 1868 + ] + ], + [ + "imputed", + [ + 1190, + 1881, + 1350, + 1837, + 1870 + ] + ], + [ + "to", + [ + 1379, + 1873, + 1416, + 1843, + 1873 + ] + ], + [ + "them,", + [ + 1443, + 1883, + 1559, + 1842, + 1874 + ] + ], + [ + "but", + [ + 1583, + 1875, + 1649, + 1842, + 1874 + ] + ], + [ + "I", + [ + 1678, + 1874, + 1686, + 1842, + 1874 + ] + ] + ], + [ + [ + "utterly", + [ + 363, + 1939, + 500, + 1897, + 1928 + ] + ], + [ + "deny", + [ + 520, + 1939, + 620, + 1896, + 1928 + ] + ], + [ + "that", + [ + 640, + 1929, + 722, + 1896, + 1928 + ] + ], + [ + "they", + [ + 740, + 1937, + 834, + 1896, + 1927 + ] + ], + [ + "are", + [ + 854, + 1926, + 915, + 1903, + 1925 + ] + ], + [ 
+ "any", + [ + 936, + 1935, + 1012, + 1902, + 1925 + ] + ], + [ + "more", + [ + 1041, + 1926, + 1140, + 1902, + 1925 + ] + ], + [ + "addicted", + [ + 1168, + 1929, + 1334, + 1895, + 1927 + ] + ], + [ + "to", + [ + 1363, + 1931, + 1399, + 1901, + 1930 + ] + ], + [ + "the", + [ + 1426, + 1932, + 1491, + 1900, + 1931 + ] + ], + [ + "commis-", + [ + 1518, + 1933, + 1687, + 1901, + 1932 + ] + ] + ], + [ + [ + "sion", + [ + 363, + 1987, + 439, + 1955, + 1986 + ] + ], + [ + "of", + [ + 461, + 1987, + 497, + 1953, + 1986 + ] + ], + [ + "that", + [ + 515, + 1986, + 596, + 1954, + 1986 + ] + ], + [ + "crime", + [ + 628, + 1985, + 739, + 1953, + 1985 + ] + ], + [ + "than", + [ + 770, + 1984, + 860, + 1952, + 1983 + ] + ], + [ + "is", + [ + 895, + 1982, + 923, + 1950, + 1982 + ] + ], + [ + "true", + [ + 954, + 1982, + 1034, + 1953, + 1982 + ] + ], + [ + "of", + [ + 1067, + 1983, + 1103, + 1950, + 1982 + ] + ], + [ + "any", + [ + 1134, + 1994, + 1209, + 1961, + 1983 + ] + ], + [ + "other", + [ + 1241, + 1987, + 1345, + 1954, + 1986 + ] + ], + [ + "variety", + [ + 1379, + 2000, + 1524, + 1956, + 1988 + ] + ], + [ + "of", + [ + 1556, + 1990, + 1593, + 1957, + 1990 + ] + ], + [ + "the", + [ + 1624, + 1991, + 1687, + 1960, + 1991 + ] + ] + ], + [ + [ + "human", + [ + 362, + 2044, + 499, + 2013, + 2043 + ] + ], + [ + "family.", + [ + 529, + 2054, + 670, + 2010, + 2043 + ] + ], + [ + "In", + [ + 737, + 2041, + 772, + 2009, + 2041 + ] + ], + [ + "entering", + [ + 806, + 2049, + 975, + 2007, + 2039 + ] + ], + [ + "upon", + [ + 1007, + 2049, + 1103, + 2016, + 2039 + ] + ], + [ + "my", + [ + 1139, + 2051, + 1202, + 2018, + 2040 + ] + ], + [ + "argument,", + [ + 1238, + 2055, + 1445, + 2019, + 2044 + ] + ], + [ + "I", + [ + 1471, + 2046, + 1478, + 2014, + 2046 + ] + ], + [ + "may", + [ + 1517, + 2058, + 1603, + 2024, + 2048 + ] + ], + [ + "be", + [ + 1643, + 2049, + 1687, + 2017, + 2049 + ] + ] + ], + [ + [ + "allowed", + [ + 362, + 2101, + 516, + 2068, + 2100 + ] + ], + [ + "to", + [ + 
536, + 2101, + 572, + 2071, + 2101 + ] + ], + [ + "say,", + [ + 592, + 2111, + 674, + 2077, + 2100 + ] + ], + [ + "that", + [ + 702, + 2100, + 783, + 2067, + 2098 + ] + ], + [ + "1", + [ + 805, + 2097, + 811, + 2065, + 2098 + ] + ], + [ + "appear", + [ + 842, + 2107, + 976, + 2073, + 2096 + ] + ], + [ + "here", + [ + 997, + 2096, + 1083, + 2064, + 2095 + ] + ], + [ + "this", + [ + 1103, + 2098, + 1175, + 2065, + 2097 + ] + ], + [ + "evening", + [ + 1203, + 2112, + 1363, + 2068, + 2099 + ] + ], + [ + "not", + [ + 1384, + 2104, + 1446, + 2074, + 2103 + ] + ], + [ + "as", + [ + 1466, + 2104, + 1506, + 2082, + 2104 + ] + ], + [ + "the", + [ + 1538, + 2106, + 1601, + 2074, + 2106 + ] + ], + [ + "de-", + [ + 1627, + 2107, + 1687, + 2074, + 2106 + ] + ] + ], + [ + [ + "fender", + [ + 363, + 2158, + 491, + 2126, + 2157 + ] + ], + [ + "of", + [ + 511, + 2158, + 546, + 2124, + 2157 + ] + ], + [ + "any", + [ + 565, + 2168, + 640, + 2135, + 2157 + ] + ], + [ + "man", + [ + 661, + 2157, + 746, + 2133, + 2156 + ] + ], + [ + "guilty", + [ + 765, + 2166, + 882, + 2122, + 2154 + ] + ], + [ + "of", + [ + 903, + 2154, + 939, + 2120, + 2153 + ] + ], + [ + "this", + [ + 957, + 2153, + 1030, + 2120, + 2152 + ] + ], + [ + "atrocious", + [ + 1050, + 2156, + 1230, + 2122, + 2154 + ] + ], + [ + "crime,", + [ + 1250, + 2168, + 1372, + 2125, + 2157 + ] + ], + [ + "but", + [ + 1396, + 2161, + 1461, + 2128, + 2160 + ] + ], + [ + "as", + [ + 1481, + 2162, + 1522, + 2139, + 2161 + ] + ], + [ + "the", + [ + 1540, + 2164, + 1604, + 2132, + 2163 + ] + ], + [ + "de-", + [ + 1626, + 2165, + 1686, + 2132, + 2164 + ] + ] + ], + [ + [ + "fender", + [ + 364, + 2215, + 492, + 2183, + 2215 + ] + ], + [ + "of", + [ + 512, + 2215, + 548, + 2181, + 2214 + ] + ], + [ + "the", + [ + 567, + 2214, + 629, + 2182, + 2214 + ] + ], + [ + "colored", + [ + 649, + 2214, + 793, + 2180, + 2213 + ] + ], + [ + "people", + [ + 814, + 2223, + 942, + 2178, + 2210 + ] + ], + [ + "as", + [ + 961, + 2210, + 1002, + 2187, + 2209 + 
] + ], + [ + "a", + [ + 1021, + 2209, + 1041, + 2187, + 2209 + ] + ], + [ + "class.", + [ + 1061, + 2212, + 1168, + 2178, + 2210 + ] + ] + ], + [ + [ + "In", + [ + 463, + 2271, + 497, + 2239, + 2271 + ] + ], + [ + "answer", + [ + 514, + 2271, + 660, + 2248, + 2271 + ] + ], + [ + "to", + [ + 680, + 2271, + 716, + 2241, + 2271 + ] + ], + [ + "the", + [ + 735, + 2270, + 798, + 2238, + 2269 + ] + ], + [ + "terrible", + [ + 818, + 2269, + 965, + 2235, + 2267 + ] + ], + [ + "indictment,", + [ + 986, + 2277, + 1213, + 2234, + 2266 + ] + ], + [ + "thus", + [ + 1236, + 2272, + 1323, + 2239, + 2271 + ] + ], + [ + "read,", + [ + 1343, + 2283, + 1440, + 2242, + 2273 + ] + ], + [ + "and", + [ + 1463, + 2276, + 1533, + 2244, + 2275 + ] + ], + [ + "speak-", + [ + 1555, + 2288, + 1685, + 2249, + 2278 + ] + ] + ], + [ + [ + "ing", + [ + 375, + 2335, + 435, + 2298, + 2329 + ] + ], + [ + "for", + [ + 454, + 2329, + 507, + 2295, + 2328 + ] + ], + [ + "the", + [ + 527, + 2329, + 590, + 2296, + 2328 + ] + ], + [ + "colored", + [ + 610, + 2329, + 752, + 2295, + 2327 + ] + ], + [ + "people", + [ + 769, + 2337, + 900, + 2292, + 2325 + ] + ], + [ + "as", + [ + 920, + 2324, + 960, + 2301, + 2324 + ] + ], + [ + "a", + [ + 975, + 2323, + 995, + 2301, + 2323 + ] + ], + [ + "class,", + [ + 1016, + 2331, + 1123, + 2290, + 2322 + ] + ], + [ + "1,", + [ + 1144, + 2333, + 1166, + 2292, + 2327 + ] + ], + [ + "in", + [ + 1185, + 2325, + 1217, + 2293, + 2324 + ] + ], + [ + "their", + [ + 1247, + 2327, + 1339, + 2296, + 2326 + ] + ], + [ + "stead,", + [ + 1367, + 2339, + 1484, + 2299, + 2329 + ] + ], + [ + "here", + [ + 1508, + 2334, + 1594, + 2301, + 2333 + ] + ], + [ + "an", + [ + 1614, + 2335, + 1656, + 2314, + 2335 + ] + ], + [ + "J", + [ + 1658, + 2336, + 1683, + 2305, + 2336 + ] + ] + ], + [ + [ + "now", + [ + 380, + 2385, + 460, + 2364, + 2384 + ] + ], + [ + "plead", + [ + 481, + 2396, + 586, + 2353, + 2385 + ] + ], + [ + "not", + [ + 607, + 2385, + 669, + 2355, + 2385 + ] + ], + [ + "guilty", + 
[ + 688, + 2394, + 805, + 2351, + 2383 + ] + ], + [ + "and", + [ + 821, + 2382, + 891, + 2349, + 2381 + ] + ], + [ + "shall", + [ + 912, + 2381, + 1003, + 2348, + 2380 + ] + ], + [ + "submit", + [ + 1017, + 2380, + 1156, + 2347, + 2379 + ] + ], + [ + "my", + [ + 1178, + 2391, + 1239, + 2359, + 2380 + ] + ], + [ + "case", + [ + 1259, + 2383, + 1343, + 2360, + 2382 + ] + ], + [ + "with", + [ + 1364, + 2386, + 1451, + 2353, + 2384 + ] + ], + [ + "confidence", + [ + 1473, + 2392, + 1688, + 2357, + 2389 + ] + ] + ], + [ + [ + "of", + [ + 367, + 2443, + 401, + 2410, + 2442 + ] + ], + [ + "acquittal", + [ + 430, + 2452, + 604, + 2409, + 2441 + ] + ], + [ + "by", + [ + 624, + 2452, + 674, + 2410, + 2442 + ] + ], + [ + "good", + [ + 696, + 2451, + 786, + 2407, + 2439 + ] + ], + [ + "men", + [ + 807, + 2439, + 890, + 2415, + 2438 + ] + ], + [ + "and", + [ + 912, + 2438, + 982, + 2404, + 2437 + ] + ], + [ + "women", + [ + 1003, + 2436, + 1148, + 2413, + 2435 + ] + ], + [ + "North", + [ + 1170, + 2437, + 1283, + 2404, + 2436 + ] + ], + [ + "and", + [ + 1305, + 2439, + 1374, + 2408, + 2438 + ] + ], + [ + "South.", + [ + 1396, + 2443, + 1523, + 2407, + 2441 + ] + ] + ], + [ + [ + "It", + [ + 466, + 2499, + 492, + 2467, + 2498 + ] + ], + [ + "is", + [ + 511, + 2499, + 539, + 2467, + 2499 + ] + ], + [ + "the", + [ + 558, + 2499, + 621, + 2467, + 2498 + ] + ], + [ + "misfortune", + [ + 642, + 2498, + 859, + 2464, + 2496 + ] + ], + [ + "of", + [ + 879, + 2495, + 915, + 2461, + 2494 + ] + ], + [ + "the", + [ + 933, + 2495, + 996, + 2462, + 2494 + ] + ], + [ + "colored", + [ + 1024, + 2493, + 1167, + 2460, + 2491 + ] + ], + [ + "people", + [ + 1195, + 2503, + 1324, + 2462, + 2492 + ] + ], + [ + "in", + [ + 1353, + 2495, + 1386, + 2463, + 2494 + ] + ], + [ + "tivs", + [ + 1424, + 2497, + 1496, + 2466, + 2496 + ] + ], + [ + "country", + [ + 1525, + 2510, + 1683, + 2472, + 2499 + ] + ] + ], + [ + [ + "that", + [ + 372, + 2556, + 451, + 2526, + 2555 + ] + ], + [ + "the", + [ + 471, + 
2556, + 534, + 2524, + 2555 + ] + ], + [ + "sins", + [ + 556, + 2555, + 631, + 2523, + 2555 + ] + ], + [ + "of", + [ + 650, + 2555, + 686, + 2521, + 2554 + ] + ], + [ + "the", + [ + 704, + 2554, + 768, + 2522, + 2553 + ] + ], + [ + "few", + [ + 788, + 2552, + 860, + 2520, + 2551 + ] + ], + [ + "are", + [ + 880, + 2551, + 941, + 2528, + 2551 + ] + ], + [ + "visited", + [ + 961, + 2550, + 1091, + 2516, + 2549 + ] + ], + [ + "upon", + [ + 1112, + 2559, + 1207, + 2525, + 2548 + ] + ], + [ + "the", + [ + 1229, + 2549, + 1291, + 2517, + 2549 + ] + ], + [ + "many,", + [ + 1322, + 2561, + 1448, + 2527, + 2549 + ] + ], + [ + "and", + [ + 1481, + 2553, + 1551, + 2522, + 2552 + ] + ], + [ + "I", + [ + 1585, + 2554, + 1593, + 2523, + 2554 + ] + ], + [ + "am", + [ + 1628, + 2555, + 1683, + 2534, + 2554 + ] + ] + ], + [ + [ + "here", + [ + 371, + 2612, + 457, + 2583, + 2612 + ] + ], + [ + "to", + [ + 476, + 2613, + 512, + 2583, + 2612 + ] + ], + [ + "speak", + [ + 534, + 2623, + 646, + 2578, + 2611 + ] + ], + [ + "for", + [ + 667, + 2611, + 720, + 2578, + 2610 + ] + ], + [ + "the", + [ + 739, + 2610, + 802, + 2578, + 2609 + ] + ], + [ + "many", + [ + 823, + 2618, + 935, + 2585, + 2608 + ] + ], + [ + "whose", + [ + 957, + 2606, + 1083, + 2575, + 2606 + ] + ], + [ + "reputation", + [ + 1104, + 2616, + 1309, + 2573, + 2604 + ] + ], + [ + "is", + [ + 1339, + 2606, + 1367, + 2573, + 2605 + ] + ], + [ + "put", + [ + 1383, + 2616, + 1447, + 2581, + 2606 + ] + ], + [ + "in", + [ + 1476, + 2608, + 1507, + 2576, + 2607 + ] + ], + [ + "peril", + [ + 1530, + 2618, + 1613, + 2579, + 2609 + ] + ], + [ + "by", + [ + 1638, + 2614, + 1682, + 2583, + 2611 + ] + ] + ] + ] +} \ No newline at end of file diff --git a/test_app/fixtures/sample.tsv b/test_app/fixtures/sample.tsv new file mode 100644 index 0000000..d990c86 --- /dev/null +++ b/test_app/fixtures/sample.tsv @@ -0,0 +1,6 @@ +content x y w h +Jordan 459 391 89 43 +453 397 397 3 + 1 2 3 4 + hello 10 20 30 40 + welp 11 21 31 41 diff --git 
a/test_app/fixtures/tei.xml b/test_app/fixtures/tei.xml new file mode 100644 index 0000000..13e26bf --- /dev/null +++ b/test_app/fixtures/tei.xml @@ -0,0 +1,132 @@ + + + + + Af-beeldinghe van d'eerste eeuwe der Societeyt Iesu voor ooghen ghestelt door de Duyts-Nederlantsche provincie der seluer societeyt., p. 10 + + + Emory University Library and Information Technology Services + + +

Abbyy file derived from OCR of Bolland, Johannes, 1596-1665, Henschenius, Godefridus, 1601-1681, Tollenaere, Jean de, 1582-1643, Poirters, Adrien, 1605-1674, Galle, Cornelis, 1576-1650,, Natalis, Michel, 1610-1668,, Diepenbeeck, Abraham van, 1596-1675,, Plantijnsche Drukkerij. Af-beeldinghe van d'eerste eeuwe der Societeyt Iesu voor ooghen ghestelt door de Duyts-Nederlantsche provincie der seluer societeyt., ['1640'].

+
+
+
+ + + + + + mm + + + + + AEN DEN LESIIU + + + tnaken, om te fchijnen bouen alle andere te kracycn, en die te mer- + + + drucken ? + + + Of ty dit ergbens in'tbeleydt man dit heel ftuck^, met de minfee + + + merfmacdehjckheyt man eenighe andere Orden oft Religte, ghedaen + + + hebben, datftellen ley ten oordeele manden onpartijdighen Lepr; + + + den Tvelcken bier minden fal d'af-beeldingbe mande eerfie eeulve on- + + + fer S octet eyt, die "toy met on fen H. Vader gbeerne kennen de laetfle + + + en de minfee te %jjn, onderfoo mele oude ende treffelijcke Or dens Van + + + S.Augufinus, Beneditlus, Bernardus, Norbertus ,Domimcwi, Fran- + + + cifciis, ende meer andere , die met mcerdere mrucht en glorie inde + + + H.Kercke merkeert hebben. 'tis defen gheoorloft ghelveeil 'tvocdt + + + gberucbt,dathen naeghingb,en noch heden-fdaeghs molght, als eenen + + + toet-feen man bun innerhjcl^ yvefen, aende "Svereldt, nu mondeltjck^ + + + inde predtkatien, nu fchrtftelijck_ inde gbedruckfe boecken, moor oo- + + + ghen te ftellen, om daer aen het goudt manden ijuer en liefde te keu- + + + ren, met de loelckefy de glorie Godts en des naefeenfaligbeyt, neffens + + + hunne eygbene molmaecktheyt ghetracht hebben te moorderen. Soa + + + *n magb het ons dan oock_noch tot blaeme noch tot phande ghedijen, + + + dat Ivy onfe meeder de Socteteyt, die ons iuffchen feo meel drucks + + + β– en lijdens, foo mele opmallen ende ouerlafeen , feo mele merVolghin- + + + gben en martehenfihterals eene nae-vrucht op'teynde der Tvereldt, + + + aen de H.Kercke ghebaert heeft, met eene lof-rijeke danckbaerheyt + + + oppellen: te mm0ds "dry d'eere ende de glorie man alle haere daden aen + + + Godt den Hecre a/leen, en met aen oris feluen, toe en eyghenen. + + + Daerotn pet ghy de Socteteyt m't moor- bladt man dit Boeck. 
in + + + pnnte gbeflelt met d'ooghen opTvaerts ten bemel gbeflaghen, tvaerfy + + + met een' oprechte meymngbe Tvederom benen phickt, al datfe man + + + daer ontfangben heeft, als ofse op alles loaer medefy biergheprefen en + + + *verciert "ioordt, met een' ingbekeertbeyt en Tveer-flagh des herten, + + + (lommelingb andnvoordde, datfe allefftns moor heeft, Tot meerdere + + + eere ende glorie Godts. Inde rechte handt houdtfe onfe Conftitu- + + + tien ende Regbelen; indeflmcke op eenen dry-meet bet kruya met de + + + bernende ~totrcldt} in de Tpelcke den mierighen ijuer Van S.Ignatius, + + + Xaue- + + + + +
\ No newline at end of file diff --git a/test_app/iiif/migrations/0002_ocr.py b/test_app/iiif/migrations/0002_ocr.py new file mode 100644 index 0000000..eac9ab3 --- /dev/null +++ b/test_app/iiif/migrations/0002_ocr.py @@ -0,0 +1,21 @@ +# Generated by Django 3.2.23 on 2024-01-24 20:43 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('iiif', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='OCR', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('canvas', models.ForeignKey(on_delete=django.db.models.deletion.DO_NOTHING, to='iiif.canvas')), + ], + ), + ] diff --git a/test_app/iiif/migrations/0003_auto_20240125_1328.py b/test_app/iiif/migrations/0003_auto_20240125_1328.py new file mode 100644 index 0000000..6590b7e --- /dev/null +++ b/test_app/iiif/migrations/0003_auto_20240125_1328.py @@ -0,0 +1,48 @@ +# Generated by Django 3.2.23 on 2024-01-25 13:28 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('iiif', '0002_ocr'), + ] + + operations = [ + migrations.AddField( + model_name='ocr', + name='content', + field=models.TextField(blank=True, default=' ', null=True), + ), + migrations.AddField( + model_name='ocr', + name='h', + field=models.IntegerField(default=0), + ), + migrations.AddField( + model_name='ocr', + name='order', + field=models.IntegerField(default=0), + ), + migrations.AddField( + model_name='ocr', + name='resource_type', + field=models.CharField(choices=[('cnt:ContentAsText', 'ocr'), ('dctypes:Text', 'text')], default='dctypes:Text', max_length=50), + ), + migrations.AddField( + model_name='ocr', + name='w', + field=models.IntegerField(default=0), + ), + migrations.AddField( + model_name='ocr', + name='x', + field=models.IntegerField(default=0), + ), + migrations.AddField( + model_name='ocr', + name='y', + 
field=models.IntegerField(default=0), + ), + ] diff --git a/test_app/iiif/migrations/0004_canvas_default_ocr.py b/test_app/iiif/migrations/0004_canvas_default_ocr.py new file mode 100644 index 0000000..732e760 --- /dev/null +++ b/test_app/iiif/migrations/0004_canvas_default_ocr.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.23 on 2024-01-25 14:29 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('iiif', '0003_auto_20240125_1328'), + ] + + operations = [ + migrations.AddField( + model_name='canvas', + name='default_ocr', + field=models.CharField(choices=[('word', 'word'), ('line', 'line'), ('both', 'both')], default='word', max_length=30), + ), + ] diff --git a/test_app/iiif/models.py b/test_app/iiif/models.py index 9f620fd..bf7eef4 100644 --- a/test_app/iiif/models.py +++ b/test_app/iiif/models.py @@ -21,6 +21,31 @@ class Canvas(models.Model): height = models.IntegerField(default=0) ocr_file_path = models.CharField(max_length=500, null=True, blank=True) manifest = models.ForeignKey(Manifest, on_delete=models.DO_NOTHING) + preferred_ocr = ( + ('word', 'word'), + ('line', 'line'), + ('both', 'both') + ) + # TODO: move this to the manifest level. 
+ default_ocr = models.CharField(max_length=30, choices=preferred_ocr, default="word") + +class OCR(models.Model): + OCR = 'cnt:ContentAsText' + TEXT = 'dctypes:Text' + TYPE_CHOICES = ( + (OCR, 'ocr'), + (TEXT, 'text') + ) + + canvas = models.ForeignKey(Canvas, on_delete=models.DO_NOTHING) + x = models.IntegerField(default=0) + y = models.IntegerField(default=0) + w = models.IntegerField(default=0) + h = models.IntegerField(default=0) + order = models.IntegerField(default=0) + content = models.TextField(blank=True, null=True, default=' ') + resource_type = models.CharField(max_length=50, choices=TYPE_CHOICES, default=TEXT) + class RelatedLink(models.Model): """ Links to related resources """ diff --git a/test_app/requirements.txt b/test_app/requirements.txt index 8a27504..9fac3ec 100644 --- a/test_app/requirements.txt +++ b/test_app/requirements.txt @@ -1,10 +1,13 @@ Django>=3.2.0,<4.0 # https://www.djangoproject.com/ ../ # readux_ingest_ecds -django-storages==1.14 # https://github.com/jschneier/django-storages -django-environ==0.11.2 # https://github.com/joke2k/django-environ +django-storages>=1.14 # https://github.com/jschneier/django-storages +django-environ>=0.11.2 # https://github.com/joke2k/django-environ pytest>=7.4.3 # https://github.com/pytest-dev/pytest -pytest-sugar==0.9.7 # https://github.com/Frozenball/pytest-sugar -pytest-django==4.5.2 # https://github.com/pytest-dev/pytest-django -factory-boy~=3.2.1 # https://github.com/FactoryBoy/factory_boy -faker~=20.1.0 +pytest-sugar>=0.9.7 # https://github.com/Frozenball/pytest-sugar +pytest-django>=4.5.2 # https://github.com/pytest-dev/pytest-django +factory-boy>=3.2.1 # https://github.com/FactoryBoy/factory_boy +faker>=20.1.0 moto==4.2.0 # https://github.com/spulec/moto +requests-oauthlib>=1.3.1,<2.0 +httpretty>=1.1.4,<2.0 +hocr-spec~=0.2.0 \ No newline at end of file diff --git a/test_app/test_app/settings.py b/test_app/test_app/settings.py index 55598a2..24a81bc 100644 --- a/test_app/test_app/settings.py +++ 
b/test_app/test_app/settings.py @@ -26,11 +26,16 @@ IIIF_RELATED_LINK_MODEL = 'iiif.RelatedLink' IIIF_CANVAS_MODEL = 'iiif.Canvas' IIIF_COLLECTION_MODEL = 'iiif.Collection' +IIIF_OCR_MODEL = 'iiif.OCR' INGEST_TMP_DIR = os.path.join('tmp') INGEST_PROCESSING_DIR = os.path.join('tmp', 'processing') INGEST_OCR_DIR = os.path.join('tmp', 'ocr') INGEST_TRIGGER_BUCKET = 'readux-ingest-ecds-test' +# Readux settings +DATASTREAM_PREFIX = 'http://repo.library.emory.edu/fedora/objects/' +DATASTREAM_SUFFIX = '/datastreams/position/content' + # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/3.2/howto/deployment/checklist/ diff --git a/test_app/tests/factories.py b/test_app/tests/factories.py index 0c1be18..3fb63b7 100644 --- a/test_app/tests/factories.py +++ b/test_app/tests/factories.py @@ -38,7 +38,7 @@ class LocalFactory(DjangoModelFactory): class Meta: model = Local - bundle = FileField(filename='bundle.zip', filepath=join(settings.FIXTURE_DIR, 'bundle.zip')) + bundle = FileField(from_path=join(settings.FIXTURE_DIR, 'bundle.zip')) image_server = SubFactory(ImageServerFactory) manifest = None diff --git a/test_app/tests/test_admin.py b/test_app/tests/test_admin.py index 30b1702..dc5ba8c 100644 --- a/test_app/tests/test_admin.py +++ b/test_app/tests/test_admin.py @@ -9,21 +9,14 @@ # from django.urls.base import reverse # from django_celery_results.models import TaskResult from moto import mock_s3 -from iiif.models import Manifest, Canvas, Collection +from iiif.models import Manifest, Canvas, Collection, OCR from .factories import ImageServerFactory, UserFactory, LocalFactory, ManifestFactory, CollectionFactory from readux_ingest_ecds.models import Local from readux_ingest_ecds.admin import LocalAdmin @mock_s3 class IngestAdminTest(TestCase): - # @classmethod - # def setUpClass(cls): - # cls.sftp_server = MockSFTP() - - # @classmethod - # def tearDownClass(cls): - # cls.sftp_server.stop_server() - + """ Tests Ingest Admin """ 
def setUp(self): """ Set instance variables. """ self.fixture_path = settings.FIXTURE_DIR @@ -42,6 +35,7 @@ def test_local_admin_save(self): original_manifest_count = Manifest.objects.count() original_canvas_count = Canvas.objects.count() + original_ocr_count = OCR.objects.count() request_factory = RequestFactory() @@ -68,6 +62,7 @@ def test_local_admin_save(self): # in the ingest assert Manifest.objects.count() == original_manifest_count + 1 assert Canvas.objects.count() == original_canvas_count + 10 + assert OCR.objects.count() == original_ocr_count + 4630 def test_local_admin_response_add(self): """It should redirect to new manifest""" diff --git a/test_app/tests/test_local.py b/test_app/tests/test_local.py index 13c2ef2..5f96408 100644 --- a/test_app/tests/test_local.py +++ b/test_app/tests/test_local.py @@ -11,8 +11,8 @@ from django.conf import settings from .factories import ImageServerFactory from readux_ingest_ecds.models import Local -from readux_ingest_ecds.services import create_manifest -from iiif.models import Canvas +from readux_ingest_ecds.services.iiif_services import create_manifest +from iiif.models import Canvas, OCR pytestmark = pytest.mark.django_db(transaction=True) # pylint: disable = invalid-name @@ -92,7 +92,7 @@ def test_creating_manifest(self): def test_metadata_from_excel(self): """ It should create a manifest with metadata supplied in an Excel file. """ local = self.mock_local('bundle.zip') - local.process() + local.prep() assert 'pid' in local.metadata.keys() @@ -102,7 +102,7 @@ def test_metadata_from_excel(self): def test_metadata_from_csv(self): """ It should create a manifest with metadata supplied in a CSV file. """ local = self.mock_local('csv_meta.zip', with_manifest=True) - local.process() + local.prep() assert 'pid' in local.metadata.keys() @@ -122,14 +122,14 @@ def test_metadata_from_tsv(self): def test_no_metadata_file(self): """ It should create a Manifest even when no metadata file is supplied. 
""" local = self.mock_local('no_meta_file.zip', with_manifest=True) - local.process() + local.prep() # New manifest should have a default pid - UUID in test app. assert UUID(local.manifest.pid, version=4) def test_unzip_bundle(self): local = self.mock_local('csv_meta.zip') - local.process() + local.prep() local.refresh_from_db() local.unzip_bundle() @@ -138,7 +138,7 @@ def test_unzip_bundle(self): def test_create_canvases(self): local = self.mock_local('csv_meta.zip') - local.process() + local.prep() local.refresh_from_db() local.unzip_bundle() local.create_canvases() @@ -150,7 +150,7 @@ def test_ignoring_junk(self): Any hidden files should not be uploaded. """ local = self.mock_local('bundle_with_junk.zip') - local.process() + local.prep() local.unzip_bundle() with ZipFile(os.path.join(self.fixture_path, 'bundle_with_junk.zip'), 'r') as zip_ref: @@ -180,7 +180,7 @@ def test_creating_canvases(self): every path. """ local = self.mock_local('bundle.zip', with_manifest=True) - local.process() + local.prep() local.unzip_bundle() local.create_canvases() @@ -207,6 +207,6 @@ def test_it_creates_manifest_with_metadata_property(self): } local = self.mock_local('no_meta_file.zip', metadata=metadata) local.manifest = create_manifest(local) - local.process() + local.prep() assert local.manifest.pid == '808' assert local.manifest.title == 'Goodie Mob'