Skip to content

Commit

Permalink
Fix setting the OCR file path
Browse files Browse the repository at this point in the history
  • Loading branch information
jayvarner committed Jan 25, 2024
1 parent 37f3823 commit 3c0d533
Show file tree
Hide file tree
Showing 8 changed files with 43 additions and 7 deletions.
7 changes: 4 additions & 3 deletions readux_ingest_ecds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,17 +166,18 @@ def create_canvases(self):
Canvas = get_iiif_models()['Canvas']
images = None
with open(self.trigger_file, 'r') as t_file:
images =t_file.read().splitlines()
images = t_file.read().splitlines()
images.sort()

for index, image in enumerate(images):
position = index + 1
image_name = os.path.splitext(image)[0]
canvas_pid = f'{image_name}.tiff'
width, height = canvas_dimensions(image_name)
ocr_directory = os.path.join(settings.INGEST_OCR_DIR, self.manifest.pid)
try:
ocr_file = [ocr for ocr in os.listdir(settings.INGEST_OCR_DIR) if image_name in ocr][0]
ocr_file_path = os.path.abspath(os.path.join(settings.INGEST_OCR_DIR, ocr_file))
ocr_file = [ocr for ocr in os.listdir(ocr_directory) if image_name in ocr][0]
ocr_file_path = os.path.abspath(os.path.join(ocr_directory, ocr_file))
except IndexError:
ocr_file_path = None

Expand Down
5 changes: 4 additions & 1 deletion readux_ingest_ecds/services/ocr_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def get_ocr(canvas):
return parse_tei_ocr(result)

result = fetch_positional_ocr(canvas)

return add_positional_ocr(canvas, result)

def fetch_tei_ocr(canvas):
Expand Down Expand Up @@ -126,7 +127,9 @@ def fetch_positional_ocr(canvas):
if canvas.image_server.storage_service == 's3':
return canvas.image_server.bucket.Object(canvas.ocr_file_path).get()['Body'].read()

return fetch_url(url, data_format='text/plain')
if canvas.image_server.storage_service == 'local':
with open(canvas.ocr_file_path, 'r') as ocr:
return ocr.read()

def parse_alto_ocr(result):
"""Function to parse fetched ALTO OCR data for a given canvas.
Expand Down
1 change: 0 additions & 1 deletion readux_ingest_ecds/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def add_ocr_task(manifest_id, *args, **kwargs):
manifest = Manifest.objects.get(pk=manifest_id)
for canvas in manifest.canvas_set.all():
ocr = get_ocr(canvas)

if ocr is not None:
add_ocr_annotations(canvas, ocr)
canvas.save() # trigger reindex
18 changes: 18 additions & 0 deletions test_app/iiif/migrations/0005_imageserver_storage_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 3.2.23 on 2024-01-25 16:14

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the ``storage_service`` field to the ``imageserver`` model."""

    # Applied after the previous migration of the ``iiif`` app.
    dependencies = [
        ('iiif', '0004_canvas_default_ocr'),
    ]

    operations = [
        # New character field (at most 25 characters) whose default value
        # is 'local'.
        migrations.AddField(
            model_name='imageserver',
            name='storage_service',
            field=models.CharField(default='local', max_length=25),
        ),
    ]
5 changes: 5 additions & 0 deletions test_app/iiif/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class Collection(models.Model):

class ImageServer(models.Model):
    """Test-app model describing an image server."""

    # Base value for the server, stored as text (up to 255 characters).
    server_base = models.CharField(max_length=255)
    # Name of the storage backend; defaults to 'local'. The OCR service code
    # changed in this commit branches on the values 's3' and 'local'.
    storage_service = models.CharField(max_length=25, default='local')

class Manifest(models.Model):
pid = models.CharField(max_length=255, primary_key=True, default=uuid4, editable=True)
Expand All @@ -29,6 +30,10 @@ class Canvas(models.Model):
# TODO: move this to the manifest level.
default_ocr = models.CharField(max_length=30, choices=preferred_ocr, default="word")

@property
def image_server(self):
    """Return the image server of the manifest this canvas belongs to."""
    # Pure delegation: the value is read from the related manifest.
    return self.manifest.image_server

class OCR(models.Model):
OCR = 'cnt:ContentAsText'
TEXT = 'dctypes:Text'
Expand Down
2 changes: 1 addition & 1 deletion test_app/tests/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from iiif.models import ImageServer, Manifest, User, Collection

class ImageServerFactory(DjangoModelFactory):
server_base = 'http://images.ecds.emory.edu'
server_base = 'http://iiif.ecds.emory.edu'

class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring
model = ImageServer
Expand Down
2 changes: 1 addition & 1 deletion test_app/tests/test_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def test_local_admin_save(self):
# in the ingest
assert Manifest.objects.count() == original_manifest_count + 1
assert Canvas.objects.count() == original_canvas_count + 10
assert OCR.objects.count() == original_ocr_count + 4630
assert OCR.objects.count() == original_ocr_count + 1073

def test_local_admin_response_add(self):
"""It should redirect to new manifest"""
Expand Down
10 changes: 10 additions & 0 deletions test_app/tests/test_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,16 @@ def test_creating_canvases(self):
assert Canvas.objects.get(pid=f'{pid}_00000010.tiff').width == 32
assert Canvas.objects.get(pid=f'{pid}_00000010.tiff').height == 43

ocr_path = os.path.abspath(
os.path.join(
settings.INGEST_OCR_DIR,
pid,
f'{pid}_00000008.tsv'
)
)

assert Canvas.objects.get(pid=f'{pid}_00000008.tiff').ocr_file_path == ocr_path

def test_it_creates_manifest_with_metadata_property(self):
metadata = {
'pid': '808',
Expand Down

0 comments on commit 3c0d533

Please sign in to comment.