Fix setting the OCR file path

ecds · Jan 25, 2024 · 3c0d533
1 parent 37f3823
commit 3c0d533
Show file tree

Hide file tree

Showing 8 changed files with 43 additions and 7 deletions.
diff --git a/readux_ingest_ecds/models.py b/readux_ingest_ecds/models.py
@@ -166,17 +166,18 @@ def create_canvases(self):
         Canvas = get_iiif_models()['Canvas']
         images = None
         with open(self.trigger_file, 'r') as t_file:
-            images =t_file.read().splitlines()
+            images = t_file.read().splitlines()
         images.sort()
 
         for index, image in enumerate(images):
             position = index + 1
             image_name = os.path.splitext(image)[0]
             canvas_pid = f'{image_name}.tiff'
             width, height = canvas_dimensions(image_name)
+            ocr_directory = os.path.join(settings.INGEST_OCR_DIR, self.manifest.pid)
             try:
-                ocr_file = [ocr for ocr in os.listdir(settings.INGEST_OCR_DIR) if image_name in ocr][0]
-                ocr_file_path = os.path.abspath(os.path.join(settings.INGEST_OCR_DIR, ocr_file))
+                ocr_file = [ocr for ocr in os.listdir(ocr_directory) if image_name in ocr][0]
+                ocr_file_path = os.path.abspath(os.path.join(ocr_directory, ocr_file))
             except IndexError:
                 ocr_file_path = None
 

diff --git a/readux_ingest_ecds/services/ocr_services.py b/readux_ingest_ecds/services/ocr_services.py
@@ -40,6 +40,7 @@ def get_ocr(canvas):
         return parse_tei_ocr(result)
 
     result = fetch_positional_ocr(canvas)
+
     return add_positional_ocr(canvas, result)
 
 def fetch_tei_ocr(canvas):
@@ -126,7 +127,9 @@ def fetch_positional_ocr(canvas):
         if canvas.image_server.storage_service == 's3':
             return canvas.image_server.bucket.Object(canvas.ocr_file_path).get()['Body'].read()
 
-    return fetch_url(url, data_format='text/plain')
+        if canvas.image_server.storage_service == 'local':
+            with open(canvas.ocr_file_path, 'r') as ocr:
+                return ocr.read()
 
 def parse_alto_ocr(result):
     """Function to parse fetched ALTO OCR data for a given canvas.

diff --git a/readux_ingest_ecds/tasks.py b/readux_ingest_ecds/tasks.py
@@ -41,7 +41,6 @@ def add_ocr_task(manifest_id, *args, **kwargs):
     manifest = Manifest.objects.get(pk=manifest_id)
     for canvas in manifest.canvas_set.all():
         ocr = get_ocr(canvas)
-
         if ocr is not None:
             add_ocr_annotations(canvas, ocr)
             canvas.save()  # trigger reindex
diff --git a/test_app/iiif/migrations/0005_imageserver_storage_service.py b/test_app/iiif/migrations/0005_imageserver_storage_service.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2.23 on 2024-01-25 16:14
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('iiif', '0004_canvas_default_ocr'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='imageserver',
+            name='storage_service',
+            field=models.CharField(default='local', max_length=25),
+        ),
+    ]
diff --git a/test_app/iiif/models.py b/test_app/iiif/models.py
@@ -7,6 +7,7 @@ class Collection(models.Model):
 
 class ImageServer(models.Model):
     server_base = models.CharField(max_length=255)
+    storage_service = models.CharField(max_length=25, default='local')
 
 class Manifest(models.Model):
     pid = models.CharField(max_length=255, primary_key=True, default=uuid4, editable=True)
@@ -29,6 +30,10 @@ class Canvas(models.Model):
     # TODO: move this to the manifest level.
     default_ocr = models.CharField(max_length=30, choices=preferred_ocr, default="word")
 
+    @property
+    def image_server(self):
+        return self.manifest.image_server
+
 class OCR(models.Model):
     OCR = 'cnt:ContentAsText'
     TEXT = 'dctypes:Text'

diff --git a/test_app/tests/factories.py b/test_app/tests/factories.py
@@ -7,7 +7,7 @@
 from iiif.models import ImageServer, Manifest, User, Collection
 
 class ImageServerFactory(DjangoModelFactory):
-    server_base = 'http://images.ecds.emory.edu'
+    server_base = 'http://iiif.ecds.emory.edu'
 
     class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring
         model = ImageServer

diff --git a/test_app/tests/test_admin.py b/test_app/tests/test_admin.py
@@ -62,7 +62,7 @@ def test_local_admin_save(self):
         # in the ingest
         assert Manifest.objects.count() == original_manifest_count + 1
         assert Canvas.objects.count() == original_canvas_count + 10
-        assert OCR.objects.count() == original_ocr_count + 4630
+        assert OCR.objects.count() == original_ocr_count + 1073
 
     def test_local_admin_response_add(self):
         """It should redirect to new manifest"""

diff --git a/test_app/tests/test_local.py b/test_app/tests/test_local.py
@@ -200,6 +200,16 @@ def test_creating_canvases(self):
         assert Canvas.objects.get(pid=f'{pid}_00000010.tiff').width == 32
         assert Canvas.objects.get(pid=f'{pid}_00000010.tiff').height == 43
 
+        ocr_path = os.path.abspath(
+            os.path.join(
+                settings.INGEST_OCR_DIR,
+                pid,
+                f'{pid}_00000008.tsv'
+            )
+        )
+
+        assert Canvas.objects.get(pid=f'{pid}_00000008.tiff').ocr_file_path == ocr_path
+
     def test_it_creates_manifest_with_metadata_property(self):
         metadata = {
             'pid': '808',