From aa3694c71dbc6d16912b9cf1878fcd888db819df Mon Sep 17 00:00:00 2001
From: Jay Varner <jayvarner@gmail.com>
Date: Tue, 17 Dec 2024 09:10:13 -0500
Subject: [PATCH] Prevent double OCR

---
 readux_ingest_ecds/services/ocr_services.py | 36 ++++++++++++++-------
 test_app/tests/test_ocr.py                  | 17 ++++++++++
 2 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/readux_ingest_ecds/services/ocr_services.py b/readux_ingest_ecds/services/ocr_services.py
index edbe474..06a693b 100644
--- a/readux_ingest_ecds/services/ocr_services.py
+++ b/readux_ingest_ecds/services/ocr_services.py
@@ -350,6 +350,7 @@ def parse_tsv_ocr(result):
         h = int(row["h"])
         x = int(row["x"])
         y = int(row["y"])
+
         ocr.append(
             {
                 "content": content,
@@ -434,18 +435,29 @@ def add_ocr_annotations(canvas, ocr):
             or word["content"].isspace()
         ):
             word["content"] = " "
-        anno = OCR()
-        anno.canvas = canvas
-        anno.x = word["x"]
-        anno.y = word["y"]
-        anno.w = word["w"]
-        anno.h = word["h"]
-        anno.resource_type = anno.OCR
-        anno.content = word["content"]
-        anno.order = word_order
-        anno.set_span_element()
-        annotations.append(anno)
-        word_order += 1
+        try:
+            OCR.objects.get(
+                w=word["w"],
+                h=word["h"],
+                x=word["x"],
+                y=word["y"],
+                content=word["content"],
+                canvas=canvas,
+            )
+        except OCR.DoesNotExist:
+            anno = OCR()
+            anno.canvas = canvas
+            anno.x = word["x"]
+            anno.y = word["y"]
+            anno.w = word["w"]
+            anno.h = word["h"]
+            anno.resource_type = anno.OCR
+            anno.content = word["content"]
+            anno.order = word_order
+            anno.set_span_element()
+            if anno not in annotations:
+                annotations.append(anno)
+                word_order += 1
 
     return annotations
 
diff --git a/test_app/tests/test_ocr.py b/test_app/tests/test_ocr.py
index 0520f9e..42f4841 100644
--- a/test_app/tests/test_ocr.py
+++ b/test_app/tests/test_ocr.py
@@ -5,6 +5,7 @@
 from readux_ingest_ecds.services import ocr_services
 from readux_ingest_ecds.tasks import add_ocr_task_local
 from .factories import CanvasFactory, LocalFactory, ManifestFactory, UserFactory
+from iiif.models import OCR
 
 
 class OCRTest(TestCase):
@@ -42,3 +43,19 @@ def test_empty_xml(self):
         )
         assert "XMLSyntaxError" in mail.outbox[0].body
         assert "iip" in mail.outbox[0].body
+
+    def test_prevent_double_ocr(self):
+        """Re-running add_ocr_annotations on a canvas whose OCR is already saved returns no annotations."""
+        canvas = CanvasFactory.create(
+            ocr_file_path=os.path.join(self.fixture_path, "alto4.xml"),
+            manifest=ManifestFactory.create(),
+        )
+
+        ocr = ocr_services.get_ocr(canvas)
+        annos = ocr_services.add_ocr_annotations(canvas, ocr)
+        print(len(annos))
+        OCR.objects.bulk_create(annos)
+        assert len(annos) == 178
+        assert OCR.objects.count() == 178
+        dupe_annos = ocr_services.add_ocr_annotations(canvas, ocr)
+        assert len(dupe_annos) == 0