diff --git a/readux_ingest_ecds/services/ocr_services.py b/readux_ingest_ecds/services/ocr_services.py
index edbe474..06a693b 100644
--- a/readux_ingest_ecds/services/ocr_services.py
+++ b/readux_ingest_ecds/services/ocr_services.py
@@ -350,6 +350,7 @@ def parse_tsv_ocr(result):
         h = int(row["h"])
         x = int(row["x"])
         y = int(row["y"])
+
         ocr.append(
             {
                 "content": content,
@@ -434,6 +435,21 @@ def add_ocr_annotations(canvas, ocr):
             or word["content"].isspace()
         ):
             word["content"] = " "
+
+        # Skip words already stored for this canvas so that re-running
+        # ingest does not double the OCR. ``filter().exists()`` is used
+        # instead of ``get()`` because ``get()`` raises
+        # ``MultipleObjectsReturned`` when the rows are already duplicated.
+        if OCR.objects.filter(
+            canvas=canvas,
+            x=word["x"],
+            y=word["y"],
+            w=word["w"],
+            h=word["h"],
+            content=word["content"],
+        ).exists():
+            continue
+
         anno = OCR()
         anno.canvas = canvas
         anno.x = word["x"]
diff --git a/test_app/tests/test_ocr.py b/test_app/tests/test_ocr.py
index 0520f9e..42f4841 100644
--- a/test_app/tests/test_ocr.py
+++ b/test_app/tests/test_ocr.py
@@ -5,6 +5,7 @@
 from readux_ingest_ecds.services import ocr_services
 from readux_ingest_ecds.tasks import add_ocr_task_local
 from .factories import CanvasFactory, LocalFactory, ManifestFactory, UserFactory
+from iiif.models import OCR
 
 
 class OCRTest(TestCase):
@@ -42,3 +43,18 @@ def test_empty_xml(self):
         )
         assert "XMLSyntaxError" in mail.outbox[0].body
         assert "iip" in mail.outbox[0].body
+
+    def test_prevent_double_ocr(self):
+        """Words already stored for a canvas are not annotated again."""
+        canvas = CanvasFactory.create(
+            ocr_file_path=os.path.join(self.fixture_path, "alto4.xml"),
+            manifest=ManifestFactory.create(),
+        )
+
+        ocr = ocr_services.get_ocr(canvas)
+        annos = ocr_services.add_ocr_annotations(canvas, ocr)
+        OCR.objects.bulk_create(annos)
+        assert len(annos) == 178
+        assert OCR.objects.count() == 178
+        dupe_annos = ocr_services.add_ocr_annotations(canvas, ocr)
+        assert len(dupe_annos) == 0