Skip to content

Commit

Permalink
Prevent double OCR
Browse files Browse the repository at this point in the history
  • Loading branch information
jayvarner committed Dec 17, 2024
1 parent 0285d25 commit aa3694c
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 12 deletions.
36 changes: 24 additions & 12 deletions readux_ingest_ecds/services/ocr_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ def parse_tsv_ocr(result):
h = int(row["h"])
x = int(row["x"])
y = int(row["y"])

ocr.append(
{
"content": content,
Expand Down Expand Up @@ -434,18 +435,29 @@ def add_ocr_annotations(canvas, ocr):
or word["content"].isspace()
):
word["content"] = " "
anno = OCR()
anno.canvas = canvas
anno.x = word["x"]
anno.y = word["y"]
anno.w = word["w"]
anno.h = word["h"]
anno.resource_type = anno.OCR
anno.content = word["content"]
anno.order = word_order
anno.set_span_element()
annotations.append(anno)
word_order += 1
try:
OCR.objects.get(
w=word["w"],
h=word["h"],
x=word["x"],
y=word["y"],
content=word["content"],
canvas=canvas,
)
except OCR.DoesNotExist:
anno = OCR()
anno.canvas = canvas
anno.x = word["x"]
anno.y = word["y"]
anno.w = word["w"]
anno.h = word["h"]
anno.resource_type = anno.OCR
anno.content = word["content"]
anno.order = word_order
anno.set_span_element()
if anno not in annotations:
annotations.append(anno)
word_order += 1

return annotations

Expand Down
17 changes: 17 additions & 0 deletions test_app/tests/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from readux_ingest_ecds.services import ocr_services
from readux_ingest_ecds.tasks import add_ocr_task_local
from .factories import CanvasFactory, LocalFactory, ManifestFactory, UserFactory
from iiif.models import OCR


class OCRTest(TestCase):
Expand Down Expand Up @@ -42,3 +43,19 @@ def test_empty_xml(self):
)
assert "XMLSyntaxError" in mail.outbox[0].body
assert "iip" in mail.outbox[0].body

def test_prevent_double_ocr(self):
    """Check that a second OCR pass over a canvas yields no new annotations.

    The first call to ``add_ocr_annotations`` builds annotations from the
    ``alto4.xml`` fixture, which are then saved with ``bulk_create``.
    Calling it again for the same canvas and OCR data must return an empty
    list, because every word is already stored for that canvas.
    """
    canvas = CanvasFactory.create(
        ocr_file_path=os.path.join(self.fixture_path, "alto4.xml"),
        manifest=ManifestFactory.create(),
    )

    ocr = ocr_services.get_ocr(canvas)
    annos = ocr_services.add_ocr_annotations(canvas, ocr)
    # add_ocr_annotations only builds the objects; persist them here so the
    # second pass can find them in the database.
    OCR.objects.bulk_create(annos)
    assert len(annos) == 178
    assert OCR.objects.count() == 178

    # Second pass: every word already exists for this canvas, so nothing
    # new should be returned.
    dupe_annos = ocr_services.add_ocr_annotations(canvas, ocr)
    assert len(dupe_annos) == 0

0 comments on commit aa3694c

Please sign in to comment.