From 038e5c0d1bcdf409aa1f342d0a7f110316280f0d Mon Sep 17 00:00:00 2001
From: Jay Varner
Date: Wed, 2 Oct 2024 09:01:08 -0400
Subject: [PATCH] Background add OCR manage command for volumes.

---
Review notes (free-form area; not part of the commit message):
 * tasks.py: add_ocr_manage_task now looks the Manifest up from the PID
   before calling add_ocr_to_canvases. The line removed from add_ocr.py
   called add_ocr_to_canvases(manifest) with a model instance, while the
   task was handing it the bare PID string.
 * add_ocr.py: the PID is chosen with `or`, matching the truthiness test
   on the enclosing `if`. The previous `is not None` check selected an
   empty --volume over a populated --manifest.
 * Hunk headers and the diffstat were recomputed for the edited content;
   the blob ids on the `index` lines were not regenerated.

 .../management/commands/add_ocr.py            | 25 +++++++++++++------
 readux_ingest_ecds/tasks.py                   | 20 +++++++++++++++
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/readux_ingest_ecds/management/commands/add_ocr.py b/readux_ingest_ecds/management/commands/add_ocr.py
index ac72144..d114bc1 100644
--- a/readux_ingest_ecds/management/commands/add_ocr.py
+++ b/readux_ingest_ecds/management/commands/add_ocr.py
@@ -1,4 +1,5 @@
 from django.core.management.base import BaseCommand, CommandError
+from readux_ingest_ecds.tasks import add_ocr_manage_task
 from readux_ingest_ecds.helpers import get_iiif_models
 from readux_ingest_ecds.services.ocr_services import (
     add_ocr_to_canvases,
@@ -15,19 +16,29 @@ class Command(BaseCommand):
     help = "(Re)Build OCR for a volume or canvas."
 
     def add_arguments(self, parser):
-        parser.add_argument("--volume", type=str, help="PID for volume/manifest.")
-
+        parser.add_argument(
+            "--volume", type=str, help="PID for volume. Same as --manifest."
+        )
+        parser.add_argument(
+            "--manifest", type=str, help="PID for manifest. Same as --volume."
+        )
         parser.add_argument("--canvas", type=str, help="PID for canvas.")
 
     def handle(self, *args, **options):
-        if options["volume"]:
+        if options["volume"] or options["manifest"]:
+            # --volume and --manifest are aliases; use whichever is non-empty.
+            pid = options["volume"] or options["manifest"]
             try:
-                manifest = Manifest.objects.get(pid=options["volume"])
+                manifest = Manifest.objects.get(pid=pid)
             except Manifest.DoesNotExist:
-                raise CommandError(f'Manifest {options["volume"]} does not exist')
+                raise CommandError(f"Manifest {pid} does not exist")
 
-            add_ocr_to_canvases(manifest)
-            self.stdout.write(self.style.SUCCESS(f"OCR create for {manifest.pid}"))
+            add_ocr_manage_task.delay(manifest.pid)
+            self.stdout.write(
+                self.style.SUCCESS(
+                    f"A background task has started to add OCR to {manifest.pid}. This could take a while depending on volume length. NOTE: The OCR is not necessarily created according to page order."
+                )
+            )
         elif options["canvas"]:
             try:
                 canvas = Canvas.objects.get(pid=options["canvas"])
diff --git a/readux_ingest_ecds/tasks.py b/readux_ingest_ecds/tasks.py
index 27902de..0821f28 100644
--- a/readux_ingest_ecds/tasks.py
+++ b/readux_ingest_ecds/tasks.py
@@ -133,3 +133,23 @@ def s3_ingest_task(ingest_id, *args, **kwargs):
     print(ingest_id)
     s3_ingest = S3Ingest.objects.get(pk=ingest_id)
     s3_ingest.ingest()
+
+
+@app.task(
+    name="add_volume_ocr_manage_task",
+    autoretry_for=(Exception,),
+    retry_backoff=True,
+    max_retries=20,
+)
+def add_ocr_manage_task(volume_pid, *args, **kwargs):
+    """Add OCR for Volume/Manifest via Manage Command.
+
+    :param volume_pid: PID of the Manifest whose canvases get OCR.
+    """
+    # Imported here so this task does not rely on a module-level Manifest.
+    from readux_ingest_ecds.helpers import get_iiif_models
+
+    # add_ocr_to_canvases was handed a Manifest by the manage command, not a
+    # PID. NOTE(review): assumes get_iiif_models()["Manifest"] is the model.
+    manifest = get_iiif_models()["Manifest"].objects.get(pid=volume_pid)
+    add_ocr_to_canvases(manifest)