diff --git a/readux_ingest_ecds/services/ocr_services.py b/readux_ingest_ecds/services/ocr_services.py index 04e1014..8cefb5e 100644 --- a/readux_ingest_ecds/services/ocr_services.py +++ b/readux_ingest_ecds/services/ocr_services.py @@ -378,7 +378,6 @@ def add_ocr_annotations(canvas, ocr): word_order = 1 annotations = [] for word in ocr: - print(f'adding word {word}') # A quick check to make sure the header row didn't slip through. if word['x'] == 'x': continue @@ -391,7 +390,6 @@ def add_ocr_annotations(canvas, ocr): word['content'].isspace() ): word['content'] = ' ' - print(f'creating anno for {word}') anno = OCR() anno.canvas = canvas anno.x = word['x'] @@ -401,11 +399,12 @@ def add_ocr_annotations(canvas, ocr): anno.resource_type = anno.OCR anno.content = word['content'] anno.order = word_order - print(f'pushing {word}') annotations.append(anno) word_order += 1 - print('saving') + # bulk_create does not call the model's save method. Saving each OCR annotation + # at the same time as creating it is very slow for unknown reasons. Once this + # method finishes, the calling task saves all the new OCR annotations. OCR.objects.bulk_create(annotations) def add_oa_annotations(annotation_list_url): diff --git a/readux_ingest_ecds/tasks.py b/readux_ingest_ecds/tasks.py index 82f7fe5..98fa1dd 100644 --- a/readux_ingest_ecds/tasks.py +++ b/readux_ingest_ecds/tasks.py @@ -43,4 +43,8 @@ def add_ocr_task(manifest_id, *args, **kwargs): ocr = get_ocr(canvas) if ocr is not None: add_ocr_annotations(canvas, ocr) + # The add_ocr_annotations method uses bulk_create(), which does not call save() on the model. + # Calling save() while creating each annotation is really slow and I don't know why. Calling + # save() after the annotations have been created is as fast as expected. + [ocr.save() for ocr in canvas.annotation_set.all()] canvas.save() # trigger reindex