also add table and image bounding boxes to metadata

NVIDIA · Nov 23, 2024 · a9a74b7 · a9a74b7
1 parent 321052f
commit a9a74b7
Showing 1 changed file with 58 additions and 53 deletions.
diff --git a/src/nv_ingest/extraction_workflows/pdf/doughnut_helper.py b/src/nv_ingest/extraction_workflows/pdf/doughnut_helper.py
@@ -161,69 +161,74 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
 
             page_nearby_blocks = {
                 "text": {"content": [], "bbox": [], "type": []},
-                "images": {"content": [], "bbox": []},
-                "structured": {"content": [], "bbox": []},
+                "images": {"content": [], "bbox": [], "type": []},
+                "structured": {"content": [], "bbox": [], "type": []},
             }
 
             for cls, bbox, txt in zip(classes, bboxes, texts):
 
-                if extract_text and (cls in doughnut_utils.ACCEPTED_TEXT_CLASSES):
-                    txt = doughnut_utils.postprocess_text(txt, cls)
+                transformed_bbox = doughnut_utils.reverse_transform_bbox(
+                    bbox=bbox,
+                    bbox_offset=bbox_offset,
+                    original_width=DEFAULT_MAX_WIDTH,
+                    original_height=DEFAULT_MAX_HEIGHT,
+                )
 
+                if cls in doughnut_utils.ACCEPTED_TEXT_CLASSES:
                     if identify_nearby_objects:
-                        bbox = doughnut_utils.reverse_transform_bbox(
-                            bbox=bbox,
-                            bbox_offset=bbox_offset,
-                            original_width=DEFAULT_MAX_WIDTH,
-                            original_height=DEFAULT_MAX_HEIGHT,
-                        )
                         page_nearby_blocks["text"]["content"].append(txt)
-                        page_nearby_blocks["text"]["bbox"].append(bbox)
+                        page_nearby_blocks["text"]["bbox"].append(transformed_bbox)
                         page_nearby_blocks["text"]["type"].append(cls)
 
-                    accumulated_text.append(txt)
-
-                if extract_tables and (cls == "Table"):
-                    try:
-                        txt = txt.encode().decode("unicode_escape")  # remove double backlashes
-                    except UnicodeDecodeError:
-                        pass
-                    bbox = doughnut_utils.reverse_transform_bbox(
-                        bbox=bbox,
-                        bbox_offset=bbox_offset,
-                        original_width=DEFAULT_MAX_WIDTH,
-                        original_height=DEFAULT_MAX_HEIGHT,
-                    )
-                    table = LatexTable(latex=txt, bbox=bbox, max_width=DEFAULT_MAX_WIDTH, max_height=DEFAULT_MAX_HEIGHT)
-                    accumulated_tables.append(table)
-
-                if extract_images and (cls == "Picture"):
-                    if page_image is None:
-                        scale_tuple = (DEFAULT_MAX_WIDTH, DEFAULT_MAX_HEIGHT)
-                        padding_tuple = (DEFAULT_MAX_WIDTH, DEFAULT_MAX_HEIGHT)
-                        page_image, *_ = pdfium_pages_to_numpy(
-                            [pages[page_idx]], scale_tuple=scale_tuple, padding_tuple=padding_tuple
-                        )
-                        page_image = page_image[0]
-
-                    img_numpy = crop_image(page_image, bbox)
-                    if img_numpy is not None:
-                        base64_img = numpy_to_base64(img_numpy)
-                        bbox = doughnut_utils.reverse_transform_bbox(
-                            bbox=bbox,
-                            bbox_offset=bbox_offset,
-                            original_width=DEFAULT_MAX_WIDTH,
-                            original_height=DEFAULT_MAX_HEIGHT,
-                        )
-                        image = Base64Image(
-                            image=base64_img,
-                            bbox=bbox,
-                            width=img_numpy.shape[1],
-                            height=img_numpy.shape[0],
-                            max_width=DEFAULT_MAX_WIDTH,
-                            max_height=DEFAULT_MAX_HEIGHT,
+                    if extract_text:
+                        txt = doughnut_utils.postprocess_text(txt, cls)
+                        accumulated_text.append(txt)
+
+                if cls == "Table":
+                    if identify_nearby_objects:
+                        page_nearby_blocks["structured"]["content"].append(txt)
+                        page_nearby_blocks["structured"]["bbox"].append(transformed_bbox)
+                        page_nearby_blocks["structured"]["type"].append(cls)
+
+                    if extract_tables:
+                        try:
+                            txt = txt.encode().decode("unicode_escape")  # remove double backlashes
+                        except UnicodeDecodeError:
+                            pass
+
+                        table = LatexTable(
+                            latex=txt, bbox=transformed_bbox, max_width=DEFAULT_MAX_WIDTH, max_height=DEFAULT_MAX_HEIGHT
                         )
-                        accumulated_images.append(image)
+                        accumulated_tables.append(table)
+
+                if cls == "Picture":
+                    if identify_nearby_objects:
+                        page_nearby_blocks["images"]["content"].append(txt)
+                        page_nearby_blocks["images"]["bbox"].append(transformed_bbox)
+                        page_nearby_blocks["images"]["type"].append(cls)
+
+                    if extract_images:
+                        if page_image is None:
+                            scale_tuple = (DEFAULT_MAX_WIDTH, DEFAULT_MAX_HEIGHT)
+                            padding_tuple = (DEFAULT_MAX_WIDTH, DEFAULT_MAX_HEIGHT)
+                            page_image, *_ = pdfium_pages_to_numpy(
+                                [pages[page_idx]], scale_tuple=scale_tuple, padding_tuple=padding_tuple
+                            )
+                            page_image = page_image[0]
+
+                        img_numpy = crop_image(page_image, bbox)
+
+                        if img_numpy is not None:
+                            base64_img = numpy_to_base64(img_numpy)
+                            image = Base64Image(
+                                image=base64_img,
+                                bbox=transformed_bbox,
+                                width=img_numpy.shape[1],
+                                height=img_numpy.shape[0],
+                                max_width=DEFAULT_MAX_WIDTH,
+                                max_height=DEFAULT_MAX_HEIGHT,
+                            )
+                            accumulated_images.append(image)
 
             # Construct tables
             if extract_tables: