Add full page dimensions to location metadata (#137)

NVIDIA · Oct 14, 2024 · c707a2b
1 parent f396969
commit c707a2b
Show file tree

Hide file tree

Showing 5 changed files with 37 additions and 21 deletions.
diff --git a/docs/content-metadata.md b/docs/content-metadata.md
@@ -30,10 +30,12 @@ Metadata: Descriptive data which can be associated with Sources, Content(Image o
 |  | Caption | Any caption or subheader associated with Image | Extracted |
 |  | Text | Extracted text from a structured chart | Extracted | Pending Research |
 |  | Image location | Location (x,y) of chart within an image | Extracted |  |
+|  | Image location max dimensions | Maximum extent (x\_max, y\_max) of the coordinate space in which the image location (x,y) is expressed, e.g. the full page dimensions | Extracted |  |
 |  | uploaded\_image\_uri | Mirrors source\_metadata.source\_location |  |  |
 | Table Metadata (tables within documents) | Table format | Structured (dataframe / lists of rows and columns), or serialized as markdown, html, latex, simple (cells separated just as spaces) | Extracted |
 |  | Table content | Extracted text content, formatted according to table\_metadata.table\_format. Important: Tables should not be chunked | Extracted |  |
 |  | Table location | Bounding box of the table | Extracted |  |
+|  | Table location max dimensions | Maximum extent (x\_max, y\_max) of the coordinate space in which the table bounding box is expressed, e.g. the full page dimensions | Extracted |  |
 |  | Caption | Detected captions for the table/chart | Extracted |  |
 |  | Title | TODO | Extracted |  |
 |  | Subtitle | TODO | Extracted |  |

diff --git a/src/nv_ingest/extraction_workflows/pdf/doughnut_helper.py b/src/nv_ingest/extraction_workflows/pdf/doughnut_helper.py
@@ -123,9 +123,12 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
     }
 
     pages = []
+    page_sizes = []
     for page_idx in range(pdf_metadata.page_count):
         page = doc.get_page(page_idx)
         pages.append(page)
+        page_width, page_height = doc.get_page_size(page_idx)
+        page_sizes.append((page_width, page_height))
 
     # Split into batches.
     i = 0
@@ -147,6 +150,7 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
 
         for page_idx, raw_text, bbox_offset in responses:
             page_image = None
+            page_width, page_height = page_sizes[page_idx]
 
             classes, bboxes, texts = doughnut_utils.extract_classes_bboxes(raw_text)
 
@@ -173,7 +177,7 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
                     except UnicodeDecodeError:
                         pass
                     bbox = doughnut_utils.reverse_transform_bbox(bbox, bbox_offset)
-                    table = LatexTable(latex=txt, bbox=bbox)
+                    table = LatexTable(latex=txt, bbox=bbox, max_width=page_width, max_height=page_height)
                     accumulated_tables.append(table)
 
                 elif extract_images and (cls == "Picture"):
@@ -190,7 +194,12 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
                         base64_img = numpy_to_base64(img_numpy)
                         bbox = doughnut_utils.reverse_transform_bbox(bbox, bbox_offset)
                         image = Base64Image(
-                            image=base64_img, bbox=bbox, width=img_numpy.shape[1], height=img_numpy.shape[0]
+                            image=base64_img,
+                            bbox=bbox,
+                            width=img_numpy.shape[1],
+                            height=img_numpy.shape[0],
+                            max_width=page_width,
+                            max_height=page_height,
                         )
                         accumulated_images.append(image)
 

diff --git a/src/nv_ingest/extraction_workflows/pdf/pdfium_helper.py b/src/nv_ingest/extraction_workflows/pdf/pdfium_helper.py
@@ -371,7 +371,9 @@ def handle_table_chart_extraction(
                 base64_img = numpy_to_base64(cropped)
 
                 table_content = call_image_inference_model(paddle_client, "paddle", cropped, trace_info=trace_info)
-                table_data = ImageTable(table_content, base64_img, (w1, h1, w2, h2))
+                table_data = ImageTable(
+                    content=table_content, image=base64_img, bbox=(w1, h1, w2, h2), max_width=width, max_height=height
+                )
                 tables_and_charts.append((page_idx, table_data))
             elif extract_charts and label == "chart":
                 cropped = crop_image(original_image, (h1, w1, h2, w2))
@@ -382,7 +384,9 @@ def handle_table_chart_extraction(
                 )
                 cached_result = call_image_inference_model(cached_client, "cached", cropped, trace_info=trace_info)
                 chart_content = join_cached_and_deplot_output(cached_result, deplot_result)
-                chart_data = ImageChart(chart_content, base64_img, (w1, h1, w2, h2))
+                chart_data = ImageChart(
+                    content=chart_content, image=base64_img, bbox=(w1, h1, w2, h2), max_width=width, max_height=height
+                )
                 tables_and_charts.append((page_idx, chart_data))
 
 
@@ -472,6 +476,7 @@ def pdfium(
     text_depth = text_depth if text_depth == TextTypeEnum.PAGE else TextTypeEnum.DOCUMENT
     for page_idx in range(pdf_metadata.page_count):
         page = doc.get_page(page_idx)
+        page_width, page_height = doc.get_page_size(page_idx)
 
         # https://pypdfium2.readthedocs.io/en/stable/python_api.html#module-pypdfium2._helpers.textpage
         if extract_text:
@@ -507,7 +512,9 @@ def pdfium(
                         image_base64: str = numpy_to_base64(image_numpy)
                         image_bbox = obj.get_pos()
                         image_size = obj.get_size()
-                        image_data = Base64Image(image_base64, image_bbox, image_size[0], image_size[1])
+                        image_data = Base64Image(
+                            image=image_base64, bbox=image_bbox, width=image_size[0], height=image_size[1], max_width=page_width, max_height=page_height
+                        )
 
                         extracted_image_data = construct_image_metadata(
                             image_data,

diff --git a/src/nv_ingest/schemas/metadata_schema.py b/src/nv_ingest/schemas/metadata_schema.py
@@ -252,6 +252,7 @@ class ImageMetadataSchema(BaseModelNoExt):
     caption: str = ""
     text: str = ""
     image_location: tuple = (0, 0, 0, 0)
+    image_location_max_dimensions: tuple = (0, 0)
     uploaded_image_url: str = ""
     width: int = 0
     height: int = 0
@@ -262,6 +263,7 @@ class TableMetadataSchema(BaseModelNoExt):
     table_format: TableFormatEnum
     table_content: str = ""
     table_location: tuple = (0, 0, 0, 0)
+    table_location_max_dimensions: tuple = (0, 0)
     uploaded_image_uri: str = ""
 
 

diff --git a/src/nv_ingest/util/pdf/metadata_aggregators.py b/src/nv_ingest/util/pdf/metadata_aggregators.py
@@ -26,30 +26,30 @@
 from nv_ingest.util.exception_handlers.pdf import pdfium_exception_handler
 
 
-@dataclass
-class DataFrameTable:
-    df: pd.DataFrame
-    bbox: Tuple[int, int, int, int]
-
-
 @dataclass
 class ImageTable:
     content: str
     image: str
     bbox: Tuple[int, int, int, int]
+    max_width: int
+    max_height: int
 
 
 @dataclass
 class ImageChart:
     content: str
     image: str
     bbox: Tuple[int, int, int, int]
+    max_width: int
+    max_height: int
 
 
 @dataclass
 class LatexTable:
     latex: pd.DataFrame
     bbox: Tuple[int, int, int, int]
+    max_width: int
+    max_height: int
 
 
 @dataclass
@@ -58,6 +58,8 @@ class Base64Image:
     bbox: Tuple[int, int, int, int]
     width: int
     height: int
+    max_width: int
+    max_height: int
 
 
 @dataclass
@@ -252,7 +254,7 @@ def construct_image_metadata(
         "caption": "",
         "text": "",
         "image_location": image_base64.bbox,
-        "width": image_base64.width,
+        "image_location_max_dimensions": (image_base64.max_width, image_base64.max_height),
         "height": image_base64.height,
     }
 
@@ -275,7 +277,7 @@ def construct_image_metadata(
 # TODO(Devin): Disambiguate tables and charts, create two distinct processing methods
 @pdfium_exception_handler(descriptor="pdfium")
 def construct_table_and_chart_metadata(
-    table: Union[DataFrameTable, ImageTable, ImageChart],
+    table: Union[ImageTable, ImageChart],
     page_idx: int,
     page_count: int,
     source_metadata: Dict,
@@ -309,14 +311,7 @@ def construct_table_and_chart_metadata(
     +--------------------------------+--------------------------+------------+---+
     """
 
-    if isinstance(table, DataFrameTable):
-        content = table.df.to_markdown(index=False)
-        structured_content_text = content
-        table_format = TableFormatEnum.MARKDOWN
-        subtype = ContentSubtypeEnum.TABLE
-        description = StdContentDescEnum.PDF_TABLE
-
-    elif isinstance(table, ImageTable):
+    if isinstance(table, ImageTable):
         content = table.image
         structured_content_text = table.content
         table_format = TableFormatEnum.IMAGE
@@ -351,6 +346,7 @@ def construct_table_and_chart_metadata(
         "table_format": table_format,
         "table_content": structured_content_text,
         "table_location": table.bbox,
+        "table_location_max_dimensions": (table.max_width, table.max_height),
     }
 
     ext_unified_metadata = base_unified_metadata.copy()