Skip to content

Commit

Permalink
Add full page dimensions to location metadata (#137)
Browse files Browse the repository at this point in the history
  • Loading branch information
edknv authored Oct 14, 2024
1 parent f396969 commit c707a2b
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 21 deletions.
2 changes: 2 additions & 0 deletions docs/content-metadata.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ Metadata: Descriptive data which can be associated with Sources, Content(Image o
| | Caption | Any caption or subheader associated with Image | Extracted |
| | Text | Extracted text from a structured chart | Extracted | Pending Research |
| | Image location | Location (x,y) of chart within an image | Extracted | |
| | Image location max dimensions | Maximum dimensions (x\_max,y\_max) of the full page against which the image location (x,y) is measured | Extracted | |
| | uploaded\_image\_uri | Mirrors source\_metadata.source\_location | | |
| Table Metadata (tables within documents) | Table format | Structured (dataframe / lists of rows and columns), or serialized as markdown, html, latex, simple (cells separated just as spaces) | Extracted |
| | Table content | Extracted text content, formatted according to table\_metadata.table\_format. Important: Tables should not be chunked | Extracted | |
| | Table location | Bounding box of the table | Extracted | |
| | Table location max dimensions | Maximum dimensions (x\_max,y\_max) of the full page against which the table bounding box is measured | Extracted | |
| | Caption | Detected captions for the table/chart | Extracted | |
| | Title | TODO | Extracted | |
| | Subtitle | TODO | Extracted | |
Expand Down
13 changes: 11 additions & 2 deletions src/nv_ingest/extraction_workflows/pdf/doughnut_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,12 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
}

pages = []
page_sizes = []
for page_idx in range(pdf_metadata.page_count):
page = doc.get_page(page_idx)
pages.append(page)
page_width, page_height = doc.get_page_size(page_idx)
page_sizes.append((page_width, page_height))

# Split into batches.
i = 0
Expand All @@ -147,6 +150,7 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table

for page_idx, raw_text, bbox_offset in responses:
page_image = None
page_width, page_height = page_sizes[page_idx]

classes, bboxes, texts = doughnut_utils.extract_classes_bboxes(raw_text)

Expand All @@ -173,7 +177,7 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
except UnicodeDecodeError:
pass
bbox = doughnut_utils.reverse_transform_bbox(bbox, bbox_offset)
table = LatexTable(latex=txt, bbox=bbox)
table = LatexTable(latex=txt, bbox=bbox, max_width=page_width, max_height=page_height)
accumulated_tables.append(table)

elif extract_images and (cls == "Picture"):
Expand All @@ -190,7 +194,12 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
base64_img = numpy_to_base64(img_numpy)
bbox = doughnut_utils.reverse_transform_bbox(bbox, bbox_offset)
image = Base64Image(
image=base64_img, bbox=bbox, width=img_numpy.shape[1], height=img_numpy.shape[0]
image=base64_img,
bbox=bbox,
width=img_numpy.shape[1],
height=img_numpy.shape[0],
max_width=page_width,
max_height=page_height,
)
accumulated_images.append(image)

Expand Down
13 changes: 10 additions & 3 deletions src/nv_ingest/extraction_workflows/pdf/pdfium_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,9 @@ def handle_table_chart_extraction(
base64_img = numpy_to_base64(cropped)

table_content = call_image_inference_model(paddle_client, "paddle", cropped, trace_info=trace_info)
table_data = ImageTable(table_content, base64_img, (w1, h1, w2, h2))
table_data = ImageTable(
content=table_content, image=base64_img, bbox=(w1, h1, w2, h2), max_width=width, max_height=height
)
tables_and_charts.append((page_idx, table_data))
elif extract_charts and label == "chart":
cropped = crop_image(original_image, (h1, w1, h2, w2))
Expand All @@ -382,7 +384,9 @@ def handle_table_chart_extraction(
)
cached_result = call_image_inference_model(cached_client, "cached", cropped, trace_info=trace_info)
chart_content = join_cached_and_deplot_output(cached_result, deplot_result)
chart_data = ImageChart(chart_content, base64_img, (w1, h1, w2, h2))
chart_data = ImageChart(
content=chart_content, image=base64_img, bbox=(w1, h1, w2, h2), max_width=width, max_height=height
)
tables_and_charts.append((page_idx, chart_data))


Expand Down Expand Up @@ -472,6 +476,7 @@ def pdfium(
text_depth = text_depth if text_depth == TextTypeEnum.PAGE else TextTypeEnum.DOCUMENT
for page_idx in range(pdf_metadata.page_count):
page = doc.get_page(page_idx)
page_width, page_height = doc.get_page_size(page_idx)

# https://pypdfium2.readthedocs.io/en/stable/python_api.html#module-pypdfium2._helpers.textpage
if extract_text:
Expand Down Expand Up @@ -507,7 +512,9 @@ def pdfium(
image_base64: str = numpy_to_base64(image_numpy)
image_bbox = obj.get_pos()
image_size = obj.get_size()
image_data = Base64Image(image_base64, image_bbox, image_size[0], image_size[1])
image_data = Base64Image(
image=image_base64, bbox=image_bbox, width=image_size[0], height=image_size[1], max_width=page_width, max_height=page_height
)

extracted_image_data = construct_image_metadata(
image_data,
Expand Down
2 changes: 2 additions & 0 deletions src/nv_ingest/schemas/metadata_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ class ImageMetadataSchema(BaseModelNoExt):
caption: str = ""
text: str = ""
image_location: tuple = (0, 0, 0, 0)
image_location_max_dimensions: tuple = (0, 0)
uploaded_image_url: str = ""
width: int = 0
height: int = 0
Expand All @@ -262,6 +263,7 @@ class TableMetadataSchema(BaseModelNoExt):
table_format: TableFormatEnum
table_content: str = ""
table_location: tuple = (0, 0, 0, 0)
table_location_max_dimensions: tuple = (0, 0)
uploaded_image_uri: str = ""


Expand Down
28 changes: 12 additions & 16 deletions src/nv_ingest/util/pdf/metadata_aggregators.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,30 +26,30 @@
from nv_ingest.util.exception_handlers.pdf import pdfium_exception_handler


@dataclass
class DataFrameTable:
    """A table whose contents are held as a pandas DataFrame.

    Attributes
    ----------
    df : pd.DataFrame
        The table contents; rendered with ``df.to_markdown(index=False)``
        when table metadata is built.
    bbox : Tuple[int, int, int, int]
        Bounding box of the table.
    """

    # Field order is part of the positional constructor signature.
    df: pd.DataFrame
    bbox: Tuple[int, int, int, int]


@dataclass
class ImageTable:
    """A table detected on a page and captured as a cropped image.

    Attributes
    ----------
    content : str
        Table content returned by the image inference model for the crop;
        emitted as ``table_content`` in the table metadata.
    image : str
        Base64-encoded image of the cropped table region.
    bbox : Tuple[int, int, int, int]
        Bounding box of the table; emitted as ``table_location``.
    max_width : int
        Maximum x extent that ``bbox`` is measured against.
    max_height : int
        Maximum y extent that ``bbox`` is measured against.
    """

    # Field order is part of the positional constructor signature.
    content: str
    image: str
    bbox: Tuple[int, int, int, int]
    # Emitted together as ``table_location_max_dimensions``.
    max_width: int
    max_height: int


@dataclass
class ImageChart:
    """A chart detected on a page and captured as a cropped image.

    Attributes
    ----------
    content : str
        Chart content produced by the chart inference models for the crop.
    image : str
        Base64-encoded image of the cropped chart region.
    bbox : Tuple[int, int, int, int]
        Bounding box of the chart.
    max_width : int
        Maximum x extent that ``bbox`` is measured against.
    max_height : int
        Maximum y extent that ``bbox`` is measured against.
    """

    # Field order is part of the positional constructor signature.
    content: str
    image: str
    bbox: Tuple[int, int, int, int]
    # Emitted together as ``table_location_max_dimensions``.
    max_width: int
    max_height: int


@dataclass
class LatexTable:
    """A table extracted from a page and represented as LaTeX text.

    Attributes
    ----------
    latex : str
        LaTeX text of the table. Callers pass the extracted text string,
        not a DataFrame, so the field is annotated ``str``.
    bbox : Tuple[int, int, int, int]
        Bounding box of the table.
    max_width : int
        Width of the page the table was found on.
    max_height : int
        Height of the page the table was found on.
    """

    # Field order is part of the positional constructor signature.
    latex: str
    bbox: Tuple[int, int, int, int]
    # NOTE(review): the caller passes values from ``doc.get_page_size``,
    # which may be floats rather than ints -- confirm and widen if needed.
    max_width: int
    max_height: int


@dataclass
Expand All @@ -58,6 +58,8 @@ class Base64Image:
bbox: Tuple[int, int, int, int]
width: int
height: int
max_width: int
max_height: int


@dataclass
Expand Down Expand Up @@ -252,7 +254,7 @@ def construct_image_metadata(
"caption": "",
"text": "",
"image_location": image_base64.bbox,
"width": image_base64.width,
"image_location_max_dimensions": (image_base64.max_width, image_base64.max_height),
"height": image_base64.height,
}

Expand All @@ -275,7 +277,7 @@ def construct_image_metadata(
# TODO(Devin): Disambiguate tables and charts, create two distinct processing methods
@pdfium_exception_handler(descriptor="pdfium")
def construct_table_and_chart_metadata(
table: Union[DataFrameTable, ImageTable, ImageChart],
table: Union[ImageTable, ImageChart],
page_idx: int,
page_count: int,
source_metadata: Dict,
Expand Down Expand Up @@ -309,14 +311,7 @@ def construct_table_and_chart_metadata(
+--------------------------------+--------------------------+------------+---+
"""

if isinstance(table, DataFrameTable):
content = table.df.to_markdown(index=False)
structured_content_text = content
table_format = TableFormatEnum.MARKDOWN
subtype = ContentSubtypeEnum.TABLE
description = StdContentDescEnum.PDF_TABLE

elif isinstance(table, ImageTable):
if isinstance(table, ImageTable):
content = table.image
structured_content_text = table.content
table_format = TableFormatEnum.IMAGE
Expand Down Expand Up @@ -351,6 +346,7 @@ def construct_table_and_chart_metadata(
"table_format": table_format,
"table_content": structured_content_text,
"table_location": table.bbox,
"table_location_max_dimensions": (table.max_width, table.max_height),
}

ext_unified_metadata = base_unified_metadata.copy()
Expand Down

0 comments on commit c707a2b

Please sign in to comment.