Skip to content

Commit

Permalink
Extract tables and charts by default in ingestor extract (#227)
Browse files: browse the repository at this point in the history
  • Loading branch information
edknv authored Nov 14, 2024
1 parent a31cd94 commit 442e34e
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 20 deletions.
8 changes: 5 additions & 3 deletions client/src/nv_ingest_client/client/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,11 +360,13 @@ def extract(self, **kwargs: Any) -> "Ingestor":
Ingestor
Returns self for chaining.
"""
extract_tables = kwargs.get("extract_tables", False)
extract_charts = kwargs.get("extract_charts", False)
extract_tables = kwargs.pop("extract_tables", True)
extract_charts = kwargs.pop("extract_charts", True)

for document_type in self._job_specs.file_types:
extract_task = ExtractTask(document_type, **kwargs)
extract_task = ExtractTask(
document_type, extract_tables=extract_tables, extract_charts=extract_charts, **kwargs
)
self._job_specs.add_task(extract_task, document_type=document_type)

if extract_tables is True:
Expand Down
4 changes: 4 additions & 0 deletions client/src/nv_ingest_client/primitives/tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
# SPDX-License-Identifier: Apache-2.0

from .caption import CaptionTask
from .chart_extraction import ChartExtractionTask
from .dedup import DedupTask
from .embed import EmbedTask
from .extract import ExtractTask
from .filter import FilterTask
from .split import SplitTask
from .store import StoreTask
from .table_extraction import TableExtractionTask
from .task_base import Task
from .task_base import TaskType
from .task_base import is_valid_task_type
Expand All @@ -17,10 +19,12 @@

__all__ = [
"CaptionTask",
"ChartExtractionTask",
"ExtractTask",
"is_valid_task_type",
"SplitTask",
"StoreTask",
"TableExtractionTask",
"Task",
"task_factory",
"TaskType",
Expand Down
19 changes: 11 additions & 8 deletions src/nv_ingest/extraction_workflows/pdf/pdfium_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,15 +471,18 @@ def pdfium_extractor(
pdfium_config,
trace_info=trace_info,
):
extracted_data.append(
construct_table_and_chart_metadata(
table_and_charts,
page_idx,
pdf_metadata.page_count,
source_metadata,
base_unified_metadata,
if (extract_tables and (table_and_charts.type_string == "table")) or (
extract_charts and (table_and_charts.type_string == "chart")
):
extracted_data.append(
construct_table_and_chart_metadata(
table_and_charts,
page_idx,
pdf_metadata.page_count,
source_metadata,
base_unified_metadata,
)
)
)

logger.debug(f"Extracted {len(extracted_data)} items from PDF.")

Expand Down
57 changes: 48 additions & 9 deletions tests/nv_ingest_client/client/test_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,18 @@
from unittest.mock import patch

import pytest
from nv_ingest_client.client import NvIngestClient
from nv_ingest_client.client import Ingestor
from nv_ingest_client.client import NvIngestClient
from nv_ingest_client.primitives import BatchJobSpec
from nv_ingest_client.primitives.jobs import JobStateEnum
from nv_ingest_client.primitives.tasks import ChartExtractionTask
from nv_ingest_client.primitives.tasks import DedupTask
from nv_ingest_client.primitives.tasks import EmbedTask
from nv_ingest_client.primitives.tasks import ExtractTask
from nv_ingest_client.primitives.tasks import FilterTask
from nv_ingest_client.primitives.tasks import SplitTask
from nv_ingest_client.primitives.tasks import StoreTask
from nv_ingest_client.primitives.tasks import TableExtractionTask
from nv_ingest_client.primitives.tasks import VdbUploadTask

MODULE_UNDER_TEST = "nv_ingest_client.client.interface"
Expand Down Expand Up @@ -80,7 +82,42 @@ def test_embed_task_some_args(ingestor):
def test_extract_task_no_args(ingestor):
    """Check the tasks queued for the "pdf" job spec when ``extract()`` gets no arguments.

    The first task must be an ``ExtractTask`` with both ``_extract_tables`` and
    ``_extract_charts`` set to ``True``, followed by a ``TableExtractionTask``
    and then a ``ChartExtractionTask``.
    """
    ingestor.extract()

    # All assertions inspect the task list of the first "pdf" job spec.
    pdf_tasks = ingestor._job_specs.job_specs["pdf"][0]._tasks

    extract_task = pdf_tasks[0]
    assert isinstance(extract_task, ExtractTask)
    assert extract_task._extract_tables is True
    assert extract_task._extract_charts is True

    # Follow-up tasks are indexed only after the extract task has been verified,
    # so a short task list fails at the same point as before.
    assert isinstance(pdf_tasks[1], TableExtractionTask)
    assert isinstance(pdf_tasks[2], ChartExtractionTask)


def test_extract_task_args_tables_false(ingestor):
    """Check the ``ExtractTask`` flags after ``extract(extract_tables=False)``.

    The first task on the "pdf" job spec must be an ``ExtractTask`` whose
    ``_extract_tables`` is ``False`` while ``_extract_charts`` is ``True``.
    """
    ingestor.extract(extract_tables=False)

    # The extract task is read from slot 0 of the first "pdf" job spec.
    pdf_tasks = ingestor._job_specs.job_specs["pdf"][0]._tasks
    extract_task = pdf_tasks[0]

    assert isinstance(extract_task, ExtractTask)
    assert extract_task._extract_tables is False
    assert extract_task._extract_charts is True


def test_extract_task_args_charts_false(ingestor):
    """Check the queued tasks after ``extract(extract_charts=False)``.

    The first task on the "pdf" job spec must be an ``ExtractTask`` whose
    ``_extract_tables`` is ``True`` and ``_extract_charts`` is ``False``; the
    task right after it must be a ``TableExtractionTask``.
    """
    ingestor.extract(extract_charts=False)

    pdf_tasks = ingestor._job_specs.job_specs["pdf"][0]._tasks

    extract_task = pdf_tasks[0]
    assert isinstance(extract_task, ExtractTask)
    assert extract_task._extract_tables is True
    assert extract_task._extract_charts is False

    # Slot 1 is indexed only after the extract task has been verified.
    assert isinstance(pdf_tasks[1], TableExtractionTask)


def test_extract_task_args_tables_and_charts_false(ingestor):
    """Check the ``ExtractTask`` flags when both tables and charts are turned off.

    After ``extract(extract_tables=False, extract_charts=False)`` the first
    task on the "pdf" job spec must be an ``ExtractTask`` with both
    ``_extract_tables`` and ``_extract_charts`` set to ``False``.
    """
    ingestor.extract(extract_tables=False, extract_charts=False)

    # The extract task is read from slot 0 of the first "pdf" job spec.
    pdf_tasks = ingestor._job_specs.job_specs["pdf"][0]._tasks
    extract_task = pdf_tasks[0]

    assert isinstance(extract_task, ExtractTask)
    assert extract_task._extract_tables is False
    assert extract_task._extract_charts is False


def test_extract_task_some_args(ingestor):
Expand Down Expand Up @@ -156,11 +193,13 @@ def test_chain(ingestor):
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[0], DedupTask)
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[1], EmbedTask)
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[2], ExtractTask)
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[3], FilterTask)
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[4], SplitTask)
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[5], StoreTask)
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[6], VdbUploadTask)
assert len(ingestor._job_specs.job_specs["pdf"][0]._tasks) == 7
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[3], TableExtractionTask)
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[4], ChartExtractionTask)
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[5], FilterTask)
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[6], SplitTask)
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[7], StoreTask)
assert isinstance(ingestor._job_specs.job_specs["pdf"][0]._tasks[8], VdbUploadTask)
assert len(ingestor._job_specs.job_specs["pdf"][0]._tasks) == 9


def test_ingest(ingestor, mock_client):
Expand Down Expand Up @@ -190,8 +229,8 @@ def test_ingest_async(ingestor, mock_client):
ingestor._job_states["job_id_1"] = MagicMock(state=JobStateEnum.COMPLETED)
ingestor._job_states["job_id_2"] = MagicMock(state=JobStateEnum.FAILED)

mock_client.fetch_job_result.side_effect = (
lambda job_id, *args, **kwargs: "result_1" if job_id == "job_id_1" else "result_2"
mock_client.fetch_job_result.side_effect = lambda job_id, *args, **kwargs: (
"result_1" if job_id == "job_id_1" else "result_2"
)

combined_future = ingestor.ingest_async(timeout=15)
Expand Down

0 comments on commit 442e34e

Please sign in to comment.