From 54ae021479197d7ed450aabc004027591aae38cb Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Thu, 7 Nov 2024 12:27:32 -0500 Subject: [PATCH] Move to Docling v2 APIs This bumps our Docling dependency to version 2, while using the backwards-compatibility layer they have for pdf_parser_v1 and legacy document output formats. Signed-off-by: Ben Browning --- requirements.txt | 2 +- src/instructlab/sdg/utils/chunkers.py | 29 ++++++++++++++++++--------- src/instructlab/sdg/utils/taxonomy.py | 8 +++++--- tests/test_chunkers.py | 3 --- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0f48ca93..6222b1f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 click>=8.1.7,<9.0.0 datasets>=2.18.0,<3.0.0 -docling>=1.15.0,<2.0.0 +docling>=2.3.0,<3.0.0 GitPython>=3.1.42,<4.0.0 httpx>=0.25.0,<1.0.0 instructlab-schema>=0.4.0 diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 97cb9084..881153dc 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -10,8 +10,15 @@ # Third Party from datasets import Dataset -from docling.datamodel.document import ConvertedDocument, DocumentConversionInput -from docling.document_converter import ConversionStatus, DocumentConverter +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.document_converter import ( + ConversionStatus, + DocumentConverter, + PdfFormatOption, +) +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from langchain_text_splitters import Language, RecursiveCharacterTextSplitter from tabulate import tabulate from transformers import AutoTokenizer @@ -210,10 +217,14 @@ def chunk_documents(self) -> List: if self.document_paths == []: return [] - model_artifacts_path = DocumentConverter.download_models_hf() - converter = DocumentConverter(artifacts_path=model_artifacts_path) - inputs = DocumentConversionInput.from_paths(self.filepaths) - parsed_documents = converter.convert(inputs) + model_artifacts_path = StandardPdfPipeline.download_models_hf() + pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path) + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + } + ) + parsed_documents = converter.convert_all(self.filepaths) docling_artifacts_path = self.export_documents(parsed_documents) @@ -539,7 +550,7 @@ def build_chunks_from_docling_json( document_chunks.append("\n\n".join(current_buffer)) return document_chunks - def export_documents(self, converted_docs: Iterable[ConvertedDocument]): + def export_documents(self, converted_docs: Iterable[ConversionResult]): """Write converted documents to json files Check for successful conversions and write those to the docling artifacts directory. @@ -559,11 +570,11 @@ def export_documents(self, converted_docs: Iterable[ConvertedDocument]): # Export Deep Search document JSON format: with (docling_artifacts_path / f"{doc_filename}.json").open("w") as fp: - fp.write(json.dumps(doc.render_as_dict())) + fp.write(json.dumps(doc.legacy_document.export_to_dict())) # Export Markdown format: with (docling_artifacts_path / f"{doc_filename}.md").open("w") as fp: - fp.write(doc.render_as_markdown()) + fp.write(doc.legacy_document.export_to_markdown()) else: logger.info(f"Document {doc.input.file} failed to convert.") failure_count += 1 diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 302f7e37..a6f9b381 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -10,7 +10,9 @@ # Third Party from datasets import Dataset -from docling_parse.docling_parse import pdf_parser # pylint: disable=no-name-in-module + +# pylint: disable=no-name-in-module +from docling_parse.docling_parse import pdf_parser_v1 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS from instructlab.schema.taxonomy import ( TaxonomyMessageFormat, @@ -25,7 +27,7 @@ from .chunkers import DocumentChunker # Initialize the pdf parser -PDFParser = pdf_parser() +PDFParser = pdf_parser_v1() logger = logging.getLogger(__name__) @@ -165,7 +167,7 @@ def _get_documents( ) elif file_path.lower().endswith(".pdf"): - # Process PDF files using docling_parse's pdf_parser + # Process PDF files using docling_parse's pdf_parser_v1 doc_key = f"key_{os.path.basename(file_path)}" # Unique document key logger.info(f"Loading PDF document from {file_path}") diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index 7d2923d9..04970d24 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -5,9 +5,6 @@ import tempfile # Third Party -from docling.datamodel.base_models import PipelineOptions -from docling.datamodel.document import ConvertedDocument, DocumentConversionInput -from docling.document_converter import ConversionStatus, DocumentConverter import pytest # First Party