From 54ae021479197d7ed450aabc004027591aae38cb Mon Sep 17 00:00:00 2001
From: Ben Browning <bbrownin@redhat.com>
Date: Thu, 7 Nov 2024 12:27:32 -0500
Subject: [PATCH] Move to Docling v2 APIs

This bumps our Docling dependency to version 2 while relying on the
backwards-compatibility layer that Docling provides for pdf_parser_v1
and the legacy document output formats.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
---
 requirements.txt                      |  2 +-
 src/instructlab/sdg/utils/chunkers.py | 29 ++++++++++++++++++---------
 src/instructlab/sdg/utils/taxonomy.py |  8 +++++---
 tests/test_chunkers.py                |  3 ---
 4 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 0f48ca93..6222b1f8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
-docling>=1.15.0,<2.0.0
+docling>=2.3.0,<3.0.0
 GitPython>=3.1.42,<4.0.0
 httpx>=0.25.0,<1.0.0
 instructlab-schema>=0.4.0
diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 97cb9084..881153dc 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -10,8 +10,15 @@
 
 # Third Party
 from datasets import Dataset
-from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
-from docling.document_converter import ConversionStatus, DocumentConverter
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import (
+    ConversionStatus,
+    DocumentConverter,
+    PdfFormatOption,
+)
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
 from tabulate import tabulate
 from transformers import AutoTokenizer
@@ -210,10 +217,14 @@ def chunk_documents(self) -> List:
         if self.document_paths == []:
             return []
 
-        model_artifacts_path = DocumentConverter.download_models_hf()
-        converter = DocumentConverter(artifacts_path=model_artifacts_path)
-        inputs = DocumentConversionInput.from_paths(self.filepaths)
-        parsed_documents = converter.convert(inputs)
+        model_artifacts_path = StandardPdfPipeline.download_models_hf()
+        pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path)
+        converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+            }
+        )
+        parsed_documents = converter.convert_all(self.filepaths)
 
         docling_artifacts_path = self.export_documents(parsed_documents)
 
@@ -539,7 +550,7 @@ def build_chunks_from_docling_json(
             document_chunks.append("\n\n".join(current_buffer))
         return document_chunks
 
-    def export_documents(self, converted_docs: Iterable[ConvertedDocument]):
+    def export_documents(self, converted_docs: Iterable[ConversionResult]):
         """Write converted documents to json files
 
         Check for successful conversions and write those to the docling artifacts directory.
@@ -559,11 +570,11 @@ def export_documents(self, converted_docs: Iterable[ConvertedDocument]):
 
                 # Export Deep Search document JSON format:
                 with (docling_artifacts_path / f"{doc_filename}.json").open("w") as fp:
-                    fp.write(json.dumps(doc.render_as_dict()))
+                    fp.write(json.dumps(doc.legacy_document.export_to_dict()))
 
                 # Export Markdown format:
                 with (docling_artifacts_path / f"{doc_filename}.md").open("w") as fp:
-                    fp.write(doc.render_as_markdown())
+                    fp.write(doc.legacy_document.export_to_markdown())
             else:
                 logger.info(f"Document {doc.input.file} failed to convert.")
                 failure_count += 1
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index 302f7e37..a6f9b381 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -10,7 +10,9 @@
 
 # Third Party
 from datasets import Dataset
-from docling_parse.docling_parse import pdf_parser  # pylint: disable=no-name-in-module
+
+# pylint: disable=no-name-in-module
+from docling_parse.docling_parse import pdf_parser_v1
 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
 from instructlab.schema.taxonomy import (
     TaxonomyMessageFormat,
@@ -25,7 +27,7 @@
 from .chunkers import DocumentChunker
 
 # Initialize the pdf parser
-PDFParser = pdf_parser()
+PDFParser = pdf_parser_v1()
 
 logger = logging.getLogger(__name__)
 
@@ -165,7 +167,7 @@ def _get_documents(
                                 )
 
                         elif file_path.lower().endswith(".pdf"):
-                            # Process PDF files using docling_parse's pdf_parser
+                            # Process PDF files using docling_parse's pdf_parser_v1
                             doc_key = f"key_{os.path.basename(file_path)}"  # Unique document key
                             logger.info(f"Loading PDF document from {file_path}")
 
diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py
index 7d2923d9..04970d24 100644
--- a/tests/test_chunkers.py
+++ b/tests/test_chunkers.py
@@ -5,9 +5,6 @@
 import tempfile
 
 # Third Party
-from docling.datamodel.base_models import PipelineOptions
-from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
-from docling.document_converter import ConversionStatus, DocumentConverter
 import pytest
 
 # First Party