From 1146df3730ba48e5c784b3a5d320f16f3513f9d5 Mon Sep 17 00:00:00 2001
From: Khaled Sulayman
Date: Wed, 25 Sep 2024 09:36:59 -0400
Subject: [PATCH] add function to parse pdf files into jsonl files using docling and call DocProcessor on those files

---
 src/instructlab/sdg/utils/chunking.py     | 22 +++++++++++++++++-----
 src/instructlab/sdg/utils/docprocessor.py | 13 -------------
 src/instructlab/sdg/utils/taxonomy.py     |  3 +--
 3 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunking.py b/src/instructlab/sdg/utils/chunking.py
index e2b9843a..94551cc1 100644
--- a/src/instructlab/sdg/utils/chunking.py
+++ b/src/instructlab/sdg/utils/chunking.py
@@ -1,12 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Standard
+import json
 import logging
 import re
 from pathlib import Path
 from typing import List
 
 # Third Party
+from docling.document_converter import DocumentConverter
+from docling.datamodel.base_models import PipelineOptions
 from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
 
 from instructlab.sdg.utils.docprocessor import DocProcessor
@@ -47,7 +50,7 @@ def _extract_filetypes_from_docs(documents: List):
     return md_docs, pdf_docs
 
 
-def chunk_document(documents: List | str, server_ctx_size, chunk_word_count, qna_yaml_path=None) -> List[str]:
+def chunk_documents(documents: List | str, server_ctx_size, chunk_word_count, qna_yaml_path=None) -> List[str]:
     """
     Iterate over the documents and split them into chunks based on the word count provided by the user.
     Args:
@@ -123,10 +126,19 @@ def chunk_pdfs(pdf_docs: List, qna_yaml_path=None):
     TODO
     """
     tokenizer_name = "TODO"
+    converter = DocumentConverter(pipeline_options=PipelineOptions())
+    parsed_pdfs = converter.convert(pdf_docs)
+    parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]
 
-    chunked_pdfs = []
-    for doc in pdf_docs:
-        dp = DocProcessor(Path(doc).parent, tokenizer_name, user_config_path=qna_yaml_path)
-        # TODO
+    docling_jsons_path = Path("TODO")
+
+    for pd in parsed_dicts:
+        fp = docling_jsons_path / "TODO.jsonl"
+
+        with open(fp, 'w') as jsonl_file:
+            for entry in pd:
+                jsonl_file.write(json.dumps(entry) + '\n')
+
+    chunked_pdfs = DocProcessor(parsed_doc_dir=docling_jsons_path, tokenizer_name=tokenizer_name, user_config_path="TODO")
 
     return chunked_pdfs
diff --git a/src/instructlab/sdg/utils/docprocessor.py b/src/instructlab/sdg/utils/docprocessor.py
index dc5616a1..45d8b6cc 100644
--- a/src/instructlab/sdg/utils/docprocessor.py
+++ b/src/instructlab/sdg/utils/docprocessor.py
@@ -5,8 +5,6 @@
 
 # Third Party
 from datasets import Dataset, concatenate_datasets
-from docling.document_converter import DocumentConverter
-from docling.datamodel.base_models import PipelineOptions
 from tabulate import tabulate
 from transformers import AutoTokenizer
 import yaml
@@ -424,14 +422,3 @@ def get_processed_dataset(self) -> Dataset:
         return safe_concatenate_datasets(datasets)
 
 
-def _parse_pdf_to_md(document):
-    converter = DocumentConverter(pipeline_options=PipelineOptions())
-    result = converter.convert_single(document)
-    return result.output.export_to_markdown()
-
-
-def ensure_markdown(document: str):
-    if document.endswith(".pdf"):
-        return _parse_pdf_to_md(document)
-    return document
-
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index 3a57ca91..0f16c3f7 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -10,7 +10,6 @@
 import tempfile
 
 # Third Party
-from instructlab.sdg.utils.docprocessor import ensure_markdown
 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
 from instructlab.schema.taxonomy import (
     TaxonomyMessageFormat,
@@ -275,7 +274,7 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count
     samples = []
     # document is the same for the whole leaf node
     chunks = (
-        chunking.chunk_document(
+        chunking.chunk_documents(
             documents=leaf_node[0]["document"],
             server_ctx_size=server_ctx_size,
             chunk_word_count=chunk_word_count,
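
A minimal sketch (not part of the commit above) of how the new chunk_pdfs flow might look once the remaining TODOs are filled in. The tokenizer name, the docling-jsons output directory, the one-record-per-document JSONL layout, and the reuse of qna_yaml_path as the DocProcessor user config are placeholder assumptions, not choices made by this patch; DocumentConverter, PipelineOptions, and DocProcessor.get_processed_dataset() are taken from the files touched above.

# Sketch only -- placeholder values are marked in comments; verify the
# docling calls against the pinned docling version before relying on them.
import json
from pathlib import Path
from typing import List

from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter

from instructlab.sdg.utils.docprocessor import DocProcessor


def chunk_pdfs(pdf_docs: List, qna_yaml_path=None):
    # Placeholder tokenizer; the patch leaves this as "TODO".
    tokenizer_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

    converter = DocumentConverter(pipeline_options=PipelineOptions())
    # Mirrors the call in the patch; depending on the docling release the
    # paths may need to be wrapped in a conversion-input object first.
    parsed_pdfs = converter.convert(pdf_docs)
    parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

    # Placeholder output directory; the patch leaves this as Path("TODO").
    docling_jsons_path = Path("docling-jsons")
    docling_jsons_path.mkdir(parents=True, exist_ok=True)

    # One JSONL file per parsed PDF. Writing the whole rendered dict as a
    # single record is an assumption; the record granularity DocProcessor
    # expects is still an open TODO in the patch.
    for i, parsed in enumerate(parsed_dicts):
        fp = docling_jsons_path / f"doc-{i}.jsonl"
        with open(fp, "w", encoding="utf-8") as jsonl_file:
            jsonl_file.write(json.dumps(parsed) + "\n")

    dp = DocProcessor(
        parsed_doc_dir=docling_jsons_path,
        tokenizer_name=tokenizer_name,
        user_config_path=qna_yaml_path,  # assumption: reuse the leaf node's qna.yaml
    )
    # get_processed_dataset() is the existing DocProcessor entry point shown in
    # the diff context; returning its output, rather than the DocProcessor
    # instance itself as the patch currently does, is presumably the intent.
    return dp.get_processed_dataset()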