add function to parse pdf files using docling into jsonl files and call DocProcessor on those files
khaledsulayman committed Sep 25, 2024
1 parent 6edcb46 commit 1146df3
Showing 3 changed files with 18 additions and 20 deletions.
22 changes: 17 additions & 5 deletions src/instructlab/sdg/utils/chunking.py
@@ -1,12 +1,15 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
import json
import logging
import re
from pathlib import Path
from typing import List

# Third Party
from docling.document_converter import DocumentConverter

Check failure on line 11 in src/instructlab/sdg/utils/chunking.py (GitHub Actions / pylint): E0401: Unable to import 'docling.document_converter' (import-error)
from docling.datamodel.base_models import PipelineOptions

Check failure on line 12 in src/instructlab/sdg/utils/chunking.py (GitHub Actions / pylint): E0401: Unable to import 'docling.datamodel.base_models' (import-error)
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from instructlab.sdg.utils.docprocessor import DocProcessor

@@ -47,7 +50,7 @@ def _extract_filetypes_from_docs(documents: List):
return md_docs, pdf_docs


def chunk_document(documents: List | str, server_ctx_size, chunk_word_count, qna_yaml_path=None) -> List[str]:
def chunk_documents(documents: List | str, server_ctx_size, chunk_word_count, qna_yaml_path=None) -> List[str]:
"""
Iterate over the documents and split them into chunks based on the word count provided by the user.
Args:
@@ -123,10 +126,19 @@ def chunk_pdfs(pdf_docs: List, qna_yaml_path=None):
    TODO
    """
    tokenizer_name = "TODO"
    converter = DocumentConverter(pipeline_options=PipelineOptions())
    parsed_pdfs = converter.convert(pdf_docs)
    parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

    chunked_pdfs = []
    for doc in pdf_docs:
        dp = DocProcessor(Path(doc).parent, tokenizer_name, user_config_path=qna_yaml_path)
    # TODO
    docling_jsons_path = Path("TODO")

    for pd in parsed_dicts:
        fp = docling_jsons_path / "TODO.jsonl"

        with open(fp, 'w') as jsonl_file:
            for entry in pd:
                jsonl_file.write(json.dumps(entry) + '\n')

    chunked_pdfs = DocProcessor(parsed_doc_dir=docling_jsons_path, tokenizer_name=tokenizer_name, user_config_path="TODO")

    return chunked_pdfs
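
For readers following the new flow: docling parses the PDFs in memory, each parsed document is dumped to a JSONL file, and DocProcessor is pointed at the directory of dumps. A minimal sketch of where the TODOs appear headed, assuming an output directory, a file-naming scheme, a tokenizer name, and a final get_processed_dataset() call (that method is shown in docprocessor.py below); none of this is committed code:

    # Hypothetical sketch only: dump location, file naming, tokenizer, and
    # the final dataset call are assumptions, not the committed code.
    def _chunk_pdfs_sketch(pdf_docs: List, qna_yaml_path=None):
        converter = DocumentConverter(pipeline_options=PipelineOptions())
        parsed_dicts = [p.render_as_dict() for p in converter.convert(pdf_docs)]

        docling_jsons_path = Path("docling-jsons")  # assumed dump location
        docling_jsons_path.mkdir(parents=True, exist_ok=True)

        for i, pd in enumerate(parsed_dicts):
            # one JSONL file per parsed PDF; the naming scheme is a placeholder
            fp = docling_jsons_path / f"doc-{i}.jsonl"
            with open(fp, "w", encoding="utf-8") as jsonl_file:
                # mirrors the hunk above: each entry of the rendered dict
                # becomes one JSONL line
                for entry in pd:
                    jsonl_file.write(json.dumps(entry) + "\n")

        dp = DocProcessor(
            parsed_doc_dir=docling_jsons_path,
            tokenizer_name="TODO",  # left unresolved in the commit
            user_config_path=qna_yaml_path,
        )
        # assumed: callers want the processed chunks, not the processor itself
        return dp.get_processed_dataset()

Note that the hunk currently assigns the DocProcessor instance itself to chunked_pdfs; presumably a follow-up will call something like get_processed_dataset() so the function returns actual chunks, as sketched here.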
13 changes: 0 additions & 13 deletions src/instructlab/sdg/utils/docprocessor.py
@@ -5,8 +5,6 @@

# Third Party
from datasets import Dataset, concatenate_datasets
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import PipelineOptions
from tabulate import tabulate
from transformers import AutoTokenizer
import yaml
@@ -424,14 +422,3 @@ def get_processed_dataset(self) -> Dataset:
        return safe_concatenate_datasets(datasets)


def _parse_pdf_to_md(document):
    converter = DocumentConverter(pipeline_options=PipelineOptions())
    result = converter.convert_single(document)
    return result.output.export_to_markdown()


def ensure_markdown(document: str):
    if document.endswith(".pdf"):
        return _parse_pdf_to_md(document)
    return document
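
These deleted helpers were the previous PDF path: each document was normalized to markdown inline before chunking. A one-line sketch of the pattern they supported (the call site is assumed, not shown in this diff):

    # old pattern, now superseded by chunk_pdfs writing docling JSONL dumps
    md_docs = [ensure_markdown(doc) for doc in documents]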

3 changes: 1 addition & 2 deletions src/instructlab/sdg/utils/taxonomy.py
@@ -10,7 +10,6 @@
import tempfile

# Third Party
from instructlab.sdg.utils.docprocessor import ensure_markdown
from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
from instructlab.schema.taxonomy import (
TaxonomyMessageFormat,
@@ -275,7 +274,7 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count
    samples = []
    # document is the same for the whole leaf node
    chunks = (
        chunking.chunk_document(
        chunking.chunk_documents(
            documents=leaf_node[0]["document"],
            server_ctx_size=server_ctx_size,
            chunk_word_count=chunk_word_count,
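The diff view truncates this hunk mid-call. A hedged reconstruction of the full call site after the rename; the trailing qna_yaml_path argument and the closing lines are assumptions based on chunk_documents' new signature, not lines shown here:

    # hypothetical completion of the truncated call above
    chunks = (
        chunking.chunk_documents(
            documents=leaf_node[0]["document"],
            server_ctx_size=server_ctx_size,
            chunk_word_count=chunk_word_count,
            qna_yaml_path=qna_yaml_path,  # assumed to be threaded through from the caller
        )
    )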
