From 1146df3730ba48e5c784b3a5d320f16f3513f9d5 Mon Sep 17 00:00:00 2001
From: Khaled Sulayman
Date: Wed, 25 Sep 2024 09:36:59 -0400
Subject: [PATCH] add function to parse pdf files into jsonl files using docling and call DocProcessor on those files

---
 src/instructlab/sdg/utils/chunking.py     | 22 +++++++++++++++++-----
 src/instructlab/sdg/utils/docprocessor.py | 13 -------------
 src/instructlab/sdg/utils/taxonomy.py     |  3 +--
 3 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunking.py b/src/instructlab/sdg/utils/chunking.py
index e2b9843a..94551cc1 100644
--- a/src/instructlab/sdg/utils/chunking.py
+++ b/src/instructlab/sdg/utils/chunking.py
@@ -1,12 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Standard
+import json
 import logging
 import re
 from pathlib import Path
 from typing import List
 
 # Third Party
+from docling.document_converter import DocumentConverter
+from docling.datamodel.base_models import PipelineOptions
 from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
 
 from instructlab.sdg.utils.docprocessor import DocProcessor
@@ -47,7 +50,7 @@ def _extract_filetypes_from_docs(documents: List):
     return md_docs, pdf_docs
 
 
-def chunk_document(documents: List | str, server_ctx_size, chunk_word_count, qna_yaml_path=None) -> List[str]:
+def chunk_documents(documents: List | str, server_ctx_size, chunk_word_count, qna_yaml_path=None) -> List[str]:
     """
     Iterate over the documents and split them into chunks based on the word count provided by the user.
     Args:
@@ -123,10 +126,19 @@ def chunk_pdfs(pdf_docs: List, qna_yaml_path=None):
     TODO
     """
     tokenizer_name = "TODO"
+    converter = DocumentConverter(pipeline_options=PipelineOptions())
+    parsed_pdfs = converter.convert(pdf_docs)
+    parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]
 
-    chunked_pdfs = []
-    for doc in pdf_docs:
-        dp = DocProcessor(Path(doc).parent, tokenizer_name, user_config_path=qna_yaml_path)
-        # TODO
+    docling_jsons_path = Path("TODO")
+
+    for pd in parsed_dicts:
+        fp = docling_jsons_path / "TODO.jsonl"
+
+        with open(fp, 'w') as jsonl_file:
+            for entry in pd:
+                jsonl_file.write(json.dumps(entry) + '\n')
+
+    chunked_pdfs = DocProcessor(parsed_doc_dir=docling_jsons_path, tokenizer_name=tokenizer_name, user_config_path="TODO")
 
     return chunked_pdfs
diff --git a/src/instructlab/sdg/utils/docprocessor.py b/src/instructlab/sdg/utils/docprocessor.py
index dc5616a1..45d8b6cc 100644
--- a/src/instructlab/sdg/utils/docprocessor.py
+++ b/src/instructlab/sdg/utils/docprocessor.py
@@ -5,8 +5,6 @@
 
 # Third Party
 from datasets import Dataset, concatenate_datasets
-from docling.document_converter import DocumentConverter
-from docling.datamodel.base_models import PipelineOptions
 from tabulate import tabulate
 from transformers import AutoTokenizer
 import yaml
@@ -424,14 +422,3 @@ def get_processed_dataset(self) -> Dataset:
         return safe_concatenate_datasets(datasets)
 
 
-def _parse_pdf_to_md(document):
-    converter = DocumentConverter(pipeline_options=PipelineOptions())
-    result = converter.convert_single(document)
-    return result.output.export_to_markdown()
-
-
-def ensure_markdown(document: str):
-    if document.endswith(".pdf"):
-        return _parse_pdf_to_md(document)
-    return document
-
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index 3a57ca91..0f16c3f7 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -10,7 +10,6 @@
 import tempfile
 
 # Third Party
-from instructlab.sdg.utils.docprocessor import ensure_markdown
 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
 from instructlab.schema.taxonomy import (
     TaxonomyMessageFormat,
@@ -275,7 +274,7 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count
     samples = []
     # document is the same for the whole leaf node
     chunks = (
-        chunking.chunk_document(
+        chunking.chunk_documents(
             documents=leaf_node[0]["document"],
             server_ctx_size=server_ctx_size,
             chunk_word_count=chunk_word_count,
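
A minimal sketch (not part of the commit above) of how the new chunk_pdfs flow might look once the remaining TODOs are filled in. The tokenizer name, the docling-jsons output directory, the one-record-per-document JSONL layout, and the reuse of qna_yaml_path as the DocProcessor user config are placeholder assumptions, not choices made by this patch; DocumentConverter, PipelineOptions, and DocProcessor.get_processed_dataset() are taken from the files touched above.

# Sketch only -- placeholder values are marked in comments; verify the
# docling calls against the pinned docling version before relying on them.
import json
from pathlib import Path
from typing import List

from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter

from instructlab.sdg.utils.docprocessor import DocProcessor


def chunk_pdfs(pdf_docs: List, qna_yaml_path=None):
    # Placeholder tokenizer; the patch leaves this as "TODO".
    tokenizer_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

    converter = DocumentConverter(pipeline_options=PipelineOptions())
    # Mirrors the call in the patch; depending on the docling release the
    # paths may need to be wrapped in a conversion-input object first.
    parsed_pdfs = converter.convert(pdf_docs)
    parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

    # Placeholder output directory; the patch leaves this as Path("TODO").
    docling_jsons_path = Path("docling-jsons")
    docling_jsons_path.mkdir(parents=True, exist_ok=True)

    # One JSONL file per parsed PDF. Writing the whole rendered dict as a
    # single record is an assumption; the record granularity DocProcessor
    # expects is still an open TODO in the patch.
    for i, parsed in enumerate(parsed_dicts):
        fp = docling_jsons_path / f"doc-{i}.jsonl"
        with open(fp, "w", encoding="utf-8") as jsonl_file:
            jsonl_file.write(json.dumps(parsed) + "\n")

    dp = DocProcessor(
        parsed_doc_dir=docling_jsons_path,
        tokenizer_name=tokenizer_name,
        user_config_path=qna_yaml_path,  # assumption: reuse the leaf node's qna.yaml
    )
    # get_processed_dataset() is the existing DocProcessor entry point shown in
    # the diff context; returning its output, rather than the DocProcessor
    # instance itself as the patch currently does, is presumably the intent.
    return dp.get_processed_dataset()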