add function to parse pdf files using docling into jsonl files and call DocProcessor on those files
khaledsulayman committed Sep 25, 2024
1 parent 6edcb46 commit 1146df3
Showing 3 changed files with 18 additions and 20 deletions.
22 changes: 17 additions & 5 deletions src/instructlab/sdg/utils/chunking.py
@@ -1,12 +1,15 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
import json
import logging
import re
from pathlib import Path
from typing import List

# Third Party
from docling.document_converter import DocumentConverter

Check failure on line 11 in src/instructlab/sdg/utils/chunking.py (GitHub Actions / pylint): E0401: Unable to import 'docling.document_converter' (import-error)
from docling.datamodel.base_models import PipelineOptions

Check failure on line 12 in src/instructlab/sdg/utils/chunking.py (GitHub Actions / pylint): E0401: Unable to import 'docling.datamodel.base_models' (import-error)
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from instructlab.sdg.utils.docprocessor import DocProcessor

@@ -47,7 +50,7 @@ def _extract_filetypes_from_docs(documents: List):
return md_docs, pdf_docs


def chunk_document(documents: List | str, server_ctx_size, chunk_word_count, qna_yaml_path=None) -> List[str]:
def chunk_documents(documents: List | str, server_ctx_size, chunk_word_count, qna_yaml_path=None) -> List[str]:
"""
Iterate over the documents and split them into chunks based on the word count provided by the user.
Args:
@@ -123,10 +126,19 @@ def chunk_pdfs(pdf_docs: List, qna_yaml_path=None):
    TODO
    """
    tokenizer_name = "TODO"
    converter = DocumentConverter(pipeline_options=PipelineOptions())
    parsed_pdfs = converter.convert(pdf_docs)
    parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

    chunked_pdfs = []
    for doc in pdf_docs:
        dp = DocProcessor(Path(doc).parent, tokenizer_name, user_config_path=qna_yaml_path)
    # TODO
    docling_jsons_path = Path("TODO")

    for pd in parsed_dicts:
        fp = docling_jsons_path / "TODO.jsonl"

        with open(fp, 'w') as jsonl_file:
            for entry in pd:
                jsonl_file.write(json.dumps(entry) + '\n')

    chunked_pdfs = DocProcessor(parsed_doc_dir=docling_jsons_path, tokenizer_name=tokenizer_name, user_config_path="TODO")

    return chunked_pdfs
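
For readers following the new flow: docling parses the PDFs in memory, each parsed document is dumped to a JSONL file, and DocProcessor is pointed at the directory of dumps. A minimal sketch of where the TODOs appear headed, assuming an output directory, a file-naming scheme, a tokenizer name, and a final get_processed_dataset() call (that method is shown in docprocessor.py below); none of this is committed code:

    # Hypothetical sketch only: dump location, file naming, tokenizer, and
    # the final dataset call are assumptions, not the committed code.
    def _chunk_pdfs_sketch(pdf_docs: List, qna_yaml_path=None):
        converter = DocumentConverter(pipeline_options=PipelineOptions())
        parsed_dicts = [p.render_as_dict() for p in converter.convert(pdf_docs)]

        docling_jsons_path = Path("docling-jsons")  # assumed dump location
        docling_jsons_path.mkdir(parents=True, exist_ok=True)

        for i, pd in enumerate(parsed_dicts):
            # one JSONL file per parsed PDF; the naming scheme is a placeholder
            fp = docling_jsons_path / f"doc-{i}.jsonl"
            with open(fp, "w", encoding="utf-8") as jsonl_file:
                # mirrors the hunk above: each entry of the rendered dict
                # becomes one JSONL line
                for entry in pd:
                    jsonl_file.write(json.dumps(entry) + "\n")

        dp = DocProcessor(
            parsed_doc_dir=docling_jsons_path,
            tokenizer_name="TODO",  # left unresolved in the commit
            user_config_path=qna_yaml_path,
        )
        # assumed: callers want the processed chunks, not the processor itself
        return dp.get_processed_dataset()

Note that the hunk currently assigns the DocProcessor instance itself to chunked_pdfs; presumably a follow-up will call something like get_processed_dataset() so the function returns actual chunks, as sketched here.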
13 changes: 0 additions & 13 deletions src/instructlab/sdg/utils/docprocessor.py
@@ -5,8 +5,6 @@

# Third Party
from datasets import Dataset, concatenate_datasets
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import PipelineOptions
from tabulate import tabulate
from transformers import AutoTokenizer
import yaml
@@ -424,14 +422,3 @@ def get_processed_dataset(self) -> Dataset:
        return safe_concatenate_datasets(datasets)


def _parse_pdf_to_md(document):
    converter = DocumentConverter(pipeline_options=PipelineOptions())
    result = converter.convert_single(document)
    return result.output.export_to_markdown()


def ensure_markdown(document: str):
    if document.endswith(".pdf"):
        return _parse_pdf_to_md(document)
    return document
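
These deleted helpers were the previous PDF path: each document was normalized to markdown inline before chunking. A one-line sketch of the pattern they supported (the call site is assumed, not shown in this diff):

    # old pattern, now superseded by chunk_pdfs writing docling JSONL dumps
    md_docs = [ensure_markdown(doc) for doc in documents]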

3 changes: 1 addition & 2 deletions src/instructlab/sdg/utils/taxonomy.py
@@ -10,7 +10,6 @@
import tempfile

# Third Party
from instructlab.sdg.utils.docprocessor import ensure_markdown
from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
from instructlab.schema.taxonomy import (
TaxonomyMessageFormat,
@@ -275,7 +274,7 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count
    samples = []
    # document is the same for the whole leaf node
    chunks = (
        chunking.chunk_document(
        chunking.chunk_documents(
            documents=leaf_node[0]["document"],
            server_ctx_size=server_ctx_size,
            chunk_word_count=chunk_word_count,
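The diff view truncates this hunk mid-call. A hedged reconstruction of the full call site after the rename; the trailing qna_yaml_path argument and the closing lines are assumptions based on chunk_documents' new signature, not lines shown here:

    # hypothetical completion of the truncated call above
    chunks = (
        chunking.chunk_documents(
            documents=leaf_node[0]["document"],
            server_ctx_size=server_ctx_size,
            chunk_word_count=chunk_word_count,
            qna_yaml_path=qna_yaml_path,  # assumed to be threaded through from the caller
        )
    )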
