Skip to content

Commit

Permalink
Update testing for Document Chunker classes
Browse files Browse the repository at this point in the history
Signed-off-by: Khaled Sulayman <[email protected]>
  • Loading branch information
khaledsulayman committed Nov 1, 2024
1 parent 4ee15b4 commit de8b6f9
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 55 deletions.
7 changes: 4 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
datasets>=2.18.0,<3.0.0
docling>=1.15.0,<2.0.0
GitPython>=3.1.42,<4.0.0
httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
langchain-text-splitters
openai>=1.13.3,<2.0.0
# Note: this dependency goes along with langchain-text-splitters and may be
# removed once that one is removed.
# do not use 8.4.0 due to a bug in the library
# https://github.com/instructlab/instructlab/issues/1389
pypdf>=5.0.0
openai>=1.13.3,<2.0.0
tabulate>=0.9.0
tenacity>=8.3.0,!=8.4.0
transformers>=4.44.2
torch>=2.3.0,<2.4.0
transformers>=4.41.2
xdg-base-dirs>=6.0.1
144 changes: 144 additions & 0 deletions tests/test_chunkers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# SPDX-License-Identifier: Apache-2.0

# First Party
from instructlab.sdg.utils.chunkers import (
FileTypes,
DocumentChunker,
TextSplitChunker,
ContextAwareChunker,
)
from pathlib import Path

# Local
from .testdata import testdata

# Third Party
import pytest
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import ConversionStatus, DocumentConverter


@pytest.fixture
def documents_dir():
    """Return the path to the sample-documents directory shipped with the tests.

    Bug fix: ``Path(__file__)`` is the test *file* itself; joining onto it
    produces a path like ``.../test_chunkers.py/testdata/...`` which can never
    exist.  The fixture must anchor on the file's parent directory.
    """
    return Path(__file__).parent / "testdata" / "sample_documents"


@pytest.mark.parametrize(
    # Bug fix: parametrize was called with THREE positional arguments (the
    # argnames string plus two independent lists), which is not a valid
    # signature -- it takes one argvalues list where each entry supplies a
    # value for every name in the argnames string.  Pair each filepath list
    # with the chunker class it is expected to produce.
    "filepaths, chunker_type",
    [
        ([Path("document.md")], TextSplitChunker),
        ([Path("document.pdf")], ContextAwareChunker),
    ],
)
def test_chunker_factory(filepaths, chunker_type):
    """Test that the DocumentChunker factory class returns the proper Chunker type."""
    leaf_node = [
        {
            "documents": "Lorem ipsum",
            "taxonomy_path": Path(),
            "filepaths": filepaths,
        }
    ]
    chunker = DocumentChunker(leaf_node=leaf_node)
    assert isinstance(chunker, chunker_type)


def test_split_docs_by_filetype():
    """A leaf node mixing document types must be rejected by the factory.

    Bug fixes relative to the original:
    - dropped the undefined ``filepaths`` parameter (pytest would error with
      "fixture 'filepaths' not found");
    - ``invalid_leaf_node`` was referenced but never defined -- build one that
      mixes a Markdown and a PDF document;
    - ``pytest.raises`` takes the exception *class* (with an optional
      ``match=`` pattern), not an exception instance.
    """
    md_doc_path = Path("document.md")
    pdf_doc_path = Path("document.pdf")
    invalid_leaf_node = [
        {
            "documents": ["Lorem", "ipsum"],
            "taxonomy_path": Path(),
            "filepaths": [md_doc_path, pdf_doc_path],
        }
    ]
    # TODO: also assert on DocumentChunker._split_docs_by_filetype() output
    # once its return contract is settled.
    with pytest.raises(ValueError, match="Received multiple document types"):
        DocumentChunker(leaf_node=invalid_leaf_node)


class TestTextSplitChunker:
    """Tests for the word-count-based TextSplitChunker.

    NOTE(review): several tests below still call a ``chunking`` module that is
    not imported here (the old ``tests/test_chunking.py`` it came from was
    deleted in this commit) -- they need to be ported to the TextSplitChunker
    API before they can run.  Left in place, flagged, so the coverage intent
    is not lost.
    """

    def test_chunk_documents(self):
        # Bug fix: method was missing ``self`` -- pytest invokes test methods
        # on an instance, so the original would fail with a TypeError.
        # TODO: implement once the expected chunk output is pinned down.
        pass

    def test_chunk_docs_wc_exceeds_ctx_window(self):
        """A per-doc word count that cannot fit in the server context window raises."""
        # NOTE(review): ``chunking`` is undefined in this module -- port to
        # the new chunker API.
        with pytest.raises(ValueError) as exc:
            chunking.chunk_document(
                documents=testdata.documents,
                chunk_word_count=1000,
                server_ctx_size=1034,
            )
        assert (
            "Given word count (1000) per doc will exceed the server context window size (1034)"
            in str(exc.value)
        )

    def test_chunk_docs_long_lines(self):
        """No chunk may exceed the character budget implied by the word count."""
        # TODO see if this is applicable to context-aware
        chunk_words = 50
        chunks = chunking.chunk_document(
            documents=testdata.long_line_documents,
            chunk_word_count=chunk_words,
            server_ctx_size=4096,
        )
        max_tokens = chunking._num_tokens_from_words(chunk_words)
        max_chars = chunking._num_chars_from_tokens(max_tokens)
        max_chars += chunking._DEFAULT_CHUNK_OVERLAP  # add in the chunk overlap
        max_chars += 50  # and a bit extra for some really long words
        for chunk in chunks:
            assert len(chunk) <= max_chars

    def test_chunk_docs_chunk_overlap_error(self):
        """A chunk overlap larger than the chunk size itself must raise."""
        # TODO check if applicable to context-aware
        with pytest.raises(ValueError) as exc:
            chunking.chunk_document(
                documents=testdata.documents,
                chunk_word_count=5,
                server_ctx_size=1034,
            )
        assert (
            "Got a larger chunk overlap (100) than chunk size (24), should be smaller"
            in str(exc.value)
        )


class TestContextAwareChunker:
    """Placeholder tests for the docling-backed ContextAwareChunker.

    Bug fix: every method below was missing ``self`` -- pytest invokes test
    methods on an instance of the class, so each one would have failed with
    ``TypeError: ... takes 0 positional arguments but 1 was given`` instead
    of passing as an (intentionally empty) stub.
    """

    def test_chunk_documents(self):
        pass

    def test_path_validator(self):
        pass

    def test_load_qna_yaml(self):
        pass

    def test_process_parsed_docling_json(self):
        pass

    def test_fuse_texts(self):
        pass

    def test_create_tokenizer(self):
        pass

    def test_get_token_count(self):
        pass

    def test_add_heading_formatting(self):
        pass

    def test_generate_table_from_parsed_rep(self):
        pass

    def test_get_table(self):
        pass

    def test_get_table_page_number(self):
        pass

    def test_build_chunks_from_docling_json(self):
        pass

    def test_export_document(self):
        pass
52 changes: 0 additions & 52 deletions tests/test_chunking.py

This file was deleted.

0 comments on commit de8b6f9

Please sign in to comment.