Skip to content

Commit

Permalink
Update testing for Document Chunker classes
Browse files Browse the repository at this point in the history
Signed-off-by: Khaled Sulayman <[email protected]>
Co-authored-by: Aakanksha Duggal <[email protected]>
  • Loading branch information
khaledsulayman and aakankshaduggal committed Nov 6, 2024
1 parent 6851ad5 commit e5d35f8
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 68 deletions.
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
datasets>=2.18.0,<3.0.0
docling>=1.15.0,<2.0.0
GitPython>=3.1.42,<4.0.0
httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
langchain-text-splitters
openai>=1.13.3,<2.0.0
# Note: this dependency goes along with langchain-text-splitters and may be
# removed once that one is removed.
# do not use 8.4.0 due to a bug in the library
# https://github.com/instructlab/instructlab/issues/1389
pypdf>=5.0.0
tabulate>=0.9.0
tenacity>=8.3.0,!=8.4.0
torch>=2.3.0,<2.5.0
Expand Down
64 changes: 50 additions & 14 deletions tests/test_chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
# Local
from .testdata import testdata


# def read_document_contents(document_path: Path):
# # TODO
# if document_path.suffix == ".md":
Expand All @@ -29,18 +28,18 @@
# pass


def build_leaf_node(document_paths: str | list):
# TODO maybe check for directory
if not isinstance(document_paths, list):
document_paths = [document_paths]
# def build_leaf_node(document_paths: str | list):
# # TODO maybe check for directory
# if not isinstance(document_paths, list):
# document_paths = [document_paths]

return [
{
"taxonomy_path": "", # TODO
"filepaths": document_paths,
"documents": [read_document_contents(d) for d in document_paths],
}
]
# return [
# {
# "taxonomy_path": "", # TODO
# "filepaths": document_paths,
# "documents": [read_document_contents(d) for d in document_paths],
# }
# ]


@pytest.fixture
Expand Down Expand Up @@ -130,7 +129,45 @@ def test_chunker_factory_unsupported_filetype():
# )


# class TestContextAwareChunker():
class TestContextAwareChunker:
    """Tests for ContextAwareChunker PDF parsing, chunking, and tokenizer creation.

    Bug fix: the fixture and test methods live inside a class, so pytest binds
    them as instance methods — each must accept ``self`` as its first
    parameter, and fixtures are injected by name after ``self``. Without
    ``self`` these raised TypeError / fixture-resolution errors at run time.
    """

    @pytest.fixture
    def sample_pdf_path(self):
        """Path to a sample PDF used by the parsing test."""
        # NOTE(review): placeholder path — replace with a real checked-in
        # test asset before enabling in CI.
        return Path("sample_documents/sample.pdf")  # Replace with actual test path

    def test_context_aware_chunker_pdf_parsing(self, sample_pdf_path):
        """Test that ContextAwareChunker correctly parses and chunks PDF content."""
        leaf_node = [
            {
                "documents": ["Sample PDF content"],
                "taxonomy_path": "sample/path",
                "filepaths": [sample_pdf_path],
            }
        ]
        chunker = ContextAwareChunker(leaf_node=leaf_node)
        chunks = chunker.chunk_documents()
        assert chunks  # Ensure chunks were created
        assert all(
            "content" in chunk for chunk in chunks
        )  # Verify content presence in chunks

    def test_context_aware_chunker_tokenizer(self):
        """Test that the tokenizer is created and token count is accurate."""
        leaf_node = [
            {
                "documents": ["Sample content"],
                "taxonomy_path": "sample/path",
                "filepaths": [Path("document.pdf")],
            }
        ]
        chunker = ContextAwareChunker(leaf_node=leaf_node)
        tokenizer = chunker.create_tokenizer(
            "sample_model_name"
        )  # Pass in model name if required
        tokens = tokenizer.encode("Test text")
        assert tokens  # Ensure tokenization occurred
        assert len(tokens) > 0


# @pytest.fixture
# def chunker(documents_dir):
# pass
Expand Down Expand Up @@ -173,4 +210,3 @@ def test_chunker_factory_unsupported_filetype():

# def test_export_document():
# pass

52 changes: 0 additions & 52 deletions tests/test_chunking.py

This file was deleted.

0 comments on commit e5d35f8

Please sign in to comment.