Update testing for Document Chunker classes
Signed-off-by: Khaled Sulayman <[email protected]>
Co-authored-by: Aakanksha Duggal <[email protected]>
khaledsulayman and aakankshaduggal committed Nov 7, 2024
1 parent 6851ad5 commit 00d4d1f
Showing 6 changed files with 17 additions and 181 deletions.
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,16 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
datasets>=2.18.0,<3.0.0
docling>=1.15.0,<2.0.0
GitPython>=3.1.42,<4.0.0
httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
langchain-text-splitters
openai>=1.13.3,<2.0.0
# Note: this dependency goes along with langchain-text-splitters and may be
# removed once that one is removed.
# do not use 8.4.0 due to a bug in the library
# https://github.com/instructlab/instructlab/issues/1389
pypdf>=5.0.0
openai>=1.13.3,<2.0.0
tabulate>=0.9.0
tenacity>=8.3.0,!=8.4.0
torch>=2.3.0,<2.5.0
4 changes: 2 additions & 2 deletions src/instructlab/sdg/utils/chunkers.py
@@ -3,7 +3,7 @@
from collections import defaultdict
from enum import Enum
from pathlib import Path
from typing import DefaultDict, Iterable, List, Tuple
from typing import DefaultDict, Iterable, List, Tuple, Optional
import json
import logging
import re
@@ -50,7 +50,7 @@ def __new__(
cls,
leaf_node,
taxonomy_path,
output_dir: Path,
output_dir: Optional[Path],
server_ctx_size=4096,
chunk_word_count=1024,
tokenizer_model_name: str | None = None,
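For context, a minimal sketch of how the factory is called once `output_dir` is typed `Optional[Path]`. The argument values (sample-document path, tokenizer model name) mirror the updated test code in this commit rather than any separately documented API, so treat it as illustrative only:

```python
# Sketch mirroring tests/test_chunkers.py from this commit; the taxonomy_path and
# tokenizer_model_name values are taken from that test, not from other docs.
import tempfile
from pathlib import Path

from instructlab.sdg.utils.chunkers import DocumentChunker

leaf_node = [
    {
        "documents": ["Lorem ipsum"],
        "taxonomy_path": "",
        "filepaths": [Path("document.pdf")],
    }
]

with tempfile.TemporaryDirectory() as temp_dir:
    # output_dir is now Optional[Path]; a throwaway directory is passed here.
    chunker = DocumentChunker(
        leaf_node=leaf_node,
        taxonomy_path=Path("tests/testdata/sample_documents"),
        output_dir=temp_dir,
        tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
    )
```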
136 changes: 11 additions & 125 deletions tests/test_chunkers.py
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
import tempfile
from pathlib import Path

# Third Party
@@ -21,31 +22,9 @@
from .testdata import testdata


# def read_document_contents(document_path: Path):
# # TODO
# if document_path.suffix == ".md":
# pass
# if document_path.suffix == ".pdf":
# pass


def build_leaf_node(document_paths: str | list):
# TODO maybe check for directory
if not isinstance(document_paths, list):
document_paths = [document_paths]

return [
{
"taxonomy_path": "", # TODO
"filepaths": document_paths,
"documents": [read_document_contents(d) for d in document_paths],
}
]


@pytest.fixture
def documents_dir():
return Path(__file__) / "testdata" / "sample_documents"
return Path(__file__).parent / "testdata" / "sample_documents"


@pytest.mark.parametrize(
@@ -55,122 +34,29 @@ def documents_dir():
([Path("document.pdf")], ContextAwareChunker),
],
)
def test_chunker_factory(filepaths, chunker_type):
def test_chunker_factory(filepaths, chunker_type, documents_dir):
"""Test that the DocumentChunker factory class returns the proper Chunker type"""
leaf_node = [
{
"documents": ["Lorem ipsum"],
"taxonomy_path": "sample/path",
"taxonomy_path": "",
"filepaths": filepaths,
}
]
chunker = DocumentChunker(leaf_node=leaf_node)
assert isinstance(chunker, chunker_type)
with tempfile.TemporaryDirectory() as temp_dir:
chunker = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
assert isinstance(chunker, chunker_type)


def test_chunker_factory_unsupported_filetype():
def test_chunker_factory_unsupported_filetype(documents_dir):
"""Test that the DocumentChunker factory class fails when provided an unsupported document"""
leaf_node = [
{
"documents": ["Lorem ipsum"],
"taxonomy_path": "sample/path",
"taxonomy_path": "",
"filepaths": [Path("document.jpg")],
}
]
with pytest.raises(ValueError):
_ = DocumentChunker(leaf_node=leaf_node)


# class TestTextSplitChunker():
# @pytest.fixture
# def chunker():
# pass

# def test_chunk_documents():
# pass

# def test_chunk_docs_wc_exceeds_ctx_window(self):
# with pytest.raises(ValueError) as exc:
# chunking.chunk_document(
# documents=testdata.documents,
# chunk_word_count=1000,
# server_ctx_size=1034,
# )
# assert (
# "Given word count (1000) per doc will exceed the server context window size (1034)"
# in str(exc.value)
# )

# def test_chunk_docs_long_lines(self):
# # TODO see if this is applicable to context-aware
# chunk_words = 50
# chunks = chunking.chunk_document(
# documents=testdata.long_line_documents,
# chunk_word_count=chunk_words,
# server_ctx_size=4096,
# )
# max_tokens = chunking._num_tokens_from_words(chunk_words)
# max_chars = chunking._num_chars_from_tokens(max_tokens)
# max_chars += chunking._DEFAULT_CHUNK_OVERLAP # add in the chunk overlap
# max_chars += 50 # and a bit extra for some really long words
# for chunk in chunks:
# assert len(chunk) <= max_chars

# def test_chunk_docs_chunk_overlap_error(self):
# # TODO check if applicable to context-aware
# with pytest.raises(ValueError) as exc:
# chunking.chunk_document(
# documents=testdata.documents,
# chunk_word_count=5,
# server_ctx_size=1034,
# )
# assert (
# "Got a larger chunk overlap (100) than chunk size (24), should be smaller"
# in str(exc.value)
# )


# class TestContextAwareChunker():
# @pytest.fixture
# def chunker(documents_dir):
# pass

# def test_chunk_documents():
# pass

# def test_path_validator():
# pass

# def test_load_qna_yaml():
# pass

# def test_process_parsed_docling_json():
# pass

# def test_fuse_texts():
# pass

# def test_create_tokenizer():
# pass

# def test_get_token_count():
# pass

# def test_add_heading_formatting():
# pass

# def test_generate_table_from_parsed_rep():
# pass

# def test_get_table():
# pass

# def test_get_table_page_number():
# pass

# def test_build_chunks_from_docling_json():
# pass

# def test_export_document():
# pass

with tempfile.TemporaryDirectory() as temp_dir:
_ = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
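On the `tempfile.TemporaryDirectory()` pattern used in both tests above: pytest's built-in `tmp_path` fixture would cover the same need and clean up automatically. A sketch under that assumption, not part of this commit (importing `ContextAwareChunker` from `chunkers` is assumed to match the test module's own imports):

```python
# Sketch only: the factory test rewritten around pytest's tmp_path fixture.
from pathlib import Path

from instructlab.sdg.utils.chunkers import ContextAwareChunker, DocumentChunker


def test_chunker_factory_tmp_path(documents_dir, tmp_path):
    leaf_node = [
        {
            "documents": ["Lorem ipsum"],
            "taxonomy_path": "",
            "filepaths": [Path("document.pdf")],
        }
    ]
    chunker = DocumentChunker(
        leaf_node=leaf_node,
        taxonomy_path=documents_dir,
        output_dir=tmp_path,  # created and cleaned up by pytest
        tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
    )
    assert isinstance(chunker, ContextAwareChunker)
```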
52 changes: 0 additions & 52 deletions tests/test_chunking.py

This file was deleted.

Binary file added tests/testdata/sample_documents/moo_deng.pdf
2 changes: 2 additions & 0 deletions tests/testdata/sample_documents/qna.yaml
@@ -0,0 +1,2 @@
version: 3
domain: pop_culture
