Update testing for Document Chunker classes
Signed-off-by: Khaled Sulayman <[email protected]>
1 parent 4ee15b4, commit de8b6f9
Showing 3 changed files with 148 additions and 55 deletions.
@@ -1,17 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
datasets>=2.18.0,<3.0.0
docling>=1.15.0,<2.0.0
GitPython>=3.1.42,<4.0.0
httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
langchain-text-splitters
openai>=1.13.3,<2.0.0
# Note: this dependency goes along with langchain-text-splitters and may be
# removed once that one is removed.
# do not use 8.4.0 due to a bug in the library
# https://github.com/instructlab/instructlab/issues/1389
pypdf>=5.0.0
openai>=1.13.3,<2.0.0
tabulate>=0.9.0
tenacity>=8.3.0,!=8.4.0
transformers>=4.44.2
torch>=2.3.0,<2.4.0
transformers>=4.41.2
xdg-base-dirs>=6.0.1
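The tenacity line above pins a floor version while excluding the broken 8.4.0 release. As a quick sanity check, not part of this commit, the specifier can be evaluated with the packaging library, which implements the same version-specifier grammar pip uses for requirement lines:

# Sketch: confirm the tenacity specifier admits 8.3.0 and 8.5.0 but
# rejects the buggy 8.4.0 release called out in the comment above.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=8.3.0,!=8.4.0")
assert Version("8.3.0") in spec
assert Version("8.5.0") in spec
assert Version("8.4.0") not in spec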
@@ -0,0 +1,144 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from pathlib import Path

# Third Party
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import ConversionStatus, DocumentConverter
import pytest

# First Party
from instructlab.sdg.utils import chunking  # legacy word-count helpers exercised below
from instructlab.sdg.utils.chunkers import (
    ContextAwareChunker,
    DocumentChunker,
    FileTypes,
    TextSplitChunker,
)

# Local
from .testdata import testdata

@pytest.fixture
def documents_dir():
    # resolve relative to the test module's directory, not the file path itself
    return Path(__file__).parent / "testdata" / "sample_documents"

@pytest.mark.parametrize(
    "filepaths, chunker_type",
    [
        ([Path("document.md")], TextSplitChunker),
        ([Path("document.pdf")], ContextAwareChunker),
    ],
)
def test_chunker_factory(filepaths, chunker_type):
    """Test that the DocumentChunker factory class returns the proper Chunker type"""
    leaf_node = [
        {
            "documents": "Lorem ipsum",
            "taxonomy_path": Path(),
            "filepaths": filepaths,
        }
    ]
    chunker = DocumentChunker(leaf_node=leaf_node)
    assert isinstance(chunker, chunker_type)

def test_split_docs_by_filetype():
    documents = ["Lorem", "ipsum"]
    md_doc_path = Path("document.md")
    pdf_doc_path = Path("document.pdf")

    # TODO: call DocumentChunker._split_docs_by_filetype() with the documents
    # and paths above and assert on the returned mapping
    # doc_dict = DocumentChunker._split_docs_by_filetype(...)

    # a leaf node mixing filetypes should be rejected by the factory
    invalid_leaf_node = [
        {
            "documents": documents,
            "taxonomy_path": Path(),
            "filepaths": [md_doc_path, pdf_doc_path],
        }
    ]
    with pytest.raises(ValueError, match="Received multiple document types"):
        DocumentChunker(leaf_node=invalid_leaf_node)

class TestTextSplitChunker:
    def test_chunk_documents(self):
        pass

    def test_chunk_docs_wc_exceeds_ctx_window(self):
        with pytest.raises(ValueError) as exc:
            chunking.chunk_document(
                documents=testdata.documents,
                chunk_word_count=1000,
                server_ctx_size=1034,
            )
        assert (
            "Given word count (1000) per doc will exceed the server context window size (1034)"
            in str(exc.value)
        )

    def test_chunk_docs_long_lines(self):
        # TODO see if this is applicable to context-aware
        chunk_words = 50
        chunks = chunking.chunk_document(
            documents=testdata.long_line_documents,
            chunk_word_count=chunk_words,
            server_ctx_size=4096,
        )
        max_tokens = chunking._num_tokens_from_words(chunk_words)
        max_chars = chunking._num_chars_from_tokens(max_tokens)
        max_chars += chunking._DEFAULT_CHUNK_OVERLAP  # add in the chunk overlap
        max_chars += 50  # and a bit extra for some really long words
        for chunk in chunks:
            assert len(chunk) <= max_chars

    def test_chunk_docs_chunk_overlap_error(self):
        # TODO check if applicable to context-aware
        with pytest.raises(ValueError) as exc:
            chunking.chunk_document(
                documents=testdata.documents,
                chunk_word_count=5,
                server_ctx_size=1034,
            )
        assert (
            "Got a larger chunk overlap (100) than chunk size (24), should be smaller"
            in str(exc.value)
        )

class TestContextAwareChunker:
    def test_chunk_documents(self):
        pass

    def test_path_validator(self):
        pass

    def test_load_qna_yaml(self):
        pass

    def test_process_parsed_docling_json(self):
        pass

    def test_fuse_texts(self):
        pass

    def test_create_tokenizer(self):
        pass

    def test_get_token_count(self):
        pass

    def test_add_heading_formatting(self):
        pass

    def test_generate_table_from_parsed_rep(self):
        pass

    def test_get_table(self):
        pass

    def test_get_table_page_number(self):
        pass

    def test_build_chunks_from_docling_json(self):
        pass

    def test_export_document(self):
        pass
This file was deleted.
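As an aside on the factory pattern that test_chunker_factory exercises: instantiating DocumentChunker is expected to hand back one of its concrete subclasses. Below is a minimal hypothetical sketch of such dispatch, keyed off the leaf node's file suffixes; it is not the implementation in instructlab.sdg.utils.chunkers, whose actual selection logic the tests above verify.

# Hypothetical stand-in for the factory under test; everything other than the
# class names and the ValueError message is illustrative.
from pathlib import Path


class DocumentChunker:
    """Instantiating the factory returns a concrete chunker subclass."""

    def __new__(cls, leaf_node):
        # collect every file suffix referenced by the leaf node
        suffixes = {Path(p).suffix for node in leaf_node for p in node["filepaths"]}
        if suffixes == {".md"}:
            return super().__new__(TextSplitChunker)
        if suffixes == {".pdf"}:
            return super().__new__(ContextAwareChunker)
        raise ValueError(f"Received multiple document types: {suffixes}")

    def __init__(self, leaf_node):
        self.leaf_node = leaf_node


class TextSplitChunker(DocumentChunker):
    """Word-count based splitting for markdown sources."""


class ContextAwareChunker(DocumentChunker):
    """Docling-backed chunking for PDF sources."""


# usage mirrors the parametrized test above
chunker = DocumentChunker(leaf_node=[{"filepaths": [Path("doc.pdf")]}])
assert isinstance(chunker, ContextAwareChunker)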