Update testing for Document Chunker classes
Signed-off-by: Khaled Sulayman <[email protected]>
1 parent 4ee15b4, commit de8b6f9
Showing 3 changed files with 148 additions and 55 deletions.
@@ -1,17 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
datasets>=2.18.0,<3.0.0
docling>=1.15.0,<2.0.0
GitPython>=3.1.42,<4.0.0
httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
langchain-text-splitters
openai>=1.13.3,<2.0.0
# Note: this dependency goes along with langchain-text-splitters and may be
# removed once that one is removed.
# do not use 8.4.0 due to a bug in the library
# https://github.com/instructlab/instructlab/issues/1389
pypdf>=5.0.0
openai>=1.13.3,<2.0.0
tabulate>=0.9.0
tenacity>=8.3.0,!=8.4.0
transformers>=4.44.2
torch>=2.3.0,<2.4.0
transformers>=4.41.2
xdg-base-dirs>=6.0.1
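The tenacity line above pins a floor version while excluding the broken 8.4.0 release. As a quick sanity check, not part of this commit, the specifier can be evaluated with the packaging library, which implements the same version-specifier grammar pip uses for requirement lines:

# Sketch: confirm the tenacity specifier admits 8.3.0 and 8.5.0 but
# rejects the buggy 8.4.0 release called out in the comment above.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=8.3.0,!=8.4.0")
assert Version("8.3.0") in spec
assert Version("8.5.0") in spec
assert Version("8.4.0") not in spec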
@@ -0,0 +1,144 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from pathlib import Path

# Third Party
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import ConversionStatus, DocumentConverter
import pytest

# First Party
from instructlab.sdg.utils import chunking  # legacy word-count helpers exercised below
from instructlab.sdg.utils.chunkers import (
    ContextAwareChunker,
    DocumentChunker,
    FileTypes,
    TextSplitChunker,
)

# Local
from .testdata import testdata

@pytest.fixture
def documents_dir():
    # resolve relative to the test module's directory, not the file path itself
    return Path(__file__).parent / "testdata" / "sample_documents"

@pytest.mark.parametrize(
    "filepaths, chunker_type",
    [
        ([Path("document.md")], TextSplitChunker),
        ([Path("document.pdf")], ContextAwareChunker),
    ],
)
def test_chunker_factory(filepaths, chunker_type):
    """Test that the DocumentChunker factory class returns the proper Chunker type"""
    leaf_node = [
        {
            "documents": "Lorem ipsum",
            "taxonomy_path": Path(),
            "filepaths": filepaths,
        }
    ]
    chunker = DocumentChunker(leaf_node=leaf_node)
    assert isinstance(chunker, chunker_type)

def test_split_docs_by_filetype():
    documents = ["Lorem", "ipsum"]
    md_doc_path = Path("document.md")
    pdf_doc_path = Path("document.pdf")

    # TODO: call DocumentChunker._split_docs_by_filetype() with the documents
    # and paths above and assert on the returned mapping
    # doc_dict = DocumentChunker._split_docs_by_filetype(...)

    # a leaf node mixing filetypes should be rejected by the factory
    invalid_leaf_node = [
        {
            "documents": documents,
            "taxonomy_path": Path(),
            "filepaths": [md_doc_path, pdf_doc_path],
        }
    ]
    with pytest.raises(ValueError, match="Received multiple document types"):
        DocumentChunker(leaf_node=invalid_leaf_node)

class TestTextSplitChunker:
    def test_chunk_documents(self):
        pass

    def test_chunk_docs_wc_exceeds_ctx_window(self):
        with pytest.raises(ValueError) as exc:
            chunking.chunk_document(
                documents=testdata.documents,
                chunk_word_count=1000,
                server_ctx_size=1034,
            )
        assert (
            "Given word count (1000) per doc will exceed the server context window size (1034)"
            in str(exc.value)
        )

    def test_chunk_docs_long_lines(self):
        # TODO see if this is applicable to context-aware
        chunk_words = 50
        chunks = chunking.chunk_document(
            documents=testdata.long_line_documents,
            chunk_word_count=chunk_words,
            server_ctx_size=4096,
        )
        max_tokens = chunking._num_tokens_from_words(chunk_words)
        max_chars = chunking._num_chars_from_tokens(max_tokens)
        max_chars += chunking._DEFAULT_CHUNK_OVERLAP  # add in the chunk overlap
        max_chars += 50  # and a bit extra for some really long words
        for chunk in chunks:
            assert len(chunk) <= max_chars

    def test_chunk_docs_chunk_overlap_error(self):
        # TODO check if applicable to context-aware
        with pytest.raises(ValueError) as exc:
            chunking.chunk_document(
                documents=testdata.documents,
                chunk_word_count=5,
                server_ctx_size=1034,
            )
        assert (
            "Got a larger chunk overlap (100) than chunk size (24), should be smaller"
            in str(exc.value)
        )

class TestContextAwareChunker:
    def test_chunk_documents(self):
        pass

    def test_path_validator(self):
        pass

    def test_load_qna_yaml(self):
        pass

    def test_process_parsed_docling_json(self):
        pass

    def test_fuse_texts(self):
        pass

    def test_create_tokenizer(self):
        pass

    def test_get_token_count(self):
        pass

    def test_add_heading_formatting(self):
        pass

    def test_generate_table_from_parsed_rep(self):
        pass

    def test_get_table(self):
        pass

    def test_get_table_page_number(self):
        pass

    def test_build_chunks_from_docling_json(self):
        pass

    def test_export_document(self):
        pass
This file was deleted.
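As an aside on the factory pattern that test_chunker_factory exercises: instantiating DocumentChunker is expected to hand back one of its concrete subclasses. Below is a minimal hypothetical sketch of such dispatch, keyed off the leaf node's file suffixes; it is not the implementation in instructlab.sdg.utils.chunkers, whose actual selection logic the tests above verify.

# Hypothetical stand-in for the factory under test; everything other than the
# class names and the ValueError message is illustrative.
from pathlib import Path


class DocumentChunker:
    """Instantiating the factory returns a concrete chunker subclass."""

    def __new__(cls, leaf_node):
        # collect every file suffix referenced by the leaf node
        suffixes = {Path(p).suffix for node in leaf_node for p in node["filepaths"]}
        if suffixes == {".md"}:
            return super().__new__(TextSplitChunker)
        if suffixes == {".pdf"}:
            return super().__new__(ContextAwareChunker)
        raise ValueError(f"Received multiple document types: {suffixes}")

    def __init__(self, leaf_node):
        self.leaf_node = leaf_node


class TextSplitChunker(DocumentChunker):
    """Word-count based splitting for markdown sources."""


class ContextAwareChunker(DocumentChunker):
    """Docling-backed chunking for PDF sources."""


# usage mirrors the parametrized test above
chunker = DocumentChunker(leaf_node=[{"filepaths": [Path("doc.pdf")]}])
assert isinstance(chunker, ContextAwareChunker)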