Update testing for Document Chunker classes

Signed-off-by: Khaled Sulayman <[email protected]>
instructlab · Nov 5, 2024 · 4941d64 · 4941d64
1 parent 4ee15b4
commit 4941d64
Show file tree

Hide file tree

Showing 3 changed files with 171 additions and 55 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -1,17 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
+docling>=1.15.0,<2.0.0
 GitPython>=3.1.42,<4.0.0
 httpx>=0.25.0,<1.0.0
 instructlab-schema>=0.4.0
 langchain-text-splitters
-openai>=1.13.3,<2.0.0
 # Note: this dependency goes along with langchain-text-splitters and may be
 #       removed once that one is removed.
 # do not use 8.4.0 due to a bug in the library
 # https://github.com/instructlab/instructlab/issues/1389
-pypdf>=5.0.0
+openai>=1.13.3,<2.0.0
 tabulate>=0.9.0
 tenacity>=8.3.0,!=8.4.0
-transformers>=4.44.2
+torch>=2.3.0,<2.4.0
+transformers>=4.41.2
 xdg-base-dirs>=6.0.1
diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py
@@ -0,0 +1,167 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# First Party
+from instructlab.sdg.utils.chunkers import (
+    FileTypes,
+    DocumentChunker,
+    TextSplitChunker,
+    ContextAwareChunker,
+)
+from pathlib import Path
+
+# Local
+from .testdata import testdata
+
+# Third Party
+import pytest
+from docling.datamodel.base_models import PipelineOptions
+from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
+from docling.document_converter import ConversionStatus, DocumentConverter
+
+
+def read_document_contents(document_path: Path):
+    # TODO
+    if document_path.suffix == ".md":
+        pass
+    if document_path.suffix == ".pdf":
+        pass
+
+
+def build_leaf_node(document_paths: str | list):
+    # TODO maybe check for directory
+    if not isinstance(document_paths, list):
+        document_paths = [document_paths]
+
+    return [{
+        "taxonomy_path": "",  # TODO
+        "filepaths": document_paths,
+        "documents": [read_document_contents(d) for d in document_paths],
+    }]
+
+
+@pytest.fixture
+def documents_dir():
+    return Path(__file__) / "testdata" / "sample_documents"
+
+
+@pytest.mark.parametrize(
+    "filepaths, chunker_type",
+    [
+        ([Path("document.md")], TextSplitChunker),
+        ([Path("document.pdf")], ContextAwareChunker),
+    ],
+)
+def test_chunker_factory(filepaths, chunker_type):
+    """Test that the DocumentChunker factory class returns the proper Chunker type"""
+    leaf_node = [{
+        "documents": ["Lorem ipsum"],
+        "taxonomy_path": "sample/path",
+        "filepaths": filepaths,
+    }]
+    chunker = DocumentChunker(leaf_node=leaf_node)
+    assert isinstance(chunker, chunker_type)
+
+
+def test_chunker_factory_unsupported_filetype():
+    """Test that the DocumentChunker factory class fails when provided an unsupported document"""
+    leaf_node = [{
+        "documents": ["Lorem ipsum"],
+        "taxonomy_path": "sample/path",
+        "filepaths": [Path("document.jpg")],
+    }]
+    with pytest.raises(ValueError):
+        _ = DocumentChunker(leaf_node=leaf_node)
+
+
+# class TestTextSplitChunker():
+#     @pytest.fixture
+#     def chunker():
+#         pass
+
+#     def test_chunk_documents():
+#         pass
+
+#     def test_chunk_docs_wc_exceeds_ctx_window(self):
+#         with pytest.raises(ValueError) as exc:
+#             chunking.chunk_document(
+#                 documents=testdata.documents,
+#                 chunk_word_count=1000,
+#                 server_ctx_size=1034,
+#             )
+#         assert (
+#             "Given word count (1000) per doc will exceed the server context window size (1034)"
+#             in str(exc.value)
+#         )
+
+#     def test_chunk_docs_long_lines(self):
+#         # TODO see if this is applicable to context-aware
+#         chunk_words = 50
+#         chunks = chunking.chunk_document(
+#             documents=testdata.long_line_documents,
+#             chunk_word_count=chunk_words,
+#             server_ctx_size=4096,
+#         )
+#         max_tokens = chunking._num_tokens_from_words(chunk_words)
+#         max_chars = chunking._num_chars_from_tokens(max_tokens)
+#         max_chars += chunking._DEFAULT_CHUNK_OVERLAP  # add in the chunk overlap
+#         max_chars += 50  # and a bit extra for some really long words
+#         for chunk in chunks:
+#             assert len(chunk) <= max_chars
+
+#     def test_chunk_docs_chunk_overlap_error(self):
+#         # TODO check if applicable to context-aware
+#         with pytest.raises(ValueError) as exc:
+#             chunking.chunk_document(
+#                 documents=testdata.documents,
+#                 chunk_word_count=5,
+#                 server_ctx_size=1034,
+#             )
+#         assert (
+#             "Got a larger chunk overlap (100) than chunk size (24), should be smaller"
+#             in str(exc.value)
+#         )
+
+
+# class TestContextAwareChunker():
+#     @pytest.fixture
+#     def chunker(documents_dir):
+#         pass
+
+#     def test_chunk_documents():
+#         pass
+
+#     def test_path_validator():
+#         pass
+
+#     def test_load_qna_yaml():
+#         pass
+
+#     def test_process_parsed_docling_json():
+#         pass
+
+#     def test_fuse_texts():
+#         pass
+
+#     def test_create_tokenizer():
+#         pass
+
+#     def test_get_token_count():
+#         pass
+
+#     def test_add_heading_formatting():
+#         pass
+
+#     def test_generate_table_from_parsed_rep():
+#         pass
+
+#     def test_get_table():
+#         pass
+
+#     def test_get_table_page_number():
+#         pass
+
+#     def test_build_chunks_from_docling_json():
+#         pass
+
+#     def test_export_document():
+#         pass
diff --git a/tests/test_chunking.py b/tests/test_chunking.py