Update testing for Document Chunker classes
Signed-off-by: Khaled Sulayman <[email protected]>
Co-authored-by: Aakanksha Duggal <[email protected]>
khaledsulayman and aakankshaduggal committed Nov 7, 2024
1 parent 6851ad5 commit 00d4d1f
Showing 6 changed files with 17 additions and 181 deletions.
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,16 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
datasets>=2.18.0,<3.0.0
docling>=1.15.0,<2.0.0
GitPython>=3.1.42,<4.0.0
httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
langchain-text-splitters
openai>=1.13.3,<2.0.0
# Note: this dependency goes along with langchain-text-splitters and may be
# removed once that one is removed.
# do not use 8.4.0 due to a bug in the library
# https://github.com/instructlab/instructlab/issues/1389
pypdf>=5.0.0
openai>=1.13.3,<2.0.0
tabulate>=0.9.0
tenacity>=8.3.0,!=8.4.0
torch>=2.3.0,<2.5.0
4 changes: 2 additions & 2 deletions src/instructlab/sdg/utils/chunkers.py
@@ -3,7 +3,7 @@
from collections import defaultdict
from enum import Enum
from pathlib import Path
from typing import DefaultDict, Iterable, List, Tuple
from typing import DefaultDict, Iterable, List, Tuple, Optional
import json
import logging
import re
@@ -50,7 +50,7 @@ def __new__(
cls,
leaf_node,
taxonomy_path,
output_dir: Path,
output_dir: Optional[Path],
server_ctx_size=4096,
chunk_word_count=1024,
tokenizer_model_name: str | None = None,
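For context, a minimal sketch of how the factory is called once `output_dir` is typed `Optional[Path]`. The argument values (sample-document path, tokenizer model name) mirror the updated test code in this commit rather than any separately documented API, so treat it as illustrative only:

```python
# Sketch mirroring tests/test_chunkers.py from this commit; the taxonomy_path and
# tokenizer_model_name values are taken from that test, not from other docs.
import tempfile
from pathlib import Path

from instructlab.sdg.utils.chunkers import DocumentChunker

leaf_node = [
    {
        "documents": ["Lorem ipsum"],
        "taxonomy_path": "",
        "filepaths": [Path("document.pdf")],
    }
]

with tempfile.TemporaryDirectory() as temp_dir:
    # output_dir is now Optional[Path]; a throwaway directory is passed here.
    chunker = DocumentChunker(
        leaf_node=leaf_node,
        taxonomy_path=Path("tests/testdata/sample_documents"),
        output_dir=temp_dir,
        tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
    )
```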
136 changes: 11 additions & 125 deletions tests/test_chunkers.py
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
import tempfile
from pathlib import Path

# Third Party
@@ -21,31 +22,9 @@
from .testdata import testdata


# def read_document_contents(document_path: Path):
# # TODO
# if document_path.suffix == ".md":
# pass
# if document_path.suffix == ".pdf":
# pass


def build_leaf_node(document_paths: str | list):
# TODO maybe check for directory
if not isinstance(document_paths, list):
document_paths = [document_paths]

return [
{
"taxonomy_path": "", # TODO
"filepaths": document_paths,
"documents": [read_document_contents(d) for d in document_paths],
}
]


@pytest.fixture
def documents_dir():
return Path(__file__) / "testdata" / "sample_documents"
return Path(__file__).parent / "testdata" / "sample_documents"


@pytest.mark.parametrize(
@@ -55,122 +34,29 @@ def documents_dir():
([Path("document.pdf")], ContextAwareChunker),
],
)
def test_chunker_factory(filepaths, chunker_type):
def test_chunker_factory(filepaths, chunker_type, documents_dir):
"""Test that the DocumentChunker factory class returns the proper Chunker type"""
leaf_node = [
{
"documents": ["Lorem ipsum"],
"taxonomy_path": "sample/path",
"taxonomy_path": "",
"filepaths": filepaths,
}
]
chunker = DocumentChunker(leaf_node=leaf_node)
assert isinstance(chunker, chunker_type)
with tempfile.TemporaryDirectory() as temp_dir:
chunker = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
assert isinstance(chunker, chunker_type)


def test_chunker_factory_unsupported_filetype():
def test_chunker_factory_unsupported_filetype(documents_dir):
"""Test that the DocumentChunker factory class fails when provided an unsupported document"""
leaf_node = [
{
"documents": ["Lorem ipsum"],
"taxonomy_path": "sample/path",
"taxonomy_path": "",
"filepaths": [Path("document.jpg")],
}
]
with pytest.raises(ValueError):
_ = DocumentChunker(leaf_node=leaf_node)


# class TestTextSplitChunker():
# @pytest.fixture
# def chunker():
# pass

# def test_chunk_documents():
# pass

# def test_chunk_docs_wc_exceeds_ctx_window(self):
# with pytest.raises(ValueError) as exc:
# chunking.chunk_document(
# documents=testdata.documents,
# chunk_word_count=1000,
# server_ctx_size=1034,
# )
# assert (
# "Given word count (1000) per doc will exceed the server context window size (1034)"
# in str(exc.value)
# )

# def test_chunk_docs_long_lines(self):
# # TODO see if this is applicable to context-aware
# chunk_words = 50
# chunks = chunking.chunk_document(
# documents=testdata.long_line_documents,
# chunk_word_count=chunk_words,
# server_ctx_size=4096,
# )
# max_tokens = chunking._num_tokens_from_words(chunk_words)
# max_chars = chunking._num_chars_from_tokens(max_tokens)
# max_chars += chunking._DEFAULT_CHUNK_OVERLAP # add in the chunk overlap
# max_chars += 50 # and a bit extra for some really long words
# for chunk in chunks:
# assert len(chunk) <= max_chars

# def test_chunk_docs_chunk_overlap_error(self):
# # TODO check if applicable to context-aware
# with pytest.raises(ValueError) as exc:
# chunking.chunk_document(
# documents=testdata.documents,
# chunk_word_count=5,
# server_ctx_size=1034,
# )
# assert (
# "Got a larger chunk overlap (100) than chunk size (24), should be smaller"
# in str(exc.value)
# )


# class TestContextAwareChunker():
# @pytest.fixture
# def chunker(documents_dir):
# pass

# def test_chunk_documents():
# pass

# def test_path_validator():
# pass

# def test_load_qna_yaml():
# pass

# def test_process_parsed_docling_json():
# pass

# def test_fuse_texts():
# pass

# def test_create_tokenizer():
# pass

# def test_get_token_count():
# pass

# def test_add_heading_formatting():
# pass

# def test_generate_table_from_parsed_rep():
# pass

# def test_get_table():
# pass

# def test_get_table_page_number():
# pass

# def test_build_chunks_from_docling_json():
# pass

# def test_export_document():
# pass

with tempfile.TemporaryDirectory() as temp_dir:
_ = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
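On the `tempfile.TemporaryDirectory()` pattern used in both tests above: pytest's built-in `tmp_path` fixture would cover the same need and clean up automatically. A sketch under that assumption, not part of this commit (importing `ContextAwareChunker` from `chunkers` is assumed to match the test module's own imports):

```python
# Sketch only: the factory test rewritten around pytest's tmp_path fixture.
from pathlib import Path

from instructlab.sdg.utils.chunkers import ContextAwareChunker, DocumentChunker


def test_chunker_factory_tmp_path(documents_dir, tmp_path):
    leaf_node = [
        {
            "documents": ["Lorem ipsum"],
            "taxonomy_path": "",
            "filepaths": [Path("document.pdf")],
        }
    ]
    chunker = DocumentChunker(
        leaf_node=leaf_node,
        taxonomy_path=documents_dir,
        output_dir=tmp_path,  # created and cleaned up by pytest
        tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
    )
    assert isinstance(chunker, ContextAwareChunker)
```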
52 changes: 0 additions & 52 deletions tests/test_chunking.py

This file was deleted.

Binary file added tests/testdata/sample_documents/moo_deng.pdf
2 changes: 2 additions & 0 deletions tests/testdata/sample_documents/qna.yaml
@@ -0,0 +1,2 @@
version: 3
domain: pop_culture
