Skip to content

Commit

Permalink
Update testing for Document Chunker classes
Browse files Browse the repository at this point in the history
Signed-off-by: Khaled Sulayman <[email protected]>
Co-authored-by: Aakanksha Duggal <[email protected]>
  • Loading branch information
khaledsulayman and aakankshaduggal committed Nov 6, 2024
1 parent 6851ad5 commit e5d35f8
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 68 deletions.
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
datasets>=2.18.0,<3.0.0
docling>=1.15.0,<2.0.0
GitPython>=3.1.42,<4.0.0
httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
langchain-text-splitters
openai>=1.13.3,<2.0.0
# Note: this dependency goes along with langchain-text-splitters and may be
# removed once that one is removed.
# do not use 8.4.0 due to a bug in the library
# https://github.com/instructlab/instructlab/issues/1389
pypdf>=5.0.0
tabulate>=0.9.0
tenacity>=8.3.0,!=8.4.0
torch>=2.3.0,<2.5.0
Expand Down
64 changes: 50 additions & 14 deletions tests/test_chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
# Local
from .testdata import testdata


# def read_document_contents(document_path: Path):
# # TODO
# if document_path.suffix == ".md":
Expand All @@ -29,18 +28,18 @@
# pass


def build_leaf_node(document_paths: str | list):
# TODO maybe check for directory
if not isinstance(document_paths, list):
document_paths = [document_paths]
# def build_leaf_node(document_paths: str | list):
# # TODO maybe check for directory
# if not isinstance(document_paths, list):
# document_paths = [document_paths]

return [
{
"taxonomy_path": "", # TODO
"filepaths": document_paths,
"documents": [read_document_contents(d) for d in document_paths],
}
]
# return [
# {
# "taxonomy_path": "", # TODO
# "filepaths": document_paths,
# "documents": [read_document_contents(d) for d in document_paths],
# }
# ]


@pytest.fixture
Expand Down Expand Up @@ -130,7 +129,45 @@ def test_chunker_factory_unsupported_filetype():
# )


# class TestContextAwareChunker():
class TestContextAwareChunker:
    """Tests for ContextAwareChunker PDF parsing, chunking, and tokenizer creation.

    Bug fix: the fixture and test methods live inside a class, so pytest binds
    them as instance methods — each must accept ``self`` as its first
    parameter, and fixtures are injected by name after ``self``. Without
    ``self`` these raised TypeError / fixture-resolution errors at run time.
    """

    @pytest.fixture
    def sample_pdf_path(self):
        """Path to a sample PDF used by the parsing test."""
        # NOTE(review): placeholder path — replace with a real checked-in
        # test asset before enabling in CI.
        return Path("sample_documents/sample.pdf")  # Replace with actual test path

    def test_context_aware_chunker_pdf_parsing(self, sample_pdf_path):
        """Test that ContextAwareChunker correctly parses and chunks PDF content."""
        leaf_node = [
            {
                "documents": ["Sample PDF content"],
                "taxonomy_path": "sample/path",
                "filepaths": [sample_pdf_path],
            }
        ]
        chunker = ContextAwareChunker(leaf_node=leaf_node)
        chunks = chunker.chunk_documents()
        assert chunks  # Ensure chunks were created
        assert all(
            "content" in chunk for chunk in chunks
        )  # Verify content presence in chunks

    def test_context_aware_chunker_tokenizer(self):
        """Test that the tokenizer is created and token count is accurate."""
        leaf_node = [
            {
                "documents": ["Sample content"],
                "taxonomy_path": "sample/path",
                "filepaths": [Path("document.pdf")],
            }
        ]
        chunker = ContextAwareChunker(leaf_node=leaf_node)
        tokenizer = chunker.create_tokenizer(
            "sample_model_name"
        )  # Pass in model name if required
        tokens = tokenizer.encode("Test text")
        assert tokens  # Ensure tokenization occurred
        assert len(tokens) > 0


# @pytest.fixture
# def chunker(documents_dir):
# pass
Expand Down Expand Up @@ -173,4 +210,3 @@ def test_chunker_factory_unsupported_filetype():

# def test_export_document():
# pass

52 changes: 0 additions & 52 deletions tests/test_chunking.py

This file was deleted.

0 comments on commit e5d35f8

Please sign in to comment.