diff --git a/requirements.txt b/requirements.txt
index 684539be..0f48ca93 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
+docling>=1.15.0,<2.0.0
 GitPython>=3.1.42,<4.0.0
 httpx>=0.25.0,<1.0.0
 instructlab-schema>=0.4.0
 langchain-text-splitters
-openai>=1.13.3,<2.0.0
 # Note: this dependency goes along with langchain-text-splitters and may be
 # removed once that one is removed.
 # do not use 8.4.0 due to a bug in the library
 # https://github.com/instructlab/instructlab/issues/1389
-pypdf>=5.0.0
+openai>=1.13.3,<2.0.0
 tabulate>=0.9.0
 tenacity>=8.3.0,!=8.4.0
 torch>=2.3.0,<2.5.0
diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 14b74969..5e46216d 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -3,7 +3,7 @@
 from collections import defaultdict
 from enum import Enum
 from pathlib import Path
-from typing import DefaultDict, Iterable, List, Tuple
+from typing import DefaultDict, Iterable, List, Tuple, Optional
 import json
 import logging
 import re
@@ -50,7 +50,7 @@ def __new__(
         cls,
         leaf_node,
         taxonomy_path,
-        output_dir: Path,
+        output_dir: Optional[Path],
         server_ctx_size=4096,
         chunk_word_count=1024,
         tokenizer_model_name: str | None = None,
diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py
index b7821bd6..feca620c 100644
--- a/tests/test_chunkers.py
+++ b/tests/test_chunkers.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Standard
+import tempfile
 from pathlib import Path
 
 # Third Party
@@ -21,31 +22,9 @@
 from .testdata import testdata
 
-# def read_document_contents(document_path: Path):
-#     # TODO
-#     if document_path.suffix == ".md":
-#         pass
-#     if document_path.suffix == ".pdf":
-#         pass
-
-
-def build_leaf_node(document_paths: str | list):
-    # TODO maybe check for directory
-    if not isinstance(document_paths, list):
-        document_paths = [document_paths]
-
-    return [
-        {
-            "taxonomy_path": "",  # TODO
-            "filepaths": document_paths,
-            "documents": [read_document_contents(d) for d in document_paths],
-        }
-    ]
-
-
 @pytest.fixture
 def documents_dir():
-    return Path(__file__) / "testdata" / "sample_documents"
+    return Path(__file__).parent / "testdata" / "sample_documents"
 
 
 @pytest.mark.parametrize(
@@ -55,122 +34,29 @@ def documents_dir():
     "filepaths, chunker_type",
     [
         ([Path("document.md")], TextSplitChunker),
         ([Path("document.pdf")], ContextAwareChunker),
     ],
 )
-def test_chunker_factory(filepaths, chunker_type):
+def test_chunker_factory(filepaths, chunker_type, documents_dir):
     """Test that the DocumentChunker factory class returns the proper Chunker type"""
     leaf_node = [
         {
             "documents": ["Lorem ipsum"],
-            "taxonomy_path": "sample/path",
+            "taxonomy_path": "",
             "filepaths": filepaths,
         }
     ]
-    chunker = DocumentChunker(leaf_node=leaf_node)
-    assert isinstance(chunker, chunker_type)
+    with tempfile.TemporaryDirectory() as temp_dir:
+        chunker = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
+        assert isinstance(chunker, chunker_type)
 
 
-def test_chunker_factory_unsupported_filetype():
+def test_chunker_factory_unsupported_filetype(documents_dir):
     """Test that the DocumentChunker factory class fails when provided an unsupported document"""
     leaf_node = [
         {
             "documents": ["Lorem ipsum"],
-            "taxonomy_path": "sample/path",
+            "taxonomy_path": "",
             "filepaths": [Path("document.jpg")],
         }
     ]
     with pytest.raises(ValueError):
-        _ = DocumentChunker(leaf_node=leaf_node)
-
-
-# class TestTextSplitChunker():
-#     @pytest.fixture
-#     def chunker():
-#         pass
-
-#     def test_chunk_documents():
-#         pass
-
-#     def test_chunk_docs_wc_exceeds_ctx_window(self):
-#         with pytest.raises(ValueError) as exc:
-#             chunking.chunk_document(
-#                 documents=testdata.documents,
-#                 chunk_word_count=1000,
-#                 server_ctx_size=1034,
-#             )
-#         assert (
-#             "Given word count (1000) per doc will exceed the server context window size (1034)"
-#             in str(exc.value)
-#         )
-
-#     def test_chunk_docs_long_lines(self):
-#         # TODO see if this is applicable to context-aware
-#         chunk_words = 50
-#         chunks = chunking.chunk_document(
-#             documents=testdata.long_line_documents,
-#             chunk_word_count=chunk_words,
-#             server_ctx_size=4096,
-#         )
-#         max_tokens = chunking._num_tokens_from_words(chunk_words)
-#         max_chars = chunking._num_chars_from_tokens(max_tokens)
-#         max_chars += chunking._DEFAULT_CHUNK_OVERLAP  # add in the chunk overlap
-#         max_chars += 50  # and a bit extra for some really long words
-#         for chunk in chunks:
-#             assert len(chunk) <= max_chars
-
-#     def test_chunk_docs_chunk_overlap_error(self):
-#         # TODO check if applicable to context-aware
-#         with pytest.raises(ValueError) as exc:
-#             chunking.chunk_document(
-#                 documents=testdata.documents,
-#                 chunk_word_count=5,
-#                 server_ctx_size=1034,
-#             )
-#         assert (
-#             "Got a larger chunk overlap (100) than chunk size (24), should be smaller"
-#             in str(exc.value)
-#         )
-
-
-# class TestContextAwareChunker():
-#     @pytest.fixture
-#     def chunker(documents_dir):
-#         pass
-
-#     def test_chunk_documents():
-#         pass
-
-#     def test_path_validator():
-#         pass
-
-#     def test_load_qna_yaml():
-#         pass
-
-#     def test_process_parsed_docling_json():
-#         pass
-
-#     def test_fuse_texts():
-#         pass
-
-#     def test_create_tokenizer():
-#         pass
-
-#     def test_get_token_count():
-#         pass
-
-#     def test_add_heading_formatting():
-#         pass
-
-#     def test_generate_table_from_parsed_rep():
-#         pass
-
-#     def test_get_table():
-#         pass
-
-#     def test_get_table_page_number():
-#         pass
-
-#     def test_build_chunks_from_docling_json():
-#         pass
-
-#     def test_export_document():
-#         pass
-
+    with tempfile.TemporaryDirectory() as temp_dir:
+        _ = DocumentChunker(leaf_node=leaf_node, taxonomy_path=documents_dir, output_dir=temp_dir, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
diff --git a/tests/test_chunking.py b/tests/test_chunking.py
deleted file mode 100644
index dae1f61f..00000000
--- a/tests/test_chunking.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# Third Party
-import pytest
-
-# First Party
-from instructlab.sdg.utils import chunking
-
-# Local
-from .testdata import testdata
-
-
-class TestChunking:
-    """Test collection in instructlab.utils.chunking."""
-
-    def test_chunk_docs_wc_exceeds_ctx_window(self):
-        with pytest.raises(ValueError) as exc:
-            chunking.chunk_document(
-                documents=testdata.documents,
-                chunk_word_count=1000,
-                server_ctx_size=1034,
-            )
-        assert (
-            "Given word count (1000) per doc will exceed the server context window size (1034)"
-            in str(exc.value)
-        )
-
-    def test_chunk_docs_chunk_overlap_error(self):
-        with pytest.raises(ValueError) as exc:
-            chunking.chunk_document(
-                documents=testdata.documents,
-                chunk_word_count=5,
-                server_ctx_size=1034,
-            )
-        assert (
-            "Got a larger chunk overlap (100) than chunk size (24), should be smaller"
-            in str(exc.value)
-        )
-
-    def test_chunk_docs_long_lines(self):
-        chunk_words = 50
-        chunks = chunking.chunk_document(
-            documents=testdata.long_line_documents,
-            chunk_word_count=chunk_words,
-            server_ctx_size=4096,
-        )
-        max_tokens = chunking._num_tokens_from_words(chunk_words)
-        max_chars = chunking._num_chars_from_tokens(max_tokens)
-        max_chars += chunking._DEFAULT_CHUNK_OVERLAP  # add in the chunk overlap
-        max_chars += 50  # and a bit extra for some really long words
-        for chunk in chunks:
-            assert len(chunk) <= max_chars
diff --git a/tests/testdata/sample_documents/moo_deng.pdf b/tests/testdata/sample_documents/moo_deng.pdf
new file mode 100644
index 00000000..4c5fc0e5
Binary files /dev/null and b/tests/testdata/sample_documents/moo_deng.pdf differ
diff --git a/tests/testdata/sample_documents/qna.yaml b/tests/testdata/sample_documents/qna.yaml
new file mode 100644
index 00000000..ea67338a
--- /dev/null
+++ b/tests/testdata/sample_documents/qna.yaml
@@ -0,0 +1,2 @@
+version: 3
+domain: pop_culture
\ No newline at end of file
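
Reviewer's note: the sketch below shows how the reworked DocumentChunker factory is now constructed. It is assembled only from the updated tests in this patch, not part of the patch itself; the argument names, leaf-node shape, tokenizer name, and sample-document paths are all taken from the diff above, and it assumes it is run from the repository root so those paths resolve.

# Minimal usage sketch, mirroring tests/test_chunkers.py above.
# Standard
import tempfile
from pathlib import Path

# First Party
from instructlab.sdg.utils.chunkers import ContextAwareChunker, DocumentChunker

# Sample documents shipped with this patch (qna.yaml + moo_deng.pdf).
docs_dir = Path("tests/testdata/sample_documents")

leaf_node = [
    {
        "documents": ["Lorem ipsum"],
        "taxonomy_path": "",
        "filepaths": [docs_dir / "moo_deng.pdf"],
    }
]

with tempfile.TemporaryDirectory() as temp_dir:
    # Per the tests, the factory dispatches on the file extensions in
    # "filepaths": .md yields TextSplitChunker, .pdf yields the docling-backed
    # ContextAwareChunker, and any other extension raises ValueError.
    chunker = DocumentChunker(
        leaf_node=leaf_node,
        taxonomy_path=docs_dir,
        output_dir=temp_dir,
        tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
    )
    assert isinstance(chunker, ContextAwareChunker)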