diff --git a/semantic_chunkers/__init__.py b/semantic_chunkers/__init__.py index c9ef294..24e33e6 100644 --- a/semantic_chunkers/__init__.py +++ b/semantic_chunkers/__init__.py @@ -10,4 +10,4 @@ "StatisticalChunker", ] -__version__ = "0.0.2" \ No newline at end of file +__version__ = "0.0.2" diff --git a/semantic_chunkers/chunkers/statistical.py b/semantic_chunkers/chunkers/statistical.py index 3dee7ff..de0bfcc 100644 --- a/semantic_chunkers/chunkers/statistical.py +++ b/semantic_chunkers/chunkers/statistical.py @@ -426,7 +426,9 @@ def plot_sentence_similarity_scores( sentence after a similarity score below a specified threshold. """ - sentences = [sentence for doc in docs for sentence in sentence.regex_splitter(doc)] + sentences = [ + sentence for doc in docs for sentence in sentence.regex_splitter(doc) + ] encoded_sentences = self._encode_documents(sentences) similarity_scores = [] diff --git a/semantic_chunkers/splitters/sentence.py b/semantic_chunkers/splitters/sentence.py index 04aa363..9e75adc 100644 --- a/semantic_chunkers/splitters/sentence.py +++ b/semantic_chunkers/splitters/sentence.py @@ -53,4 +53,4 @@ def regex_splitter(text: str) -> list[str]: """ sentences = regex.split(regex_pattern, text, flags=regex.VERBOSE) sentences = [sentence.strip() for sentence in sentences if sentence.strip()] - return sentences \ No newline at end of file + return sentences diff --git a/tests/unit/test_splitters.py b/tests/unit/test_splitters.py index cac65e0..d7224de 100644 --- a/tests/unit/test_splitters.py +++ b/tests/unit/test_splitters.py @@ -6,8 +6,8 @@ from semantic_router.encoders.base import BaseEncoder from semantic_router.encoders.cohere import CohereEncoder from semantic_chunkers import BaseChunker -from semantic_chunkers import ConsecutiveSimSplitter -from semantic_chunkers import CumulativeSimSplitter +from semantic_chunkers import ConsecutiveChunker +from semantic_chunkers import CumulativeChunker def test_consecutive_sim_splitter(): @@ -21,7 +21,7 @@ def test_consecutive_sim_splitter(): input_type="", ) # Instantiate the ConsecutiveSimSplitter with the mock encoder - splitter = ConsecutiveSimSplitter(encoder=cohere_encoder, score_threshold=0.9) + splitter = ConsecutiveChunker(encoder=cohere_encoder, score_threshold=0.9) splitter.encoder = mock_encoder # Define some documents @@ -55,7 +55,7 @@ def test_cumulative_sim_splitter(): cohere_api_key="a", input_type="", ) - splitter = CumulativeSimSplitter(encoder=cohere_encoder, score_threshold=0.9) + splitter = CumulativeChunker(encoder=cohere_encoder, score_threshold=0.9) splitter.encoder = mock_encoder # Define some documents @@ -83,7 +83,7 @@ def test_consecutive_similarity_splitter_single_doc(): # Assuming any return value since it should not reach the point of using the encoder mock_encoder.return_value = np.array([[0.5, 0]]) - splitter = ConsecutiveSimSplitter(encoder=mock_encoder, score_threshold=0.5) + splitter = ConsecutiveChunker(encoder=mock_encoder, score_threshold=0.5) docs = ["doc1"] with pytest.raises(ValueError) as excinfo: @@ -96,7 +96,7 @@ def test_cumulative_similarity_splitter_single_doc(): # Assuming any return value since it should not reach the point of using the encoder mock_encoder.return_value = np.array([[0.5, 0]]) - splitter = CumulativeSimSplitter(encoder=mock_encoder, score_threshold=0.5) + splitter = CumulativeChunker(encoder=mock_encoder, score_threshold=0.5) docs = ["doc1"] with pytest.raises(ValueError) as excinfo: