Update CHANGELOG, examples and documentation

neo4j · Jan 20, 2025 · 385492b · 385492b
1 parent 2d3b4fd
commit 385492b
Show file tree

Hide file tree

Showing 8 changed files with 12 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@
 ### Changed
 - Updated LLM implementations to handle message history consistently across providers.
 - The `id_prefix` parameter in the `LexicalGraphConfig` is deprecated.
+- Changed the default behaviour of `FixedSizeSplitter` to avoid cutting words off at chunk boundaries whenever possible.
 
 ### Fixed
 - IDs for the Document and Chunk nodes in the lexical graph are now randomly generated and unique across multiple runs, fixing issues in the lexical graph where relationships were created between chunks that were created by different pipeline runs.

diff --git a/docs/source/user_guide_kg_builder.rst b/docs/source/user_guide_kg_builder.rst
@@ -581,9 +581,12 @@ that can be processed within the LLM token limits:
 
     from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
 
-    splitter = FixedSizeSplitter(chunk_size=4000, chunk_overlap=200)
+    splitter = FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=False)
     splitter.run(text="Hello World. Life is beautiful.")
 
+.. note::
+
+    The ``approximate`` flag is set to ``True`` by default to ensure that chunks start and end cleanly (i.e. to avoid cutting words in the middle) whenever possible.
 
 Wrappers for LangChain and LlamaIndex text splitters are included in this package:
 

diff --git a/examples/customize/build_graph/components/splitters/fixed_size_splitter.py b/examples/customize/build_graph/components/splitters/fixed_size_splitter.py
@@ -6,9 +6,10 @@
 
 async def main() -> TextChunks:
     splitter = FixedSizeSplitter(
-        # optionally, configure chunk_size and chunk_overlap
+        # optionally, configure chunk_size, chunk_overlap, and approximate flag
         # chunk_size=4000,
         # chunk_overlap=200,
+        # approximate=False,
     )
     chunks = await splitter.run(text="text to split")
     return chunks
diff --git a/examples/customize/build_graph/pipeline/kg_builder_from_pdf.py b/examples/customize/build_graph/pipeline/kg_builder_from_pdf.py
@@ -83,7 +83,7 @@ async def define_and_run_pipeline(
     pipe = Pipeline()
     pipe.add_component(PdfLoader(), "pdf_loader")
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200), "splitter"
+        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=False), "splitter"
     )
     pipe.add_component(SchemaBuilder(), "schema")
     pipe.add_component(

diff --git a/examples/customize/build_graph/pipeline/kg_builder_from_text.py b/examples/customize/build_graph/pipeline/kg_builder_from_text.py
@@ -58,7 +58,7 @@ async def define_and_run_pipeline(
     # define the components
     pipe.add_component(
         # chunk_size=50 for the sake of this demo
-        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200),
+        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")

diff --git a/examples/customize/build_graph/pipeline/lexical_graph_builder_from_text.py b/examples/customize/build_graph/pipeline/lexical_graph_builder_from_text.py
@@ -27,7 +27,7 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult:
     pipe = Pipeline()
     # define the components
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=20, chunk_overlap=1),
+        FixedSizeSplitter(chunk_size=20, chunk_overlap=1, approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")

diff --git a/...s/customize/build_graph/pipeline/text_to_lexical_graph_to_entity_graph_single_pipeline.py b/...s/customize/build_graph/pipeline/text_to_lexical_graph_to_entity_graph_single_pipeline.py
@@ -56,7 +56,7 @@ async def define_and_run_pipeline(
     pipe = Pipeline()
     # define the components
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=200, chunk_overlap=50),
+        FixedSizeSplitter(chunk_size=200, chunk_overlap=50,approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")

diff --git a/...les/customize/build_graph/pipeline/text_to_lexical_graph_to_entity_graph_two_pipelines.py b/...les/customize/build_graph/pipeline/text_to_lexical_graph_to_entity_graph_two_pipelines.py
@@ -47,7 +47,7 @@ async def build_lexical_graph(
     pipe = Pipeline()
     # define the components
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=200, chunk_overlap=50),
+        FixedSizeSplitter(chunk_size=200, chunk_overlap=50, approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")