fix(agents-api): increase test coverage + set split_chunks=True a…

…s default
julep-ai · Jan 15, 2025 · 8fe87cb · 8fe87cb
1 parent 9df8de4
commit 8fe87cb
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 4 deletions.
diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py
@@ -41,7 +41,7 @@ def clean_keyword(kw: str) -> str:
     return WHITESPACE_RE.sub(" ", cleaned).strip()
 
 
-def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = False) -> list[str]:
+def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = True) -> list[str]:
     """Optimized keyword extraction with minimal behavior change."""
     excluded_labels = {
         "DATE",  # Absolute or relative dates or periods.
@@ -117,7 +117,7 @@ def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = False) -> l
 
 @lru_cache(maxsize=1000)
 def text_to_tsvector_query(
-    paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = False
+    paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = True
 ) -> str:
     """
     Extracts meaningful keywords/phrases from text and joins them with OR.

diff --git a/agents-api/agents_api/queries/docs/search_docs_by_text.py b/agents-api/agents_api/queries/docs/search_docs_by_text.py
@@ -62,7 +62,7 @@ async def search_docs_by_text(
     owner_types: list[str] = [owner[0] for owner in owners]
     owner_ids: list[str] = [str(owner[1]) for owner in owners]
     #  Pre-process rawtext query
-    query = text_to_tsvector_query(query)
+    query = text_to_tsvector_query(query, split_chunks=True)
 
     return (
         search_docs_text_query,

diff --git a/agents-api/agents_api/queries/docs/search_docs_hybrid.py b/agents-api/agents_api/queries/docs/search_docs_hybrid.py
@@ -83,7 +83,7 @@ async def search_docs_hybrid(
     owner_ids: list[str] = [str(owner[1]) for owner in owners]
 
     # Pre-process rawtext query
-    text_query = text_to_tsvector_query(text_query)
+    text_query = text_to_tsvector_query(text_query, split_chunks=True)
 
     return (
         search_docs_hybrid_query,

diff --git a/agents-api/tests/test_nlp_utilities.py b/agents-api/tests/test_nlp_utilities.py
@@ -80,6 +80,15 @@ async def _():
             "google OR john doe OR a software engineer",
         ),
         ("- google", "google"),
+        # Test duplicate keyword handling
+        (
+            "John Doe is great. John Doe is awesome.",
+            "john doe",  # Should only include "John Doe" once
+        ),
+        (
+            "Software Engineer at Google. Also, a Software Engineer.",
+            "Google OR Also a Software Engineer OR Software Engineer",  # Should only include "Software Engineer" once
+        ),
     ]
 
     for input_text, expected_output in test_cases:
@@ -141,6 +150,15 @@ async def _():
             "John Doe is a software engineer at Google.",
             "google OR john doe OR a OR software OR engineer",
         ),
+        # Test duplicate keyword handling
+        (
+            "John Doe is great. John Doe is awesome.",
+            "john doe",  # Should only include "John Doe" once even with split_chunks=True
+        ),
+        (
+            "Software Engineer at Google. Also, a Software Engineer.",
+            "Also OR a OR google OR software OR engineer",  # When split, each word appears once
+        ),
     ]
 
     for input_text, expected_output in test_cases: