Skip to content

Commit

Permalink
fix(agents-api): increase test coverage + set split_chunks=True a…
Browse files Browse the repository at this point in the history
…s default
  • Loading branch information
Ahmad-mtos committed Jan 15, 2025
1 parent 9df8de4 commit 8fe87cb
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 4 deletions.
4 changes: 2 additions & 2 deletions agents-api/agents_api/common/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def clean_keyword(kw: str) -> str:
return WHITESPACE_RE.sub(" ", cleaned).strip()


def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = False) -> list[str]:
def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = True) -> list[str]:
"""Optimized keyword extraction with minimal behavior change."""
excluded_labels = {
"DATE", # Absolute or relative dates or periods.
Expand Down Expand Up @@ -117,7 +117,7 @@ def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = False) -> l

@lru_cache(maxsize=1000)
def text_to_tsvector_query(
paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = False
paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = True
) -> str:
"""
Extracts meaningful keywords/phrases from text and joins them with OR.
Expand Down
2 changes: 1 addition & 1 deletion agents-api/agents_api/queries/docs/search_docs_by_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ async def search_docs_by_text(
owner_types: list[str] = [owner[0] for owner in owners]
owner_ids: list[str] = [str(owner[1]) for owner in owners]
# Pre-process rawtext query
query = text_to_tsvector_query(query)
query = text_to_tsvector_query(query, split_chunks=True)

return (
search_docs_text_query,
Expand Down
2 changes: 1 addition & 1 deletion agents-api/agents_api/queries/docs/search_docs_hybrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ async def search_docs_hybrid(
owner_ids: list[str] = [str(owner[1]) for owner in owners]

# Pre-process rawtext query
text_query = text_to_tsvector_query(text_query)
text_query = text_to_tsvector_query(text_query, split_chunks=True)

return (
search_docs_hybrid_query,
Expand Down
18 changes: 18 additions & 0 deletions agents-api/tests/test_nlp_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,15 @@ async def _():
"google OR john doe OR a software engineer",
),
("- google", "google"),
# Test duplicate keyword handling
(
"John Doe is great. John Doe is awesome.",
"john doe", # Should only include "John Doe" once
),
(
"Software Engineer at Google. Also, a Software Engineer.",
"Google OR Also a Software Engineer OR Software Engineer", # Should only include "Software Engineer" once
),
]

for input_text, expected_output in test_cases:
Expand Down Expand Up @@ -141,6 +150,15 @@ async def _():
"John Doe is a software engineer at Google.",
"google OR john doe OR a OR software OR engineer",
),
# Test duplicate keyword handling
(
"John Doe is great. John Doe is awesome.",
"john doe", # Should only include "John Doe" once even with split_chunks=True
),
(
"Software Engineer at Google. Also, a Software Engineer.",
"Also OR a OR google OR software OR engineer", # When split, each word appears once
),
]

for input_text, expected_output in test_cases:
Expand Down

0 comments on commit 8fe87cb

Please sign in to comment.