diff --git a/agents-api/agents_api/models/docs/__init__.py b/agents-api/agents_api/models/docs/__init__.py
index f668e048d..0ba3db0d4 100644
--- a/agents-api/agents_api/models/docs/__init__.py
+++ b/agents-api/agents_api/models/docs/__init__.py
@@ -21,4 +21,5 @@
 from .embed_snippets import embed_snippets
 from .get_doc import get_doc
 from .list_docs import list_docs
-from .search_docs import search_docs_by_embedding
+from .search_docs_by_embedding import search_docs_by_embedding
+from .search_docs_by_text import search_docs_by_text
diff --git a/agents-api/agents_api/models/docs/search_docs.py b/agents-api/agents_api/models/docs/search_docs_by_embedding.py
similarity index 98%
rename from agents-api/agents_api/models/docs/search_docs.py
rename to agents-api/agents_api/models/docs/search_docs_by_embedding.py
index d5903bfe7..0acbf8f6a 100644
--- a/agents-api/agents_api/models/docs/search_docs.py
+++ b/agents-api/agents_api/models/docs/search_docs_by_embedding.py
@@ -48,6 +48,7 @@ def search_docs_by_embedding(
     confidence: float = 0.7,
     ef: int = 128,
     mmr_lambda: float = 0.25,
+    embedding_size: int = 1024,
 ) -> tuple[list[str], dict]:
     """
     Searches for document snippets in CozoDB by embedding query.
@@ -61,6 +62,9 @@
     - mmr_lambda (float, optional): The lambda parameter for MMR. Defaults to 0.25.
     """
 
+    assert len(query_embedding) == embedding_size
+    assert sum(query_embedding)
+
     owner_id = str(owner_id)
 
     # Calculate the search radius based on confidence level
diff --git a/agents-api/agents_api/models/docs/search_docs_by_text.py b/agents-api/agents_api/models/docs/search_docs_by_text.py
new file mode 100644
index 000000000..901e3ea31
--- /dev/null
+++ b/agents-api/agents_api/models/docs/search_docs_by_text.py
@@ -0,0 +1,153 @@
+"""This module contains functions for searching documents in the CozoDB based on full-text queries."""
+
+from typing import Literal
+from uuid import UUID
+
+from beartype import beartype
+from fastapi import HTTPException
+from pycozo.client import QueryException
+from pydantic import ValidationError
+
+from ...autogen.openapi_model import DocReference
+from ..utils import (
+    cozo_query,
+    partialclass,
+    rewrap_exceptions,
+    verify_developer_id_query,
+    verify_developer_owns_resource_query,
+    wrap_in_class,
+)
+
+
+@rewrap_exceptions(
+    {
+        QueryException: partialclass(HTTPException, status_code=400),
+        ValidationError: partialclass(HTTPException, status_code=400),
+        TypeError: partialclass(HTTPException, status_code=400),
+    }
+)
+@wrap_in_class(
+    DocReference,
+    transform=lambda d: {
+        "owner": {
+            "id": d["owner_id"],
+            "role": d["owner_type"],
+        },
+        **d,
+    },
+)
+@cozo_query
+@beartype
+def search_docs_by_text(
+    *,
+    developer_id: UUID,
+    owner_type: Literal["user", "agent"],
+    owner_id: UUID,
+    query: str,
+    k: int = 3,
+) -> tuple[list[str], dict]:
+    """
+    Searches for document snippets in CozoDB by full-text query.
+
+    Parameters:
+    - owner_type (Literal["user", "agent"]): The type of the owner of the documents.
+    - owner_id (UUID): The unique identifier of the owner.
+    - query (str): The query string.
+    - k (int, optional): The number of top matches to retrieve. Defaults to 3.
+ """ + + owner_id = str(owner_id) + + # Construct the datalog query for searching document snippets + search_query = f""" + input[ + owner_id, + query, + ] <- [[ + to_uuid($owner_id), + $query, + ]] + + candidate[doc_id] := + input[owner_id, _], + *docs {{ + owner_type: $owner_type, + owner_id, + doc_id + }} + + search_result[ + doc_id, + snippet_data, + distance, + ] := + input[owner_id, query], + candidate[doc_id], + ~snippets:fts {{ + doc_id, + index, + content + | + query: query, + k: {k}, + score_kind: 'tf_idf', + bind_score: score, + }}, + distance = -score, + snippet_data = [index, content] + + m[ + doc_id, + collect(snippet), + distance, + title, + ] := + candidate[doc_id], + *docs {{ + owner_type: $owner_type, + owner_id, + doc_id, + title, + }}, + search_result [ + doc_id, + snippet_data, + distance, + ], + snippet = {{ + "index": snippet_data->0, + "content": snippet_data->1, + }} + + + ?[ + id, + owner_type, + owner_id, + snippets, + distance, + title, + ] := m[ + id, + snippets, + distance, + title, + ], owner_type = $owner_type, owner_id = $owner_id + + # Sort the results by distance to find the closest matches + :sort distance + :limit {k} + """ + + queries = [ + verify_developer_id_query(developer_id), + verify_developer_owns_resource_query( + developer_id, f"{owner_type}s", **{f"{owner_type}_id": owner_id} + ), + search_query, + ] + + return ( + queries, + {"owner_type": owner_type, "owner_id": owner_id, "query": query}, + ) diff --git a/agents-api/agents_api/models/docs/search_docs_hybrid.py b/agents-api/agents_api/models/docs/search_docs_hybrid.py new file mode 100644 index 000000000..1595b3ca5 --- /dev/null +++ b/agents-api/agents_api/models/docs/search_docs_hybrid.py @@ -0,0 +1,121 @@ +"""This module contains functions for searching documents in the CozoDB based on embedding queries.""" + +from statistics import mean, stdev +from typing import Literal +from uuid import UUID + +from beartype import beartype + +from ...autogen.openapi_model import DocReference +from .search_docs_by_embedding import search_docs_by_embedding +from .search_docs_by_text import search_docs_by_text + + +# Distribution based score normalization +# https://medium.com/plain-simple-software/distribution-based-score-fusion-dbsf-a-new-approach-to-vector-search-ranking-f87c37488b18 +def dbsf_normalize(scores: list[float]) -> list[float]: + """ + Scores scaled using minmax scaler with our custom feature range + (extremes indicated as 3 standard deviations from the mean) + """ + sd = stdev(scores) + if sd == 0: + return scores + + m = mean(scores) + m3d = 3 * sd + m + m_3d = m - 3 * sd + + return [(s - m_3d) / (m3d - m_3d) for s in scores] + + +def dbsf_fuse( + text_results: list[DocReference], + embedding_results: list[DocReference], + alpha: float = 0.7, # Weight of the embedding search results (this is a good default) +) -> list[DocReference]: + """ + Weighted reciprocal-rank fusion of text and embedding search results + """ + all_docs = {doc.id: doc for doc in text_results + embedding_results} + + text_scores: dict[UUID, float] = {doc.id: -doc.distance for doc in text_results} + + # Because these are cosine distances, we need to invert them + embedding_scores: dict[UUID, float] = { + doc.id: 1.0 - doc.distance for doc in embedding_results + } + + # normalize the scores + text_scores_normalized = dbsf_normalize(list(text_scores.values())) + text_scores = { + doc_id: score + for doc_id, score in zip(text_scores.keys(), text_scores_normalized) + } + + embedding_scores_normalized = 
+    embedding_scores = {
+        doc_id: score
+        for doc_id, score in zip(embedding_scores.keys(), embedding_scores_normalized)
+    }
+
+    # Combine the scores
+    text_weight: float = 1 - alpha
+    embedding_weight: float = alpha
+
+    combined_scores = []
+
+    for id in all_docs.keys():
+        text_score = text_weight * text_scores.get(id, 0)
+        embedding_score = embedding_weight * embedding_scores.get(id, 0)
+
+        combined_scores.append((id, text_score + embedding_score))
+
+    # Sort by the combined score
+    combined_scores = sorted(combined_scores, key=lambda x: x[1], reverse=True)
+
+    # Rank the results
+    ranked_results = []
+    for id, score in combined_scores:
+        doc = all_docs[id].model_copy()
+        doc.distance = 1.0 - score
+        ranked_results.append(doc)
+
+    return ranked_results
+
+
+@beartype
+def search_docs_hybrid(
+    *,
+    developer_id: UUID,
+    owner_type: Literal["user", "agent"],
+    owner_id: UUID,
+    query: str,
+    query_embedding: list[float],
+    k: int = 3,
+    embed_search_options: dict = {},
+    text_search_options: dict = {},
+    **kwargs,
+) -> list[DocReference]:
+    # TODO: We should probably parallelize these queries
+    text_results = search_docs_by_text(
+        developer_id=developer_id,
+        owner_type=owner_type,
+        owner_id=owner_id,
+        query=query,
+        k=2 * k,
+        **text_search_options,
+        **kwargs,
+    )
+
+    embedding_results = search_docs_by_embedding(
+        developer_id=developer_id,
+        owner_type=owner_type,
+        owner_id=owner_id,
+        query_embedding=query_embedding,
+        k=2 * k,
+        **embed_search_options,
+        **kwargs,
+    )
+
+    return dbsf_fuse(text_results, embedding_results)[:k]
diff --git a/agents-api/agents_api/models/session/prepare_chat_context.py b/agents-api/agents_api/models/session/prepare_chat_context.py
index f599a8d25..ccd49a36a 100644
--- a/agents-api/agents_api/models/session/prepare_chat_context.py
+++ b/agents-api/agents_api/models/session/prepare_chat_context.py
@@ -47,15 +47,10 @@ def prepare_chat_context(
     developer_id: UUID,
     agent_id: UUID,
     session_id: UUID,
-    # doc_query_embedding: list[float],
-    # docs_confidence: float = 0.4,
-    # k_docs: int = 3,
 ) -> tuple[list[str], dict]:
     """
-    Executes a complex query to retrieve memory context based on session ID, tool and document embeddings.
+    Executes a complex query to retrieve memory context based on session ID.
     """
-    # VECTOR_SIZE = 1024
-    # docs_radius: float = 1.0 - docs_confidence
 
     session_data_query, sd_vars = prepare_session_data.__wrapped__(
         developer_id=developer_id, session_id=session_id
@@ -89,9 +84,6 @@ def prepare_chat_context(
         }}
     """
 
-    # TODO: Implement the following queries
-    # docs_query = ...
-
     entries_query, e_vars = list_entries.__wrapped__(
         developer_id=developer_id,
         session_id=session_id,
@@ -143,8 +135,5 @@ def prepare_chat_context(
             **sd_vars,
             **t_vars,
             **e_vars,
-            "doc_query_embedding": doc_query_embedding,
-            "k_docs": k_docs,
-            "docs_radius": round(docs_radius, 2),
         },
     )
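
The fusion step above normalizes each score list with DBSF: min-max scaling against a range pinned at three standard deviations either side of the mean, followed by an alpha-weighted sum of the two normalized scores. A minimal, self-contained sketch of that normalization, with made-up scores purely for illustration:

from statistics import mean, stdev


def dbsf_normalize_sketch(scores: list[float]) -> list[float]:
    # Mirrors dbsf_normalize above: min-max scale with the feature range
    # pinned at mean -/+ 3 standard deviations of the input scores.
    sd = stdev(scores)
    if sd == 0:
        return scores
    lo, hi = mean(scores) - 3 * sd, mean(scores) + 3 * sd
    return [(s - lo) / (hi - lo) for s in scores]


print(dbsf_normalize_sketch([0.2, 0.5, 0.8]))  # -> [0.333..., 0.5, 0.666...]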
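
A rough usage sketch of the new hybrid entry point, assuming a populated CozoDB with a 1024-dimensional snippet index (the embedding_size default added above); the IDs, query, and embedding below are placeholders, not values from this patch:

from uuid import uuid4

from agents_api.models.docs.search_docs_hybrid import search_docs_hybrid

results = search_docs_hybrid(
    developer_id=uuid4(),  # placeholder developer ID
    owner_type="agent",
    owner_id=uuid4(),  # placeholder agent ID
    query="how do I reset my password?",
    query_embedding=[0.1] * 1024,  # non-zero, must match embedding_size
    k=3,
)

for doc in results:
    print(doc.id, doc.distance)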