From 385ddbfa06f1b3f92225cf56c1ca060ec6e013a1 Mon Sep 17 00:00:00 2001 From: Dmitry Paramonov Date: Fri, 25 Oct 2024 22:30:45 +0300 Subject: [PATCH] feat: Add metadata filter argument to doc search --- agents-api/agents_api/autogen/Docs.py | 3 ++- .../models/docs/search_docs_by_embedding.py | 14 +++++++++++++- .../agents_api/models/docs/search_docs_by_text.py | 12 +++++++++++- agents-api/agents_api/routers/docs/search_docs.py | 13 +++++++++++-- typespec/docs/models.tsp | 1 + .../@typespec/openapi3/openapi-0.4.0.yaml | 9 +++++++++ .../@typespec/openapi3/openapi-1.0.0.yaml | 9 +++++++++ 7 files changed, 56 insertions(+), 5 deletions(-) diff --git a/agents-api/agents_api/autogen/Docs.py b/agents-api/agents_api/autogen/Docs.py index b029a0acf..0a9c0f432 100644 --- a/agents-api/agents_api/autogen/Docs.py +++ b/agents-api/agents_api/autogen/Docs.py @@ -6,7 +6,7 @@ from typing import Annotated, Any, Literal from uuid import UUID -from pydantic import AwareDatetime, BaseModel, ConfigDict, Field +from pydantic import AwareDatetime, BaseModel, ConfigDict, Field, StrictBool class BaseDocSearchRequest(BaseModel): @@ -18,6 +18,7 @@ class BaseDocSearchRequest(BaseModel): """ The language to be used for text-only search. Support for other languages coming soon. """ + metadata_filter: dict[str, float | str | StrictBool | None] class CreateDocRequest(BaseModel): diff --git a/agents-api/agents_api/models/docs/search_docs_by_embedding.py b/agents-api/agents_api/models/docs/search_docs_by_embedding.py index e346b6b69..3e388146d 100644 --- a/agents-api/agents_api/models/docs/search_docs_by_embedding.py +++ b/agents-api/agents_api/models/docs/search_docs_by_embedding.py @@ -1,5 +1,6 @@ """This module contains functions for searching documents in the CozoDB based on embedding queries.""" +import json from typing import Any, Literal, TypeVar from uuid import UUID @@ -51,6 +52,7 @@ def search_docs_by_embedding( ef: int = 50, mmr_lambda: float = 0.5, embedding_size: int = 1024, + metadata_filter: dict[str, Any] = {}, ) -> tuple[list[str], dict]: """ Searches for document snippets in CozoDB by embedding query. @@ -62,11 +64,20 @@ def search_docs_by_embedding( k (int, optional): The number of nearest neighbors to retrieve. Defaults to 3. confidence (float, optional): The confidence threshold for filtering results. Defaults to 0.8. mmr_lambda (float, optional): The lambda parameter for MMR. Defaults to 0.25. + embedding_size (int): Embedding vector length + metadata_filter (dict[str, Any]): Dictionary to filter agents based on metadata. """ assert len(query_embedding) == embedding_size assert sum(query_embedding) + metadata_filter_str = ", ".join( + [ + f"metadata->{json.dumps(k)} == {json.dumps(v)}" + for k, v in metadata_filter.items() + ] + ) + owners: list[list[str]] = [ [owner_type, str(owner_id)] for owner_type, owner_id in owners ] @@ -92,7 +103,8 @@ def search_docs_by_embedding( owner_type, owner_id, doc_id - }} + }}, + {metadata_filter_str} intersnippet_distance[ doc_id, diff --git a/agents-api/agents_api/models/docs/search_docs_by_text.py b/agents-api/agents_api/models/docs/search_docs_by_text.py index 34ee8cb00..d63c594db 100644 --- a/agents-api/agents_api/models/docs/search_docs_by_text.py +++ b/agents-api/agents_api/models/docs/search_docs_by_text.py @@ -1,5 +1,6 @@ """This module contains functions for searching documents in the CozoDB based on embedding queries.""" +import json import re from typing import Any, Literal, TypeVar from uuid import UUID @@ -49,6 +50,7 @@ def search_docs_by_text( owners: list[tuple[Literal["user", "agent"], UUID]], query: str, k: int = 3, + metadata_filter: dict[str, Any] = {}, ) -> tuple[list[str], dict]: """ Searches for document snippets in CozoDB by embedding query. @@ -57,7 +59,14 @@ def search_docs_by_text( owners (list[tuple[Literal["user", "agent"], UUID]]): The type of the owner of the documents. query (str): The query string. k (int, optional): The number of nearest neighbors to retrieve. Defaults to 3. + metadata_filter (dict[str, Any]): Dictionary to filter agents based on metadata. """ + metadata_filter_str = ", ".join( + [ + f"metadata->{json.dumps(k)} == {json.dumps(v)}" + for k, v in metadata_filter.items() + ] + ) owners: list[list[str]] = [ [owner_type, str(owner_id)] for owner_type, owner_id in owners @@ -84,7 +93,8 @@ def search_docs_by_text( owner_type, owner_id, doc_id - }} + }}, + {metadata_filter_str} search_result[ doc_id, diff --git a/agents-api/agents_api/routers/docs/search_docs.py b/agents-api/agents_api/routers/docs/search_docs.py index ce0b62811..bab2875f5 100644 --- a/agents-api/agents_api/routers/docs/search_docs.py +++ b/agents-api/agents_api/routers/docs/search_docs.py @@ -25,21 +25,28 @@ def get_search_fn_and_params( search_fn, params = None, None match search_params: - case TextOnlyDocSearchRequest(text=query, limit=k): + case TextOnlyDocSearchRequest( + text=query, limit=k, metadata_filter=metadata_filter + ): search_fn = search_docs_by_text params = dict( query=query, k=k, + metadata_filter=metadata_filter, ) case VectorDocSearchRequest( - vector=query_embedding, limit=k, confidence=confidence + vector=query_embedding, + limit=k, + confidence=confidence, + metadata_filter=metadata_filter, ): search_fn = search_docs_by_embedding params = dict( query_embedding=query_embedding, k=k, confidence=confidence, + metadata_filter=metadata_filter, ) case HybridDocSearchRequest( @@ -48,6 +55,7 @@ def get_search_fn_and_params( limit=k, confidence=confidence, alpha=alpha, + metadata_filter=metadata_filter, ): search_fn = search_docs_hybrid params = dict( @@ -56,6 +64,7 @@ def get_search_fn_and_params( k=k, embed_search_options=dict(confidence=confidence), alpha=alpha, + metadata_filter=metadata_filter, ) return search_fn, params diff --git a/typespec/docs/models.tsp b/typespec/docs/models.tsp index dca287222..c2e8859d6 100644 --- a/typespec/docs/models.tsp +++ b/typespec/docs/models.tsp @@ -82,6 +82,7 @@ model BaseDocSearchRequest { /** The language to be used for text-only search. Support for other languages coming soon. */ lang: "en-US" = "en-US"; + metadata_filter: MetadataFilter, } model VectorDocSearchRequest extends BaseDocSearchRequest { diff --git a/typespec/tsp-output/@typespec/openapi3/openapi-0.4.0.yaml b/typespec/tsp-output/@typespec/openapi3/openapi-0.4.0.yaml index 72628ab49..489b96fa7 100644 --- a/typespec/tsp-output/@typespec/openapi3/openapi-0.4.0.yaml +++ b/typespec/tsp-output/@typespec/openapi3/openapi-0.4.0.yaml @@ -2425,6 +2425,7 @@ components: required: - limit - lang + - metadata_filter properties: limit: type: integer @@ -2438,6 +2439,14 @@ components: - en-US description: The language to be used for text-only search. Support for other languages coming soon. default: en-US + metadata_filter: + type: object + additionalProperties: + anyOf: + - type: number + - type: string + - type: boolean + nullable: true Docs.CreateDocRequest: type: object required: diff --git a/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml b/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml index 118d8adc8..06630cb44 100644 --- a/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml +++ b/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml @@ -2425,6 +2425,7 @@ components: required: - limit - lang + - metadata_filter properties: limit: type: integer @@ -2438,6 +2439,14 @@ components: - en-US description: The language to be used for text-only search. Support for other languages coming soon. default: en-US + metadata_filter: + type: object + additionalProperties: + anyOf: + - type: number + - type: string + - type: boolean + nullable: true Docs.CreateDocRequest: type: object required: