Skip to content

Commit

Permalink
MMR implementation (#770)
Browse files Browse the repository at this point in the history
<!-- ELLIPSIS_HIDDEN -->


> [!IMPORTANT]
> Implements Maximal Marginal Relevance (MMR) in document search with a
new `mmr_strength` parameter, updating search logic and API.
> 
>   - **Behavior**:
> - Introduces `mmr_strength` parameter in `BaseDocSearchRequest` in
`Docs.py` to control MMR behavior.
> - Implements MMR logic in `maximal_marginal_relevance()` in `mmr.py`.
> - Integrates MMR in `search_user_docs()` and `search_agent_docs()` in
`search_docs.py`.
>   - **Search Logic**:
> - Modifies `search_docs_by_embedding()` and `search_docs_hybrid()` to
adjust `k` based on `mmr_strength`.
> - Adds `embedding` field to `Snippet` model in `Docs.py` and
`models.tsp`.
>   - **Dependencies**:
> - Adds `simsimd` to `pyproject.toml` for optimized cosine similarity
calculations.
>   - **Misc**:
> - Updates OpenAPI spec in `openapi-1.0.0.yaml` to include
`mmr_strength` and `embedding` fields.
> 
> <sup>This description was created by </sup>[<img alt="Ellipsis"
src="https://img.shields.io/badge/Ellipsis-blue?color=175173">](https://www.ellipsis.dev?ref=julep-ai%2Fjulep&utm_source=github&utm_medium=referral)<sup>
for 583f70a. It will automatically
update as commits are pushed.</sup>


<!-- ELLIPSIS_HIDDEN -->

---------

Signed-off-by: Diwank Singh Tomer <[email protected]>
Co-authored-by: Diwank Singh Tomer <[email protected]>
  • Loading branch information
whiterabbit1983 and creatorrr authored Oct 30, 2024
1 parent 7f3bee0 commit 74982fa
Show file tree
Hide file tree
Showing 12 changed files with 341 additions and 114 deletions.
5 changes: 5 additions & 0 deletions agents-api/agents_api/autogen/Docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ class BaseDocSearchRequest(BaseModel):
The language to be used for text-only search. Support for other languages coming soon.
"""
metadata_filter: dict[str, float | str | StrictBool | None] = {}
mmr_strength: Annotated[float, Field(ge=0.0, lt=1.0)] = 0
"""
MMR Strength (mmr_strength = 1 - mmr_lambda)
"""


class CreateDocRequest(BaseModel):
Expand Down Expand Up @@ -176,6 +180,7 @@ class Snippet(BaseModel):
)
index: int
content: str
embedding: list[float] | None = None


class TextOnlyDocSearchRequest(BaseDocSearchRequest):
Expand Down
106 changes: 106 additions & 0 deletions agents-api/agents_api/models/docs/mmr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from __future__ import annotations

import logging
from typing import Union

import numpy as np

# A 2-D collection of floats in any of the shapes the helpers below accept:
# a list of row-lists, a list of 1-D numpy arrays, or a numpy ndarray.
Matrix = Union[list[list[float]], list[np.ndarray], np.ndarray]

# Module-level logger; used to report the simsimd -> NumPy fallback.
logger = logging.getLogger(__name__)


def _cosine_similarity(x: Matrix, y: Matrix) -> np.ndarray:
"""Row-wise cosine similarity between two equal-width matrices.
Args:
x: A matrix of shape (n, m).
y: A matrix of shape (k, m).
Returns:
A matrix of shape (n, k) where each element (i, j) is the cosine similarity
between the ith row of X and the jth row of Y.
Raises:
ValueError: If the number of columns in X and Y are not the same.
ImportError: If numpy is not installed.
"""

if len(x) == 0 or len(y) == 0:
return np.array([])

x = np.array(x)
y = np.array(y)
if x.shape[1] != y.shape[1]:
msg = (
f"Number of columns in X and Y must be the same. X has shape {x.shape} "
f"and Y has shape {y.shape}."
)
raise ValueError(msg)
try:
import simsimd as simd # type: ignore

x = np.array(x, dtype=np.float32)
y = np.array(y, dtype=np.float32)
z = 1 - np.array(simd.cdist(x, y, metric="cosine"))
return z
except ImportError:
logger.debug(
"Unable to import simsimd, defaulting to NumPy implementation. If you want "
"to use simsimd please install with `pip install simsimd`."
)
x_norm = np.linalg.norm(x, axis=1)
y_norm = np.linalg.norm(y, axis=1)
# Ignore divide by zero errors run time warnings as those are handled below.
with np.errstate(divide="ignore", invalid="ignore"):
similarity = np.dot(x, y.T) / np.outer(x_norm, y_norm)
similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
return similarity


def maximal_marginal_relevance(
    query_embedding: np.ndarray,
    embedding_list: list,
    lambda_mult: float = 0.5,
    k: int = 4,
) -> list[int]:
    """Select embeddings by maximal marginal relevance (MMR).

    MMR greedily picks, at each step, the candidate maximizing
    ``lambda_mult * sim(candidate, query)
    - (1 - lambda_mult) * max(sim(candidate, selected))``,
    trading relevance to the query against redundancy with what has
    already been chosen.

    Args:
        query_embedding: The query embedding, shape (m,) or (1, m).
        embedding_list: A list of candidate embeddings, each of width m.
        lambda_mult: Relevance/diversity trade-off; 1 is pure relevance,
            0 is pure diversity. Default is 0.5.
        k: The number of embeddings to return. Default is 4.

    Returns:
        Indices into ``embedding_list`` of the selected embeddings, in
        selection order. At most ``min(k, len(embedding_list))`` indices.
    """

    n_select = min(k, len(embedding_list))
    if n_select <= 0:
        return []
    if query_embedding.ndim == 1:
        query_embedding = np.expand_dims(query_embedding, axis=0)

    candidates = np.asarray(embedding_list)
    similarity_to_query = _cosine_similarity(query_embedding, candidates)[0]
    most_similar = int(np.argmax(similarity_to_query))
    idxs = [most_similar]

    # Running max similarity of every candidate to the selected set.
    # Updated incrementally against only the newest pick (O(n) per step)
    # instead of recomputing similarity to the whole selected set each
    # iteration, which the naive formulation does.
    redundancy = _cosine_similarity(candidates, candidates[most_similar : most_similar + 1])[:, 0]

    while len(idxs) < n_select:
        scores = lambda_mult * similarity_to_query - (1 - lambda_mult) * redundancy
        # Exclude already-selected indices; np.argmax keeps the same
        # first-maximum tie-breaking as the original ascending scan.
        scores[idxs] = -np.inf
        idx_to_add = int(np.argmax(scores))
        idxs.append(idx_to_add)
        redundancy = np.maximum(
            redundancy,
            _cosine_similarity(candidates, candidates[idx_to_add : idx_to_add + 1])[:, 0],
        )
    return idxs
108 changes: 17 additions & 91 deletions agents-api/agents_api/models/docs/search_docs_by_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ def search_docs_by_embedding(
k: int = 3,
confidence: float = 0.5,
ef: int = 50,
mmr_strength: float = 0.0,
embedding_size: int = 1024,
ann_threshold: int = 1_000_000,
metadata_filter: dict[str, Any] = {},
Expand All @@ -71,9 +70,6 @@ def search_docs_by_embedding(

assert len(query_embedding) == embedding_size
assert sum(query_embedding)
assert 0 <= mmr_strength < 1, "MMR strength must be in [0, 1) interval"

mmr_lambda: float = 1 - mmr_strength

metadata_filter_str = ", ".join(
[
Expand Down Expand Up @@ -138,6 +134,7 @@ def search_docs_by_embedding(
title,
content,
distance,
embedding,
] :=
# Get input values
input[owner_type, owner_id, query],
Expand All @@ -157,10 +154,11 @@ def search_docs_by_embedding(
content
|
query: query,
k: {k*(3 if mmr_strength else 1)}, # Get more candidates for diversity
k: {k},
ef: {ef},
radius: {radius},
bind_distance: distance,
bind_vector: embedding,
}}
:create _search_result {{
Expand All @@ -169,6 +167,7 @@ def search_docs_by_embedding(
title,
content,
distance,
embedding,
}}
}}
Expand All @@ -190,6 +189,7 @@ def search_docs_by_embedding(
title,
content,
distance,
embedding,
] :=
# Get input values
input[owner_type, owner_id, query],
Expand All @@ -213,14 +213,15 @@ def search_docs_by_embedding(
distance = cos_dist(query, embedding),
distance <= {radius}
:limit {k*(3 if mmr_strength else 1)} # Get more candidates for diversity
:limit {k} # Get more candidates for diversity
:create _search_result {{
doc_id,
index,
title,
content,
distance,
embedding,
}}
}}
%end
Expand All @@ -235,103 +236,24 @@ def search_docs_by_embedding(
doc_id,
snippet_data,
distance,
mmr_score,
title,
embedding,
] :=
owners[owner_type, owner_id_str],
owner_id = to_uuid(owner_id_str),
*_search_result{{ doc_id, index, title, content, distance }},
mmr_score = distance,
*_search_result{{ doc_id, index, title, content, distance, embedding, }},
snippet_data = [index, content]
# Sort the results by distance to find the closest matches
:sort -mmr_score
:limit {k*(3 if mmr_strength else 1)} # Get more candidates for diversity
:create _interim {{
owner_type,
owner_id,
doc_id,
snippet_data,
distance,
mmr_score,
title,
}}
"""

mmr_interim_query = f"""
owners[owner_type, owner_id] <- $owners
# Calculate the min distance between every doc and every snippet being compared
intersnippet_distance[
doc_id,
index1,
min(dist)
] :=
*_search_result{{ doc_id: doc_id2, index: index2 }},
*snippets {{
doc_id,
index: index1,
embedding: embedding1
}},
*snippets {{
doc_id: doc_id2,
index: index2,
embedding: embedding2
}},
is_null(embedding1) == false,
is_null(embedding2) == false,
# When doc_id == doc_id2, dont compare the same snippet
doc_id != doc_id2 || index1 != index2,
dist = cos_dist(embedding1, embedding2)
apply_mmr[
doc_id,
title,
snippet_data,
distance,
mmr_score,
] :=
*_search_result{{ doc_id, index, title, content, distance: original_distance }},
intersnippet_distance[doc_id, index, intersnippet_distance],
mmr_score = ({mmr_lambda} * original_distance) - ((1.0 - {mmr_lambda}) * intersnippet_distance),
distance = max(0.0, min(1.0 - mmr_score, 1.0)),
snippet_data = [index, content]
?[
owner_type,
owner_id,
doc_id,
snippet_data,
distance,
mmr_score,
title,
] :=
owners[owner_type, owner_id_str],
owner_id = to_uuid(owner_id_str),
apply_mmr[
doc_id,
title,
snippet_data,
distance,
mmr_score,
]
# Sort the results by distance to find the closest matches
:sort -mmr_score
:limit {k}
:limit {k} # Get more candidates for diversity
:create _interim {{
owner_type,
owner_id,
doc_id,
snippet_data,
distance,
mmr_score,
title,
embedding,
}}
"""

Expand All @@ -343,6 +265,7 @@ def search_docs_by_embedding(
unique(snippet_data),
distance,
title,
embedding,
] :=
*_interim {
owner_type,
Expand All @@ -351,6 +274,7 @@ def search_docs_by_embedding(
snippet_data,
distance,
title,
embedding,
}
m[
Expand All @@ -368,10 +292,12 @@ def search_docs_by_embedding(
snippet_data,
distance,
title,
embedding,
],
snippet = {
"index": snippet_datum->0,
"content": snippet_datum->1
"content": snippet_datum->1,
"embedding": embedding,
},
snippet_datum in snippet_data
Expand Down Expand Up @@ -408,7 +334,7 @@ def search_docs_by_embedding(
{{ {verify_query} }}
{{ {determine_knn_ann_query} }}
{search_query}
{{ {normal_interim_query if mmr_strength == 0.0 else mmr_interim_query} }}
{{ {normal_interim_query} }}
{{ {collect_query} }}
"""

Expand Down
4 changes: 2 additions & 2 deletions agents-api/agents_api/models/docs/search_docs_hybrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def search_docs_hybrid(
developer_id=developer_id,
owners=owners,
query=query,
k=2 * k,
k=k,
metadata_filter=metadata_filter,
**text_search_options,
)
Expand All @@ -116,7 +116,7 @@ def search_docs_hybrid(
developer_id=developer_id,
owners=owners,
query_embedding=query_embedding,
k=2 * k,
k=k,
metadata_filter=metadata_filter,
**embed_search_options,
)
Expand Down
Loading

0 comments on commit 74982fa

Please sign in to comment.