This repository has been archived by the owner on Nov 13, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 122
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Qdrant as a supported knowledge base (#244)
* feat: QdrantKnowledgeBase * feat: Async QdrantKnowledgeBase * test: updated async tests * test: async tests, refactor * chore: linting * docs: Added QdrantKnowledgeBase docstrings * chore: added QdrantKnowledgeBase.from_config() * docs: fix typos * chore: Bumped qdrant_client 1.7.2 * chore: resolve typings, default pytest-dotenv * chore: optional import qdrant_client * chore: Use distance TitleCase as docs * docs: Qdrant reference library.md * chore: Bump qdrant_client pyproject.toml
- Loading branch information
Showing
14 changed files
with
1,757 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
from .knowledge_base import list_canopy_indexes | ||
from .knowledge_base import KnowledgeBase | ||
from .qdrant.qdrant_knowledge_base import QdrantKnowledgeBase |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from canopy.knowledge_base.knowledge_base import INDEX_NAME_PREFIX | ||
|
||
# Qdrant collections reuse the same name prefix as the default knowledge base
# indexes.
COLLECTION_NAME_PREFIX = INDEX_NAME_PREFIX

# Names under which the dense and the sparse vector of each point are stored.
DENSE_VECTOR_NAME = "dense"
SPARSE_VECTOR_NAME = "sparse"

# Payload keys that hold chunk fields rather than user metadata.
# NOTE(review): presumably used to keep user metadata from clashing with these
# keys -- confirm against the code that consumes this set.
RESERVED_METADATA_KEYS = {"document_id", "text", "source", "chunk_id"}

# Namespace (a UUID string) used to derive deterministic point IDs from chunk
# IDs.
UUID_NAMESPACE = "867603e3-ba69-447d-a8ef-263dff19bda7"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
from copy import deepcopy | ||
from typing import Dict, List, Any, Union | ||
import uuid | ||
from canopy.knowledge_base.models import ( | ||
KBDocChunkWithScore, | ||
KBEncodedDocChunk, | ||
KBQuery, | ||
VectorValues, | ||
) | ||
from pinecone_text.sparse import SparseVector | ||
|
||
try: | ||
from qdrant_client import models | ||
except ImportError: | ||
pass | ||
|
||
from canopy.knowledge_base.qdrant.constants import ( | ||
DENSE_VECTOR_NAME, | ||
SPARSE_VECTOR_NAME, | ||
UUID_NAMESPACE, | ||
) | ||
|
||
|
||
class QdrantConverter:
    """Stateless helpers that translate between Canopy and Qdrant models."""

    # Parsed once at class creation: the namespace is a constant, so there is
    # no need to re-parse the string for every converted ID.
    _ID_NAMESPACE = uuid.UUID(UUID_NAMESPACE)

    @staticmethod
    def convert_id(_id: str) -> str:
        """
        Converts any string into a UUID string based on a seed.

        Qdrant accepts UUID strings and unsigned integers as point ID.
        We use a seed to convert each string into a UUID string
        deterministically. This allows us to overwrite the same point with
        the original ID.

        Args:
            _id: The original (arbitrary string) ID.

        Returns:
            The UUIDv5 string derived from ``_id`` and the fixed namespace.
        """
        return str(uuid.uuid5(QdrantConverter._ID_NAMESPACE, _id))

    @staticmethod
    def encoded_docs_to_points(
        encoded_docs: List[KBEncodedDocChunk],
    ) -> "List[models.PointStruct]":
        """
        Converts encoded document chunks into Qdrant points.

        Each chunk is turned into a record via ``to_db_record()``. The record
        must contain ``"id"`` and ``"metadata"``; ``"values"`` (dense vector)
        and ``"sparse_values"`` (sparse vector) are optional and are only
        attached to the point when present and non-empty.

        Args:
            encoded_docs: The encoded chunks to convert.

        Returns:
            One point per chunk, in the same order. The point ID is the
            converted chunk ID, and the original chunk ID is kept in the
            payload under ``"chunk_id"`` next to the record's metadata.
        """
        points = []
        for doc in encoded_docs:
            record = doc.to_db_record()
            _id: str = record.pop("id")
            dense_vector = record.pop("values", None)
            sparse_vector = record.pop("sparse_values", None)

            # Named vectors: only the representations that exist are stored.
            vector: Dict[str, models.Vector] = {}

            if dense_vector:
                vector[DENSE_VECTOR_NAME] = dense_vector

            if sparse_vector:
                vector[SPARSE_VECTOR_NAME] = models.SparseVector(
                    indices=sparse_vector["indices"],
                    values=sparse_vector["values"],
                )

            points.append(
                models.PointStruct(
                    id=QdrantConverter.convert_id(_id),
                    vector=vector,
                    # The original ID is stored in the payload because the
                    # point ID itself is a derived UUID.
                    payload={**record["metadata"], "chunk_id": _id},
                )
            )
        return points

    @staticmethod
    def scored_point_to_scored_doc(
        scored_point,
    ) -> "KBDocChunkWithScore":
        """
        Converts a Qdrant scored point into a scored document chunk.

        The payload is deep-copied before the chunk fields are removed from
        it, so ``scored_point.payload`` itself is never modified. Whatever
        remains after removing ``"chunk_id"``, ``"text"``, ``"document_id"``
        and ``"source"`` becomes the chunk's metadata.

        Args:
            scored_point: An object exposing ``payload`` and ``score``.

        Returns:
            The chunk rebuilt from the payload, carrying the point's score.
            ``text`` and ``source`` default to an empty string when absent.

        Raises:
            KeyError: If the payload is missing (or empty) or lacks
                ``"chunk_id"`` or ``"document_id"``.
        """
        metadata: Dict[str, Any] = deepcopy(scored_point.payload or {})
        _id = metadata.pop("chunk_id")
        text = metadata.pop("text", "")
        document_id = metadata.pop("document_id")
        source = metadata.pop("source", "")
        return KBDocChunkWithScore(
            id=_id,
            text=text,
            document_id=document_id,
            score=scored_point.score,
            source=source,
            metadata=metadata,
        )

    @staticmethod
    def kb_query_to_search_vector(
        query: KBQuery,
    ) -> "Union[models.NamedVector, models.NamedSparseVector]":
        """
        Builds the named search vector for a knowledge base query.

        The dense vector is used if available, otherwise the sparse vector.

        Args:
            query: The query holding ``values`` and/or ``sparse_values``.

        Returns:
            A named dense vector when ``query.values`` is non-empty,
            otherwise a named sparse vector built from
            ``query.sparse_values``.

        Raises:
            ValueError: If the query has neither a dense nor a sparse vector.
        """
        if query.values:
            return models.NamedVector(
                name=DENSE_VECTOR_NAME,
                vector=query.values,
            )

        if query.sparse_values:
            return models.NamedSparseVector(
                name=SPARSE_VECTOR_NAME,
                vector=models.SparseVector(
                    indices=query.sparse_values["indices"],
                    values=query.sparse_values["values"],
                ),
            )

        raise ValueError("Query should have either dense or sparse vector.")
Oops, something went wrong.