From 7a83a981e5601d340d58e84222461cf523554f23 Mon Sep 17 00:00:00 2001 From: Tyler <41713505+Tylersuard@users.noreply.github.com> Date: Sat, 8 Jul 2023 21:49:56 -0700 Subject: [PATCH 1/4] Add get_all_ids_from_index() --- pinecone/index.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/pinecone/index.py b/pinecone/index.py index cd6f22e6..77cedbad 100644 --- a/pinecone/index.py +++ b/pinecone/index.py @@ -69,6 +69,41 @@ def __init__(self, index_name: str, pool_threads=1): self.user_agent = get_user_agent() self._vector_api = VectorOperationsApi(self) + + import numpy as np + def get_ids_from_query(self,input_vector): + "Helper function for get_all_ids_from_index()" + print("searching pinecone...") + results = index.query(vector=input_vector, top_k=10000,include_values=False) + ids = set() + print(type(results)) + for result in results['matches']: + ids.add(result['id']) + return ids + + def get_all_ids_from_index(index, num_dimensions, namespace=""): + """Get all ids for all vectors in the index. 
+ + Example usage: + + all_ids = get_all_ids_from_index(index, num_dimensions=1536, namespace="") + print(all_ids)""" + + num_vectors = index.describe_index_stats()["namespaces"][namespace]['vector_count'] + all_ids = set() + while len(all_ids) < num_vectors: + print("Length of ids list is shorter than the number of total vectors...") + input_vector = np.random.rand(num_dimensions).tolist() + print("creating random vector...") + ids = get_ids_from_query(index,input_vector) + print("getting ids from a vector query...") + all_ids.update(ids) + print("updating ids set...") + print(f"Collected {len(all_ids)} ids out of {num_vectors}.") + + return all_ids + + @validate_and_convert_errors def upsert(self, vectors: Union[List[Vector], List[tuple], List[dict]], From 7411778dd5f027a2968c60281f7d9e65bb426dc4 Mon Sep 17 00:00:00 2001 From: Tyler <41713505+Tylersuard@users.noreply.github.com> Date: Sat, 8 Jul 2023 21:53:02 -0700 Subject: [PATCH 2/4] Update README.md with code for getting all vector ids --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 272692c6..6e409124 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,19 @@ query_response = index.query( ) ``` +## Get All Vector IDs + +The following example retrieves the ID for every vector in the Index. +```python +import pinecone + +pinecone.init(api_key="YOUR_API_KEY", environment="us-west1-gcp") +index = pinecone.Index("example-index") + +all_ids = index.get_all_ids_from_index(num_dimensions=1536, namespace="") +print(all_ids) +``` + ## Delete vectors The following example deletes vectors by ID. 
From fb1d549f910055fae7b420be5c1c7ea53f5421d7 Mon Sep 17 00:00:00 2001 From: Tyler <41713505+Tylersuard@users.noreply.github.com> Date: Sat, 8 Jul 2023 21:55:19 -0700 Subject: [PATCH 3/4] Update index.py --- pinecone/index.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pinecone/index.py b/pinecone/index.py index 77cedbad..74b3d97d 100644 --- a/pinecone/index.py +++ b/pinecone/index.py @@ -69,27 +69,25 @@ def __init__(self, index_name: str, pool_threads=1): self.user_agent = get_user_agent() self._vector_api = VectorOperationsApi(self) - - import numpy as np def get_ids_from_query(self,input_vector): "Helper function for get_all_ids_from_index()" print("searching pinecone...") - results = index.query(vector=input_vector, top_k=10000,include_values=False) + results = self.query(vector=input_vector, top_k=10000,include_values=False) ids = set() print(type(results)) for result in results['matches']: ids.add(result['id']) return ids - def get_all_ids_from_index(index, num_dimensions, namespace=""): + def get_all_ids_from_index(self, num_dimensions, namespace=""): """Get all ids for all vectors in the index. 
Example usage: - all_ids = get_all_ids_from_index(index, num_dimensions=1536, namespace="") + all_ids = index.get_all_ids_from_index(num_dimensions=1536, namespace="") print(all_ids)""" - num_vectors = index.describe_index_stats()["namespaces"][namespace]['vector_count'] + num_vectors = self.describe_index_stats()["namespaces"][namespace]['vector_count'] all_ids = set() while len(all_ids) < num_vectors: print("Length of ids list is shorter than the number of total vectors...") From 4cf2864619521458e64277491c26e30ffd82d84f Mon Sep 17 00:00:00 2001 From: Tyler <41713505+Tylersuard@users.noreply.github.com> Date: Sat, 8 Jul 2023 21:56:09 -0700 Subject: [PATCH 4/4] Update index.py --- pinecone/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pinecone/index.py b/pinecone/index.py index 74b3d97d..c7a5ebb5 100644 --- a/pinecone/index.py +++ b/pinecone/index.py @@ -93,7 +93,7 @@ def get_all_ids_from_index(self, num_dimensions, namespace=""): print("Length of ids list is shorter than the number of total vectors...") input_vector = np.random.rand(num_dimensions).tolist() print("creating random vector...") - ids = get_ids_from_query(index,input_vector) + ids = self.get_ids_from_query(input_vector) print("getting ids from a vector query...") all_ids.update(ids) print("updating ids set...")