diff --git a/README.md b/README.md
index 5cd33b32..3f3a9c81 100644
--- a/README.md
+++ b/README.md
@@ -110,7 +110,7 @@ docs = ann.get_docs(limit=10, offset=0, order_by='x', ascending=True)
 After you have indexed the `docs`, you can update the docs in the index by calling `ann.update()`:
 
 ```python
-updated_docs = [{'id': '0', 'embedding': [], 'price': 6}]
+updated_docs = [{'id': '0', 'embedding': np.random.random([128]).astype(np.float32), 'price': 6}]
 ann.update(updated_docs)
 ```
 
diff --git a/annlite/index.py b/annlite/index.py
index 22dd8b36..d642bcfe 100644
--- a/annlite/index.py
+++ b/annlite/index.py
@@ -56,6 +56,7 @@ def __init__(
         self,
         n_dim: int,
         metric: Union[str, Metric] = 'cosine',
+        embedding_field: str = 'embedding',
         n_cells: int = 1,
         n_subvectors: Optional[int] = None,
         n_clusters: Optional[int] = 256,
@@ -90,6 +91,7 @@ def __init__(
         self.n_probe = max(n_probe, n_cells)
         self.n_cells = n_cells
         self.size_limit = 2048
+        self._embedding_field = embedding_field
 
         if isinstance(metric, str):
             metric = Metric.from_string(metric)
@@ -172,7 +174,7 @@ def __init__(
         total_size = 0
         # TODO: add a progress bar
         for docs in self.documents_generator(0, batch_size=1024):
-            x = np.array([doc['embedding'] for doc in docs])
+            x = np.array([doc[self._embedding_field] for doc in docs])
             total_size += x.shape[0]
             self.partial_train(x, auto_save=True, force_train=True)
             if total_size >= MAX_TRAINING_DATA_SIZE:
@@ -280,8 +282,7 @@ def index(self, docs: 'List', **kwargs):
         if not self.is_trained:
             raise RuntimeError(f'The indexer is not trained, cannot add new documents')
 
-        # TODO: Obtain the embeddings from the dict or change index signature
-        x = np.array([doc['embedding'] for doc in docs])
+        x = np.array([doc[self._embedding_field] for doc in docs])
         n_data, _ = self._sanity_check(x)
 
         assigned_cells = (
@@ -312,7 +313,7 @@ def update(
         if not self.is_trained:
             raise RuntimeError(f'The indexer is not trained, cannot add new documents')
 
         # TODO: Obtain the embeddings from the dict or change index signature
-        x = np.array([doc['embedding'] for doc in docs])
+        x = np.array([doc[self._embedding_field] for doc in docs])
         n_data, _ = self._sanity_check(x)
         assigned_cells = (
@@ -347,7 +348,7 @@ def search(
         if not self.is_trained:
             raise RuntimeError(f'The indexer is not trained, cannot add new documents')
 
-        query_np = np.array([doc['embedding'] for doc in docs])
+        query_np = np.array([doc[self._embedding_field] for doc in docs])
 
         _, match_docs = self.search_by_vectors(
             query_np, filter=filter, limit=limit, include_metadata=include_metadata
@@ -778,7 +779,7 @@ def _rebuild_index_from_local(self):
                 f'Rebuild the index of cell-{cell_id} ({cell_size} docs)...'
             )
             for docs in self.documents_generator(cell_id, batch_size=10240):
-                x = np.array([doc['embedding'] for doc in docs])
+                x = np.array([doc[self._embedding_field] for doc in docs])
                 assigned_cells = np.ones(len(docs), dtype=np.int64) * cell_id
                 super().insert(x, assigned_cells, docs, only_index=True)
 
diff --git a/tests/test_index.py b/tests/test_index.py
index 1ef0e3f8..9ded2524 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -268,7 +268,6 @@ def test_local_backup_restore(tmpdir):
     index = AnnLite(n_dim=D, data_path=tmpdir / 'workspace' / '0')
     index.index(docs)
 
-    tmpname = uuid.uuid4().hex
     index.backup()
     index.close()
 
@@ -278,3 +277,24 @@
     status = index.stat
     assert int(status['total_docs']) == N
     assert int(status['index_size']) == N
+
+
+def test_index_search_different_field(tmpdir):
+    X = np.random.random((N, D)).astype(
+        np.float32
+    )  # 10,000 128-dim vectors to be indexed
+
+    index = AnnLite(
+        n_dim=D, data_path=str(tmpdir), embedding_field='encoding', metric='euclidean'
+    )
+    docs = [dict(id=f'{i}', encoding=X[i]) for i in range(N)]
+    index.index(docs)
+    query = [dict(encoding=X[i]) for i in range(5)]
+
+    matches = index.search(query)
+
+    for i in range(len(matches[0]) - 1):
+        assert (
+            matches[0][i]['scores']['euclidean']
+            <= matches[0][i + 1]['scores']['euclidean']
+        )