From 8474e95705ac76990248325db03f3ee26d8030f3 Mon Sep 17 00:00:00 2001
From: Buqian Zheng
Date: Wed, 10 Apr 2024 18:04:05 +0800
Subject: [PATCH] Update hello_hybrid_sparse_dense.py example to include BGE
 reranker

Signed-off-by: Buqian Zheng
---
 examples/hello_hybrid_sparse_dense.py | 52 ++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/examples/hello_hybrid_sparse_dense.py b/examples/hello_hybrid_sparse_dense.py
index 08deb9d43..1a167750e 100644
--- a/examples/hello_hybrid_sparse_dense.py
+++ b/examples/hello_hybrid_sparse_dense.py
@@ -1,10 +1,26 @@
 # A demo showing hybrid semantic search with dense and sparse vectors using Milvus.
+#
 # You can optionally choose to use the BGE-M3 model to embed the text as dense
-# and sparse vectors, or simply use random generated vectors as the example.
-
-# To use BGE-M3 model, you need to install the optional `model` module in pymilvus:
+# and sparse vectors, or simply use randomly generated vectors as an example.
+#
+# You can also use the BGE CrossEncoder model to rerank the search results.
+#
+# Note that the sparse vector search feature is only available in Milvus 2.4.0 or
+# later. Make sure you follow https://milvus.io/docs/install_standalone-docker.md
+# to set up the latest version of Milvus in your local environment.
+
+# To connect to the Milvus server, you need the Python client library pymilvus.
+# To use the BGE-M3 model, you also need to install the optional `model` module in pymilvus.
+# You can install both by running the following commands:
+#
+# pip install pymilvus
 # pip install pymilvus[model]
+
+# If True, use the BGE-M3 model to generate dense and sparse vectors.
+# If False, use random numbers to compose dense and sparse vectors.
 use_bge_m3 = True
+# If True, the search results will be reranked using the BGE CrossEncoder model.
+use_reranker = True
 
 # The overall steps are as follows:
 # 1. embed the text as dense and sparse vectors
@@ -104,12 +120,32 @@ def random_embedding(texts):
 # Currently Milvus only support 1 query in the same hybrid search request, so
 # we inspect res[0] directly. In future release Milvus will accept batch
 # hybrid search queries in the same call.
-for hit in res[0]:
-    print(f'text: {hit.fields["text"]} distance {hit.distance}')
-
-# If you are using BGE-M3 to generate the embedding, you should see the following:
+res = res[0]
+
+if use_reranker:
+    result_texts = [hit.fields["text"] for hit in res]
+    from pymilvus.model.reranker import BGERerankFunction
+    bge_rf = BGERerankFunction(device='cpu')
+    # rerank the results using the BGE CrossEncoder model
+    results = bge_rf(query, result_texts, top_k=2)
+    for hit in results:
+        print(f'text: {hit.text} distance {hit.score}')
+else:
+    for hit in res:
+        print(f'text: {hit.fields["text"]} distance {hit.distance}')
+
+# If you used both BGE-M3 and the reranker, you should see the following:
+# text: Alan Turing was the first person to conduct substantial research in AI. distance 0.9306981017573297
+# text: Artificial intelligence was founded as an academic discipline in 1956. distance 0.03217001154515051
+#
+# If you used only BGE-M3, you should see the following:
 # text: Alan Turing was the first person to conduct substantial research in AI. distance 0.032786883413791656
 # text: Artificial intelligence was founded as an academic discipline in 1956. distance 0.016129031777381897
 
+# In this simple example the reranker yields the same result as the embedding-based hybrid search, but in more complex
+# scenarios the reranker can provide more accurate results.
+
+# If you used random vectors, the results will be different each time you run the script.
+
 # Drop the collection to clean up the data.
-utility.drop_collection(col_name)
\ No newline at end of file
+utility.drop_collection(col_name)
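
For quick reference, the reranking step introduced by this patch can also be exercised on its own. The sketch below is not part of the patch: it assumes pymilvus[model] is installed, and BGERerankFunction, its call signature, and the .text/.score result fields mirror the usage in the diff above, while the query string and candidate texts are made up for illustration.

from pymilvus.model.reranker import BGERerankFunction

# illustrative query and candidate texts (not taken from a real Milvus search result)
query = "Who started AI research?"
candidate_texts = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
]

# load the default BGE CrossEncoder reranker on CPU, as done in the patch above
bge_rf = BGERerankFunction(device='cpu')

# rerank the candidates against the query and keep the top 2
reranked = bge_rf(query, candidate_texts, top_k=2)
for hit in reranked:
    print(f'text: {hit.text} score {hit.score}')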