
Commit

Fix incorrect counter configuration that was causing array out of bounds errors (#163)

Related to #158

Make the hit counter allocate reader.maxDoc() entries instead of reader.numDocs(), because reader.numDocs() decreases as documents are deleted, even though the largest docId stays the same.
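For context, here is a minimal Java sketch, not part of the commit, of why sizing the counter by numDocs() can go out of bounds. It uses only the standard Lucene IndexReader API; the class and method names are hypothetical:

import org.apache.lucene.index.IndexReader;

final class CounterSizingSketch {
  // Suppose an index held 3 docs and one was deleted: numDocs() == 2,
  // but docIDs still range over 0 .. maxDoc() - 1 == 0 .. 2.
  static short[] unsafeCounts(IndexReader reader) {
    return new short[reader.numDocs()]; // length 2: counts[2] throws ArrayIndexOutOfBoundsException
  }

  static short[] safeCounts(IndexReader reader) {
    return new short[reader.maxDoc()]; // length 3: covers every possible docID
  }
}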

Also made the test a bit more robust.
alexklibisz authored Sep 24, 2020
1 parent 1968edb commit 20a94b7
Showing 2 changed files with 51 additions and 37 deletions.
@@ -59,12 +59,14 @@ private HitCounter countHits(LeafReader reader) throws IOException {
     } else {
       TermsEnum termsEnum = terms.iterator();
       PostingsEnum docs = null;
-      HitCounter counter = new ArrayHitCounter(reader.numDocs());
+      HitCounter counter = new ArrayHitCounter(reader.maxDoc());
+      // TODO: Is this the right place to use the live docs bitset to check for deleted docs?
+      // Bits liveDocs = reader.getLiveDocs();
       for (HashAndFreq hac : hashAndFrequencies) {
         if (termsEnum.seekExact(new BytesRef(hac.getHash()))) {
           docs = termsEnum.postings(docs, PostingsEnum.NONE);
-          for (int i = 0; i < docs.cost(); i++) {
-            counter.increment(docs.nextDoc(), (short) Math.min(hac.getFreq(), docs.freq()));
+          while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+            counter.increment(docs.docID(), (short) Math.min(hac.getFreq(), docs.freq()));
           }
         }
       }
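The loop change matters too: DocIdSetIterator.cost() is only an estimate of the number of matching documents, so bounding the loop by cost() and calling nextDoc() unconditionally can step past the final posting. Below is a minimal sketch, with a hypothetical sumFreqs helper, of the standard iterate-until-NO_MORE_DOCS pattern the fix adopts:

import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;

final class PostingsLoopSketch {
  // Advance with nextDoc() and stop on NO_MORE_DOCS; docID() and freq()
  // are only valid while the iterator is positioned on a real document.
  static long sumFreqs(PostingsEnum docs) throws IOException {
    long total = 0;
    while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      total += docs.freq();
    }
    return total;
  }
}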
@@ -8,7 +8,7 @@ import com.klibisz.elastiknn.testing.{ElasticAsyncClient, SilentMatchers}
 import com.sksamuel.elastic4s.ElasticDsl._
 import com.sksamuel.elastic4s.XContentFactory
 import com.sksamuel.elastic4s.requests.common.RefreshPolicy
-import org.scalatest.{AsyncFunSpec, Inspectors, Matchers}
+import org.scalatest.{AsyncFunSpec, Inspectors, Matchers, _}

 import scala.concurrent.Future
 import scala.util.Random
@@ -199,52 +199,64 @@ class NearestNeighborsQuerySpec extends AsyncFunSpec with Matchers with Inspecto
   describe("deleting vectors") {

     // https://github.com/alexklibisz/elastiknn/issues/158
-    it("index, search, delete some, search again") {
+    it("index, search, delete some, search, replace them, search again") {

       implicit val rng: Random = new Random(0)
-      val index = "issue-158"
-      val dims = 100
+      val (index, vecField, idField, dims) = ("issue-158", "vec", "id", 100)
       val corpus = Vec.DenseFloat.randoms(dims, 1000)
       val ids = corpus.indices.map(i => s"v$i")
-      val queryVec = corpus.head
       val mapping = Mapping.L2Lsh(dims, 40, 4, 2)
-      val query = NearestNeighborsQuery.L2Lsh("vec", 30, 1, queryVec)
+      val query = NearestNeighborsQuery.L2Lsh(vecField, 30, 1)

-      for {
-        _ <- deleteIfExists(index)
-        _ <- eknn.execute(createIndex(index).shards(1).replicas(0))
-        _ <- eknn.putMapping(index, "vec", "id", mapping)
-        _ <- eknn.index(index, "vec", corpus, "id", ids)
-        _ <- eknn.execute(refreshIndex(index))
-        _ <- eknn.execute(forceMerge(index).maxSegments(1))
+      def searchDeleteSearchReplace(): Future[Assertion] = {
+        val randomIdx = rng.nextInt(corpus.length)
+        val (vec, id) = (corpus(randomIdx), ids(randomIdx))
+        for {
+          c1 <- eknn.execute(count(index))
+          _ = c1.result.count shouldBe corpus.length

-        // Check the count before deleting anything.
-        cnt1 <- eknn.execute(count(index))
-        res1 <- eknn.nearestNeighbors(index, query, 10, "id")
+          // Search for the randomly-picked vector. It should be its own best match.
+          s1 <- eknn.nearestNeighbors(index, query.withVec(vec), 10, idField)
+          _ <- s1.result.hits.hits.headOption.map(_.id) shouldBe Some(id)

-        // Delete the query vector from the index, count, and search again.
-        _ <- eknn.execute(deleteById(index, "v0"))
-        _ <- eknn.execute(refreshIndex(index))
-        cnt2 <- eknn.execute(count(index))
-        res2 <- eknn.nearestNeighbors(index, query, 10, "id")
+          // Delete the top five vectors.
+          deletedIdxs = s1.result.hits.hits.take(5).map(_.id.drop(1).toInt).toSeq
+          _ <- Future.traverse(deletedIdxs.map(ids.apply).map(deleteById(index, _)))(eknn.execute(_))
+          _ <- eknn.execute(refreshIndex(index))
+          c2 <- eknn.execute(count(index))
+          _ = c2.result.count shouldBe (corpus.length - deletedIdxs.length)

-        // Put the vector back, count, search again.
-        _ <- eknn.index(index, "vec", corpus.take(1), "id", ids.take(1))
-        _ <- eknn.execute(refreshIndex(index))
-        cnt3 <- eknn.execute(count(index))
-        res3 <- eknn.nearestNeighbors(index, query, 10, "id")
+          // Search again for the original vector. The previous last five results should be the new top five.
+          s2 <- eknn.nearestNeighbors(index, query.withVec(vec), 10, idField)
+          _ = s2.result.hits.hits.map(_.id).take(5).toSeq shouldBe s1.result.hits.hits.map(_.id).takeRight(5).toSeq

-      } yield {
-        cnt1.result.count shouldBe 1000L
-        res1.result.maxScore shouldBe 1d
-        res1.result.hits.hits.head.id shouldBe "v0"
+          // Put the deleted vectors back.
+          _ <- eknn.index(index, vecField, deletedIdxs.map(corpus.apply), idField, deletedIdxs.map(ids.apply))
+          _ <- eknn.execute(refreshIndex(index))
+          c3 <- eknn.execute(count(index))
+          _ = c3.result.count shouldBe corpus.length

-        cnt2.result.count shouldBe 999L
-        res2.result.hits.hits.head.id shouldBe res1.result.hits.hits.tail.head.id
+          // Search again for the same original vector.
+          s3 <- eknn.nearestNeighbors(index, query.withVec(vec), 10, idField)
+          _ = s3.result.hits.hits.map(_.id).toSet shouldBe s1.result.hits.hits.map(_.id).toSet

-        cnt3.result.count shouldBe 1000L
-        res3.result.hits.hits.head.id shouldBe "v0"
+        } yield Assertions.succeed
       }
+
+      for {
+        _ <- deleteIfExists(index)
+        _ <- eknn.execute(createIndex(index).shards(1).replicas(0))
+        _ <- eknn.putMapping(index, vecField, idField, mapping)
+        _ <- eknn.index(index, vecField, corpus, idField, ids)
+        _ <- eknn.execute(refreshIndex(index))
+
+        _ <- searchDeleteSearchReplace()
+        _ <- searchDeleteSearchReplace()
+        _ <- searchDeleteSearchReplace()
+        _ <- searchDeleteSearchReplace()
+        _ <- searchDeleteSearchReplace()
+
+      } yield Assertions.succeed
     }
   }

