Fix NPE when searching after deleting docs (#159)
Based on issue #158. 
So far it seems that the solution is:
1. Explicitly handle the possibility of `null` returned when calling `leafReader.terms(field)`.
2. Use `leafReader.numDocs` instead of `indexReader.numDocs` to determine the total number of possible docs.
Added a test for this scenario: index some docs, run a search, delete one doc, run a search, put the doc back, run a search. This passes locally.
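
For illustration, here is a minimal sketch of the two-part fix against Lucene's LeafReader API. The class, method, and the int[] counter are hypothetical stand-ins for elastiknn's HitCounter; the actual change is in the diff below.

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;

class NullGuardSketch {
    // Hypothetical stand-in for building elastiknn's HitCounter.
    static int[] newCounter(LeafReader leafReader, String field) throws IOException {
        // (1) LeafReader.terms(field) is documented to return null when the
        // field has no terms, which can happen after docs are deleted, so
        // guard before iterating.
        Terms terms = leafReader.terms(field);
        if (terms == null) return new int[0];
        // (2) Size the counter from the leaf's own doc count rather than the
        // top-level IndexReader's, so per-segment deletions are respected.
        return new int[leafReader.numDocs()];
    }
}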
alexklibisz authored Sep 18, 2020
1 parent 4840ad1 commit 55fac2a
Showing 5 changed files with 82 additions and 22 deletions.
changelog.md: 3 additions & 0 deletions
@@ -1,3 +1,5 @@
+- Fixed null pointer exception which was happening when running queries after deleting some vectors.
+---
 - More memory-efficient implementation of Python ElastiknnModel.fit method. Uses an iterator over the vectors instead of a list of the vectors.
 ---
 - Renamed parameter `r` in L2Lsh mapping to `w`, which is more appropriate and common for "width".
@@ -16,6 +18,7 @@
 ---
 - Switched to less-naive implementation of multiprobe L2 LSH. Specifically, uses algorithm 1 from Qin, et. al. to generate
   perturbation sets lazily at query time instead of generating them exhaustively. This does not use the estimated
+
   scoring optimization from that paper.
 - Performance optimizations for approximate queries. Specifically using a faster sorting method to sort the hashes before
   retrieving matching docs from the shard.
@@ -40,7 +40,7 @@ public int numHits() {
 
     @Override
     public KthGreatest.Result kthGreatest(int k) {
-        return KthGreatest.kthGreatest(counts, k);
+        return KthGreatest.kthGreatest(counts, Math.min(k, counts.length - 1));
     }
 
     @Override
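
To see what the new Math.min clamp buys, here is a hypothetical, simplified kth-greatest selection. It is not elastiknn's KthGreatest implementation, just a sketch showing that an unclamped k would index past the end of the counts array.

import java.util.Arrays;

class KthGreatestSketch {
    static short kthGreatest(short[] counts, int k) {
        // Without the clamp, k >= counts.length would be out of bounds below.
        int kk = Math.min(k, counts.length - 1);
        short[] sorted = counts.clone();
        Arrays.sort(sorted);                   // ascending order
        return sorted[sorted.length - 1 - kk]; // kth greatest, 0-indexed
    }

    public static void main(String[] args) {
        short[] counts = {3, 1, 2};
        // k = 5 exceeds the array length; the clamp degrades it to the minimum.
        System.out.println(kthGreatest(counts, 5)); // prints 1
    }
}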
@@ -11,6 +11,10 @@
 import java.util.Set;
 import java.util.function.Function;
 
+/**
+ * Query that finds docs containing the given hashes (Lucene terms), and then applies a scoring function to the
+ * docs containing the most matching hashes. Largely based on Lucene's TermsInSetQuery.
+ */
 public class MatchHashesAndScoreQuery extends Query {
 
     public interface ScoreFunction {
@@ -22,7 +26,6 @@ public interface ScoreFunction {
     private final int candidates;
     private final IndexReader indexReader;
     private final Function<LeafReaderContext, ScoreFunction> scoreFunctionBuilder;
-    private final int numDocsInSegment;
 
     public MatchHashesAndScoreQuery(final String field,
                                     final HashAndFreq[] hashAndFrequencies,
@@ -38,7 +41,6 @@ public MatchHashesAndScoreQuery(final String field,
         this.candidates = candidates;
         this.indexReader = indexReader;
         this.scoreFunctionBuilder = scoreFunctionBuilder;
-        this.numDocsInSegment = indexReader.numDocs();
     }

@Override
@@ -49,26 +51,29 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
 
     /**
      * Builds and returns a map from doc ID to the number of matching hashes in that doc.
      */
-    private HitCounter countHits(LeafReaderContext context) throws IOException {
-        LeafReader reader = context.reader();
+    private HitCounter countHits(LeafReader reader) throws IOException {
         Terms terms = reader.terms(field);
-        TermsEnum termsEnum = terms.iterator();
-        PostingsEnum docs = null;
-        HitCounter counter = new ArrayHitCounter(numDocsInSegment);
-        for (HashAndFreq hac : hashAndFrequencies) {
-            if (termsEnum.seekExact(new BytesRef(hac.getHash()))) {
-                docs = termsEnum.postings(docs, PostingsEnum.NONE);
-                for (int i = 0; i < docs.cost(); i++) {
-                    counter.increment(docs.nextDoc(), (short) Math.min(hac.getFreq(), docs.freq()));
+        // terms seem to be null after deleting docs. https://github.com/alexklibisz/elastiknn/issues/158
+        if (terms == null) {
+            return new ArrayHitCounter(0);
+        } else {
+            TermsEnum termsEnum = terms.iterator();
+            PostingsEnum docs = null;
+            HitCounter counter = new ArrayHitCounter(reader.numDocs());
+            for (HashAndFreq hac : hashAndFrequencies) {
+                if (termsEnum.seekExact(new BytesRef(hac.getHash()))) {
+                    docs = termsEnum.postings(docs, PostingsEnum.NONE);
+                    for (int i = 0; i < docs.cost(); i++) {
+                        counter.increment(docs.nextDoc(), (short) Math.min(hac.getFreq(), docs.freq()));
+                    }
                 }
             }
+            return counter;
         }
-        return counter;
     }
 
     private DocIdSetIterator buildDocIdSetIterator(HitCounter counter) {
-        if (candidates >= numDocsInSegment) return DocIdSetIterator.all(indexReader.maxDoc());
-        else if (counter.isEmpty()) return DocIdSetIterator.empty();
+        if (counter.isEmpty()) return DocIdSetIterator.empty();
         else {
 
             KthGreatest.Result kgr = counter.kthGreatest(candidates);
@@ -138,9 +143,9 @@ public Explanation explain(LeafReaderContext context, int doc) {
 
     @Override
     public Scorer scorer(LeafReaderContext context) throws IOException {
-
         ScoreFunction scoreFunction = scoreFunctionBuilder.apply(context);
-        HitCounter counter = countHits(context);
+        LeafReader reader = context.reader();
+        HitCounter counter = countHits(reader);
         DocIdSetIterator disi = buildDocIdSetIterator(counter);
 
         return new Scorer(this) {
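
The switch from a cached indexReader.numDocs() to the per-leaf reader.numDocs() matters because deletions make a leaf's live-doc count diverge from its doc-ID space. Below is a self-contained check of that behavior, assuming Lucene 8.x APIs; the class name and field values are illustrative.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class DeletedDocsDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = new ByteBuffersDirectory();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
        for (int i = 0; i < 3; i++) {
            Document d = new Document();
            d.add(new StringField("id", "v" + i, Field.Store.NO));
            w.addDocument(d);
        }
        w.commit();
        // Delete one doc; the segment keeps its doc-ID space and marks v0 dead.
        w.deleteDocuments(new Term("id", "v0"));
        w.commit();
        try (IndexReader reader = DirectoryReader.open(dir)) {
            for (LeafReaderContext ctx : reader.leaves()) {
                LeafReader leaf = ctx.reader();
                // Typically prints "2 / 3": two live docs, doc-ID space still 3.
                System.out.println(leaf.numDocs() + " / " + leaf.maxDoc());
            }
        }
        w.close();
    }
}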
@@ -13,7 +13,7 @@ import scala.collection.mutable.ArrayBuffer
 
 class MatchHashesAndScoreQuerySuite extends FunSuite with Matchers with LuceneSupport {
 
-  val ft = new VectorMapper.FieldType("elastiknn_dense_float_vector")
+  val ft: VectorMapper.FieldType = new VectorMapper.FieldType("elastiknn_dense_float_vector")
 
   test("empty harness") {
     indexAndSearch() { (_: IndexWriter) =>
@@ -91,7 +91,7 @@ class MatchHashesAndScoreQuerySuite extends FunSuite with Matchers with LuceneSupport {
}
}

test("documents with 0 hashes are not candidates") {
test("documents with 0 matches are not candidates") {
indexAndSearch() { w =>
for (_ <- 0 until 10) {
val d = new Document()
@@ -18,7 +18,7 @@ class NearestNeighborsQuerySpec extends AsyncFunSpec with Matchers with Inspectors
// https://github.com/alexklibisz/elastiknn/issues/60
describe("Vectors in nested fields") {
implicit val rng: Random = new Random(0)
val index = "test-queries-nested-fields"
val index = "issue-60"
val vec = Vec.DenseFloat.random(10)
val mapping = Mapping.DenseFloat(vec.values.length)
val nestedFields = Seq(
@@ -70,7 +70,7 @@ class NearestNeighborsQuerySpec extends AsyncFunSpec with Matchers with Inspectors
// https://github.com/alexklibisz/elastiknn/issues/97
describe("Query with filter on another field") {
implicit val rng: Random = new Random(0)
val indexPrefix = "test-queries-with-filter"
val indexPrefix = "issue-97"

// Generate a corpus of 100 docs. < 10 of them have color blue, the rest have color red.
val dims = 10
@@ -196,4 +196,56 @@
 }
 }
 
+  describe("deleting vectors") {
+
+    // https://github.com/alexklibisz/elastiknn/issues/158
+    it("index, search, delete some, search again") {
+
+      implicit val rng: Random = new Random(0)
+      val index = "issue-158"
+      val dims = 100
+      val corpus = Vec.DenseFloat.randoms(dims, 1000)
+      val ids = corpus.indices.map(i => s"v$i")
+      val queryVec = corpus.head
+      val mapping = Mapping.L2Lsh(dims, 40, 4, 2)
+      val query = NearestNeighborsQuery.L2Lsh("vec", 30, 1, queryVec)
+
+      for {
+        _ <- deleteIfExists(index)
+        _ <- eknn.execute(createIndex(index).shards(1).replicas(0))
+        _ <- eknn.putMapping(index, "vec", "id", mapping)
+        _ <- eknn.index(index, "vec", corpus, "id", ids)
+        _ <- eknn.execute(refreshIndex(index))
+        _ <- eknn.execute(forceMerge(index).maxSegments(1))
+
+        // Check the count before deleting anything.
+        cnt1 <- eknn.execute(count(index))
+        res1 <- eknn.nearestNeighbors(index, query, 10, "id")
+
+        // Delete the query vector from the index, count, and search again.
+        _ <- eknn.execute(deleteById(index, "v0"))
+        _ <- eknn.execute(refreshIndex(index))
+        cnt2 <- eknn.execute(count(index))
+        res2 <- eknn.nearestNeighbors(index, query, 10, "id")
+
+        // Put the vector back, count, and search again.
+        _ <- eknn.index(index, "vec", corpus.take(1), "id", ids.take(1))
+        _ <- eknn.execute(refreshIndex(index))
+        cnt3 <- eknn.execute(count(index))
+        res3 <- eknn.nearestNeighbors(index, query, 10, "id")
+
+      } yield {
+        cnt1.result.count shouldBe 1000L
+        res1.result.maxScore shouldBe 1d
+        res1.result.hits.hits.head.id shouldBe "v0"
+
+        cnt2.result.count shouldBe 999L
+        res2.result.hits.hits.head.id shouldBe res1.result.hits.hits.tail.head.id
+
+        cnt3.result.count shouldBe 1000L
+        res3.result.hits.hits.head.id shouldBe "v0"
+      }
+    }
+  }
 
 }
