From 55fac2a81a80a2e93fef040e44a29a6359c1ec98 Mon Sep 17 00:00:00 2001
From: Alex Klibisz
Date: Fri, 18 Sep 2020 16:02:24 -0700
Subject: [PATCH] Fix NPE when searching after deleting docs (#159)

Based on issue #158. The fix has two parts:

1. Explicitly handle the possibility of `null` returned when calling
   `leafReader.terms(field)`.
2. Use `leafReader.numDocs` instead of `indexReader.numDocs` to determine
   the total number of possible docs.

Added a test for this scenario: index some docs, run a search, delete one
doc, run a search, put the doc back, run a search. This passes locally.
---
 changelog.md                                  |  3 +
 .../elastiknn/lucene/ArrayHitCounter.java     |  2 +-
 .../search/MatchHashesAndScoreQuery.java      | 39 +++++++------
 .../query/MatchHashesAndScoreQuerySuite.scala |  4 +-
 .../query/NearestNeighborsQuerySpec.scala     | 56 ++++++++++++++++++-
 5 files changed, 82 insertions(+), 22 deletions(-)

diff --git a/changelog.md b/changelog.md
index 98101c2bf..4d39cb149 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,3 +1,5 @@
+- Fixed a null pointer exception that occurred when running queries after deleting some vectors.
+---
 - More memory-efficient implementation of Python ElastiknnModel.fit method. Uses an iterator over the vectors instead of a list of the vectors.
 ---
 - Renamed parameter `r` in L2Lsh mapping to `w`, which is more appropriate and common for "width".
@@ -16,6 +18,7 @@
 ---
 - Switched to less-naive implementation of multiprobe L2 LSH. Specifically, uses algorithm 1 from Qin, et al. to generate perturbation sets lazily at query time instead of generating them exhaustively. This does not use the estimated
+  scoring optimization from that paper.
 - Performance optimizations for approximate queries. Specifically using a faster sorting method to sort the hashes before retrieving matching docs from the shard.
diff --git a/elastiknn-lucene/src/main/java/com/klibisz/elastiknn/lucene/ArrayHitCounter.java b/elastiknn-lucene/src/main/java/com/klibisz/elastiknn/lucene/ArrayHitCounter.java
index 64bf6e2de..947302cc9 100644
--- a/elastiknn-lucene/src/main/java/com/klibisz/elastiknn/lucene/ArrayHitCounter.java
+++ b/elastiknn-lucene/src/main/java/com/klibisz/elastiknn/lucene/ArrayHitCounter.java
@@ -40,7 +40,7 @@ public int numHits() {
 
     @Override
     public KthGreatest.Result kthGreatest(int k) {
-        return KthGreatest.kthGreatest(counts, k);
+        return KthGreatest.kthGreatest(counts, Math.min(k, counts.length - 1));
     }
 
     @Override
diff --git a/elastiknn-lucene/src/main/java/org/apache/lucene/search/MatchHashesAndScoreQuery.java b/elastiknn-lucene/src/main/java/org/apache/lucene/search/MatchHashesAndScoreQuery.java
index ba020440a..6db6de9c7 100644
--- a/elastiknn-lucene/src/main/java/org/apache/lucene/search/MatchHashesAndScoreQuery.java
+++ b/elastiknn-lucene/src/main/java/org/apache/lucene/search/MatchHashesAndScoreQuery.java
@@ -11,6 +11,10 @@
 import java.util.Set;
 import java.util.function.Function;
 
+/**
+ * Query that finds docs containing the given hashes (Lucene terms), and then applies a scoring function to the
+ * docs containing the most matching hashes. Largely based on Lucene's TermsInSetQuery.
+ */
 public class MatchHashesAndScoreQuery extends Query {
 
     public interface ScoreFunction {
@@ -22,7 +26,6 @@ public interface ScoreFunction {
     private final int candidates;
     private final IndexReader indexReader;
     private final Function<LeafReaderContext, ScoreFunction> scoreFunctionBuilder;
-    private final int numDocsInSegment;
 
     public MatchHashesAndScoreQuery(final String field,
                                     final HashAndFreq[] hashAndFrequencies,
@@ -38,7 +41,6 @@ public MatchHashesAndScoreQuery(final String field,
         this.candidates = candidates;
         this.indexReader = indexReader;
         this.scoreFunctionBuilder = scoreFunctionBuilder;
-        this.numDocsInSegment = indexReader.numDocs();
     }
 
     @Override
@@ -49,26 +51,29 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo
         /**
         * Builds and returns a map from doc ID to the number of matching hashes in that doc.
         */
-        private HitCounter countHits(LeafReaderContext context) throws IOException {
-            LeafReader reader = context.reader();
+        private HitCounter countHits(LeafReader reader) throws IOException {
             Terms terms = reader.terms(field);
-            TermsEnum termsEnum = terms.iterator();
-            PostingsEnum docs = null;
-            HitCounter counter = new ArrayHitCounter(numDocsInSegment);
-            for (HashAndFreq hac : hashAndFrequencies) {
-                if (termsEnum.seekExact(new BytesRef(hac.getHash()))) {
-                    docs = termsEnum.postings(docs, PostingsEnum.NONE);
-                    for (int i = 0; i < docs.cost(); i++) {
-                        counter.increment(docs.nextDoc(), (short) Math.min(hac.getFreq(), docs.freq()));
+            // terms can be null after deleting docs. https://github.com/alexklibisz/elastiknn/issues/158
+            if (terms == null) {
+                return new ArrayHitCounter(0);
+            } else {
+                TermsEnum termsEnum = terms.iterator();
+                PostingsEnum docs = null;
+                HitCounter counter = new ArrayHitCounter(reader.numDocs());
+                for (HashAndFreq hac : hashAndFrequencies) {
+                    if (termsEnum.seekExact(new BytesRef(hac.getHash()))) {
+                        docs = termsEnum.postings(docs, PostingsEnum.NONE);
+                        for (int i = 0; i < docs.cost(); i++) {
+                            counter.increment(docs.nextDoc(), (short) Math.min(hac.getFreq(), docs.freq()));
+                        }
                    }
                }
+                return counter;
            }
-            return counter;
        }
 
        private DocIdSetIterator buildDocIdSetIterator(HitCounter counter) {
-            if (candidates >= numDocsInSegment) return DocIdSetIterator.all(indexReader.maxDoc());
-            else if (counter.isEmpty()) return DocIdSetIterator.empty();
+            if (counter.isEmpty()) return DocIdSetIterator.empty();
            else {
 
                KthGreatest.Result kgr = counter.kthGreatest(candidates);
@@ -138,9 +143,9 @@ public Explanation explain(LeafReaderContext context, int doc) {
 
            @Override
            public Scorer scorer(LeafReaderContext context) throws IOException {
-                ScoreFunction scoreFunction = scoreFunctionBuilder.apply(context);
-                HitCounter counter = countHits(context);
+                LeafReader reader = context.reader();
+                HitCounter counter = countHits(reader);
                DocIdSetIterator disi = buildDocIdSetIterator(counter);
                return new Scorer(this) {
diff --git a/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/MatchHashesAndScoreQuerySuite.scala b/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/MatchHashesAndScoreQuerySuite.scala
index 88af626b0..ffcd9096e 100644
--- a/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/MatchHashesAndScoreQuerySuite.scala
+++ b/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/MatchHashesAndScoreQuerySuite.scala
@@ -13,7 +13,7 @@ import scala.collection.mutable.ArrayBuffer
 
 class MatchHashesAndScoreQuerySuite extends FunSuite with Matchers with LuceneSupport {
 
-  val ft = new VectorMapper.FieldType("elastiknn_dense_float_vector")
+  val ft: VectorMapper.FieldType = new VectorMapper.FieldType("elastiknn_dense_float_vector")
 
   test("empty harness") {
     indexAndSearch() { (_: IndexWriter) =>
@@ -91,7 +91,7 @@ class MatchHashesAndScoreQuerySuite extends FunSuite with Matchers with LuceneSu
     }
   }
 
-  test("documents with 0 hashes are not candidates") {
+  test("documents with 0 matches are not candidates") {
     indexAndSearch() { w =>
       for (_ <- 0 until 10) {
         val d = new Document()
diff --git a/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/NearestNeighborsQuerySpec.scala b/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/NearestNeighborsQuerySpec.scala
index 1a1e50269..3422f4957 100644
--- a/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/NearestNeighborsQuerySpec.scala
+++ b/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/NearestNeighborsQuerySpec.scala
@@ -18,7 +18,7 @@ class NearestNeighborsQuerySpec extends AsyncFunSpec with Matchers with Inspecto
   // https://github.com/alexklibisz/elastiknn/issues/60
   describe("Vectors in nested fields") {
     implicit val rng: Random = new Random(0)
-    val index = "test-queries-nested-fields"
+    val index = "issue-60"
     val vec = Vec.DenseFloat.random(10)
     val mapping = Mapping.DenseFloat(vec.values.length)
     val nestedFields = Seq(
@@ -70,7 +70,7 @@ class NearestNeighborsQuerySpec extends AsyncFunSpec with Matchers with Inspecto
   // https://github.com/alexklibisz/elastiknn/issues/97
   describe("Query with filter on another field") {
     implicit val rng: Random = new Random(0)
-    val indexPrefix = "test-queries-with-filter"
+    val indexPrefix = "issue-97"
 
     // Generate a corpus of 100 docs. < 10 of them have color blue, the rest have color red.
     val dims = 10
@@ -196,4 +196,56 @@ class NearestNeighborsQuerySpec extends AsyncFunSpec with Matchers with Inspecto
     }
   }
 
+  describe("deleting vectors") {
+
+    // https://github.com/alexklibisz/elastiknn/issues/158
+    it("index, search, delete some, search again") {
+
+      implicit val rng: Random = new Random(0)
+      val index = "issue-158"
+      val dims = 100
+      val corpus = Vec.DenseFloat.randoms(dims, 1000)
+      val ids = corpus.indices.map(i => s"v$i")
+      val queryVec = corpus.head
+      val mapping = Mapping.L2Lsh(dims, 40, 4, 2)
+      val query = NearestNeighborsQuery.L2Lsh("vec", 30, 1, queryVec)
+
+      for {
+        _ <- deleteIfExists(index)
+        _ <- eknn.execute(createIndex(index).shards(1).replicas(0))
+        _ <- eknn.putMapping(index, "vec", "id", mapping)
+        _ <- eknn.index(index, "vec", corpus, "id", ids)
+        _ <- eknn.execute(refreshIndex(index))
+        _ <- eknn.execute(forceMerge(index).maxSegments(1))
+
+        // Check the count before deleting anything.
+        cnt1 <- eknn.execute(count(index))
+        res1 <- eknn.nearestNeighbors(index, query, 10, "id")
+
+        // Delete the query vector from the index, count, and search again.
+        _ <- eknn.execute(deleteById(index, "v0"))
+        _ <- eknn.execute(refreshIndex(index))
+        cnt2 <- eknn.execute(count(index))
+        res2 <- eknn.nearestNeighbors(index, query, 10, "id")
+
+        // Put the vector back, count, and search again.
+        _ <- eknn.index(index, "vec", corpus.take(1), "id", ids.take(1))
+        _ <- eknn.execute(refreshIndex(index))
+        cnt3 <- eknn.execute(count(index))
+        res3 <- eknn.nearestNeighbors(index, query, 10, "id")
+
+      } yield {
+        cnt1.result.count shouldBe 1000L
+        res1.result.maxScore shouldBe 1d
+        res1.result.hits.hits.head.id shouldBe "v0"
+
+        cnt2.result.count shouldBe 999L
+        res2.result.hits.hits.head.id shouldBe res1.result.hits.hits.tail.head.id
+
+        cnt3.result.count shouldBe 1000L
+        res3.result.hits.hits.head.id shouldBe "v0"
+      }
+    }
+  }
 }
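
Note for reviewers: the crux of the NPE fix is guarding the `leafReader.terms(field)` call, which returns `null` when a segment has no postings for the given field (the case hit in issue #158 after deleting docs). Below is a minimal, self-contained sketch of that pattern, assuming a Lucene 8.x-era API. It is illustrative only: `NullSafeTermCounting` and `countMatches` are hypothetical names, and a plain `HashMap` stands in for elastiknn's `HitCounter`/`ArrayHitCounter`.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

final class NullSafeTermCounting {

    // For each doc in the given segment, count how many of the given terms it contains.
    static Map<Integer, Integer> countMatches(LeafReader reader, String field, BytesRef[] queryTerms) throws IOException {
        Map<Integer, Integer> counts = new HashMap<>();
        Terms terms = reader.terms(field);
        // reader.terms(field) returns null when the segment has no postings for
        // this field, e.g. after deleting docs, so return early instead of
        // dereferencing it.
        if (terms == null) return counts;
        TermsEnum termsEnum = terms.iterator();
        PostingsEnum postings = null;
        for (BytesRef term : queryTerms) {
            if (termsEnum.seekExact(term)) {
                postings = termsEnum.postings(postings, PostingsEnum.NONE);
                int docId;
                while ((docId = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    counts.merge(docId, 1, Integer::sum);
                }
            }
        }
        return counts;
    }
}

The patched `countHits` above follows the same shape, and additionally sizes its `ArrayHitCounter` with the segment-local `reader.numDocs()` rather than the top-level `indexReader.numDocs()`.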