Fix NPE when searching after deleting docs (#159)
Based on issue #158. 
So far it seems that the solution is:
1. Explicitly handle the possibility of `null` returned when calling `leafReader.terms(field)`.
2. Use `leafReader.numDocs` instead of `indexReader.numDocs` to determine the total number of possible docs.
Added a test for this scenario: index some docs, run a search, delete one doc, run a search, put the doc back, run a search. This passes locally.
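
For illustration, here is a minimal sketch of the two-part fix against Lucene's LeafReader API. The class, method, and the int[] counter are hypothetical stand-ins for elastiknn's HitCounter; the actual change is in the diff below.

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;

class NullGuardSketch {
    // Hypothetical stand-in for building elastiknn's HitCounter.
    static int[] newCounter(LeafReader leafReader, String field) throws IOException {
        // (1) LeafReader.terms(field) is documented to return null when the
        // field has no terms, which can happen after docs are deleted, so
        // guard before iterating.
        Terms terms = leafReader.terms(field);
        if (terms == null) return new int[0];
        // (2) Size the counter from the leaf's own doc count rather than the
        // top-level IndexReader's, so per-segment deletions are respected.
        return new int[leafReader.numDocs()];
    }
}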
alexklibisz authored Sep 18, 2020
1 parent 4840ad1 commit 55fac2a
Showing 5 changed files with 82 additions and 22 deletions.
changelog.md: 3 additions & 0 deletions
@@ -1,3 +1,5 @@
+- Fixed null pointer exception which was happening when running queries after deleting some vectors.
+---
 - More memory-efficient implementation of Python ElastiknnModel.fit method. Uses an iterator over the vectors instead of a list of the vectors.
 ---
 - Renamed parameter `r` in L2Lsh mapping to `w`, which is more appropriate and common for "width".
@@ -16,6 +18,7 @@
 ---
 - Switched to less-naive implementation of multiprobe L2 LSH. Specifically, uses algorithm 1 from Qin, et. al. to generate
   perturbation sets lazily at query time instead of generating them exhaustively. This does not use the estimated
+
   scoring optimization from that paper.
 - Performance optimizations for approximate queries. Specifically using a faster sorting method to sort the hashes before
   retrieving matching docs from the shard.
@@ -40,7 +40,7 @@ public int numHits() {
 
     @Override
     public KthGreatest.Result kthGreatest(int k) {
-        return KthGreatest.kthGreatest(counts, k);
+        return KthGreatest.kthGreatest(counts, Math.min(k, counts.length - 1));
     }
 
     @Override
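
To see what the new Math.min clamp buys, here is a hypothetical, simplified kth-greatest selection. It is not elastiknn's KthGreatest implementation, just a sketch showing that an unclamped k would index past the end of the counts array.

import java.util.Arrays;

class KthGreatestSketch {
    static short kthGreatest(short[] counts, int k) {
        // Without the clamp, k >= counts.length would be out of bounds below.
        int kk = Math.min(k, counts.length - 1);
        short[] sorted = counts.clone();
        Arrays.sort(sorted);                   // ascending order
        return sorted[sorted.length - 1 - kk]; // kth greatest, 0-indexed
    }

    public static void main(String[] args) {
        short[] counts = {3, 1, 2};
        // k = 5 exceeds the array length; the clamp degrades it to the minimum.
        System.out.println(kthGreatest(counts, 5)); // prints 1
    }
}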
@@ -11,6 +11,10 @@
 import java.util.Set;
 import java.util.function.Function;
 
+/**
+ * Query that finds docs containing the given hashes (Lucene terms), and then applies a scoring function to the
+ * docs containing the most matching hashes. Largely based on Lucene's TermsInSetQuery.
+ */
 public class MatchHashesAndScoreQuery extends Query {
 
     public interface ScoreFunction {
@@ -22,7 +26,6 @@ public interface ScoreFunction {
     private final int candidates;
     private final IndexReader indexReader;
     private final Function<LeafReaderContext, ScoreFunction> scoreFunctionBuilder;
-    private final int numDocsInSegment;
 
     public MatchHashesAndScoreQuery(final String field,
                                     final HashAndFreq[] hashAndFrequencies,
@@ -38,7 +41,6 @@ public MatchHashesAndScoreQuery(final String field,
         this.candidates = candidates;
         this.indexReader = indexReader;
         this.scoreFunctionBuilder = scoreFunctionBuilder;
-        this.numDocsInSegment = indexReader.numDocs();
     }

@Override
@@ -49,26 +51,29 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
 
     /**
      * Builds and returns a map from doc ID to the number of matching hashes in that doc.
      */
-    private HitCounter countHits(LeafReaderContext context) throws IOException {
-        LeafReader reader = context.reader();
+    private HitCounter countHits(LeafReader reader) throws IOException {
         Terms terms = reader.terms(field);
-        TermsEnum termsEnum = terms.iterator();
-        PostingsEnum docs = null;
-        HitCounter counter = new ArrayHitCounter(numDocsInSegment);
-        for (HashAndFreq hac : hashAndFrequencies) {
-            if (termsEnum.seekExact(new BytesRef(hac.getHash()))) {
-                docs = termsEnum.postings(docs, PostingsEnum.NONE);
-                for (int i = 0; i < docs.cost(); i++) {
-                    counter.increment(docs.nextDoc(), (short) Math.min(hac.getFreq(), docs.freq()));
+        // terms seem to be null after deleting docs. https://github.com/alexklibisz/elastiknn/issues/158
+        if (terms == null) {
+            return new ArrayHitCounter(0);
+        } else {
+            TermsEnum termsEnum = terms.iterator();
+            PostingsEnum docs = null;
+            HitCounter counter = new ArrayHitCounter(reader.numDocs());
+            for (HashAndFreq hac : hashAndFrequencies) {
+                if (termsEnum.seekExact(new BytesRef(hac.getHash()))) {
+                    docs = termsEnum.postings(docs, PostingsEnum.NONE);
+                    for (int i = 0; i < docs.cost(); i++) {
+                        counter.increment(docs.nextDoc(), (short) Math.min(hac.getFreq(), docs.freq()));
+                    }
                 }
             }
+            return counter;
         }
-        return counter;
     }
 
     private DocIdSetIterator buildDocIdSetIterator(HitCounter counter) {
-        if (candidates >= numDocsInSegment) return DocIdSetIterator.all(indexReader.maxDoc());
-        else if (counter.isEmpty()) return DocIdSetIterator.empty();
+        if (counter.isEmpty()) return DocIdSetIterator.empty();
         else {
 
             KthGreatest.Result kgr = counter.kthGreatest(candidates);
@@ -138,9 +143,9 @@ public Explanation explain(LeafReaderContext context, int doc) {
 
     @Override
     public Scorer scorer(LeafReaderContext context) throws IOException {
-
         ScoreFunction scoreFunction = scoreFunctionBuilder.apply(context);
-        HitCounter counter = countHits(context);
+        LeafReader reader = context.reader();
+        HitCounter counter = countHits(reader);
         DocIdSetIterator disi = buildDocIdSetIterator(counter);
 
         return new Scorer(this) {
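
The switch from a cached indexReader.numDocs() to the per-leaf reader.numDocs() matters because deletions make a leaf's live-doc count diverge from its doc-ID space. Below is a self-contained check of that behavior, assuming Lucene 8.x APIs; the class name and field values are illustrative.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class DeletedDocsDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = new ByteBuffersDirectory();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
        for (int i = 0; i < 3; i++) {
            Document d = new Document();
            d.add(new StringField("id", "v" + i, Field.Store.NO));
            w.addDocument(d);
        }
        w.commit();
        // Delete one doc; the segment keeps its doc-ID space and marks v0 dead.
        w.deleteDocuments(new Term("id", "v0"));
        w.commit();
        try (IndexReader reader = DirectoryReader.open(dir)) {
            for (LeafReaderContext ctx : reader.leaves()) {
                LeafReader leaf = ctx.reader();
                // Typically prints "2 / 3": two live docs, doc-ID space still 3.
                System.out.println(leaf.numDocs() + " / " + leaf.maxDoc());
            }
        }
        w.close();
    }
}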
@@ -13,7 +13,7 @@ import scala.collection.mutable.ArrayBuffer
 
 class MatchHashesAndScoreQuerySuite extends FunSuite with Matchers with LuceneSupport {
 
-  val ft = new VectorMapper.FieldType("elastiknn_dense_float_vector")
+  val ft: VectorMapper.FieldType = new VectorMapper.FieldType("elastiknn_dense_float_vector")
 
   test("empty harness") {
     indexAndSearch() { (_: IndexWriter) =>
@@ -91,7 +91,7 @@ class MatchHashesAndScoreQuerySuite extends FunSuite with Matchers with LuceneSupport {
}
}

test("documents with 0 hashes are not candidates") {
test("documents with 0 matches are not candidates") {
indexAndSearch() { w =>
for (_ <- 0 until 10) {
val d = new Document()
@@ -18,7 +18,7 @@ class NearestNeighborsQuerySpec extends AsyncFunSpec with Matchers with Inspectors
// https://github.com/alexklibisz/elastiknn/issues/60
describe("Vectors in nested fields") {
implicit val rng: Random = new Random(0)
val index = "test-queries-nested-fields"
val index = "issue-60"
val vec = Vec.DenseFloat.random(10)
val mapping = Mapping.DenseFloat(vec.values.length)
val nestedFields = Seq(
@@ -70,7 +70,7 @@ class NearestNeighborsQuerySpec extends AsyncFunSpec with Matchers with Inspectors
// https://github.com/alexklibisz/elastiknn/issues/97
describe("Query with filter on another field") {
implicit val rng: Random = new Random(0)
val indexPrefix = "test-queries-with-filter"
val indexPrefix = "issue-97"

// Generate a corpus of 100 docs. < 10 of them have color blue, the rest have color red.
val dims = 10
@@ -196,4 +196,56 @@
 }
 }
 
+  describe("deleting vectors") {
+
+    // https://github.com/alexklibisz/elastiknn/issues/158
+    it("index, search, delete some, search again") {
+
+      implicit val rng: Random = new Random(0)
+      val index = "issue-158"
+      val dims = 100
+      val corpus = Vec.DenseFloat.randoms(dims, 1000)
+      val ids = corpus.indices.map(i => s"v$i")
+      val queryVec = corpus.head
+      val mapping = Mapping.L2Lsh(dims, 40, 4, 2)
+      val query = NearestNeighborsQuery.L2Lsh("vec", 30, 1, queryVec)
+
+      for {
+        _ <- deleteIfExists(index)
+        _ <- eknn.execute(createIndex(index).shards(1).replicas(0))
+        _ <- eknn.putMapping(index, "vec", "id", mapping)
+        _ <- eknn.index(index, "vec", corpus, "id", ids)
+        _ <- eknn.execute(refreshIndex(index))
+        _ <- eknn.execute(forceMerge(index).maxSegments(1))
+
+        // Check the count before deleting anything.
+        cnt1 <- eknn.execute(count(index))
+        res1 <- eknn.nearestNeighbors(index, query, 10, "id")
+
+        // Delete the query vector from the index, count, and search again.
+        _ <- eknn.execute(deleteById(index, "v0"))
+        _ <- eknn.execute(refreshIndex(index))
+        cnt2 <- eknn.execute(count(index))
+        res2 <- eknn.nearestNeighbors(index, query, 10, "id")
+
+        // Put the vector back, count, and search again.
+        _ <- eknn.index(index, "vec", corpus.take(1), "id", ids.take(1))
+        _ <- eknn.execute(refreshIndex(index))
+        cnt3 <- eknn.execute(count(index))
+        res3 <- eknn.nearestNeighbors(index, query, 10, "id")
+
+      } yield {
+        cnt1.result.count shouldBe 1000L
+        res1.result.maxScore shouldBe 1d
+        res1.result.hits.hits.head.id shouldBe "v0"
+
+        cnt2.result.count shouldBe 999L
+        res2.result.hits.hits.head.id shouldBe res1.result.hits.hits.tail.head.id
+
+        cnt3.result.count shouldBe 1000L
+        res3.result.hits.hits.head.id shouldBe "v0"
+      }
+    }
+  }
 
 }
