Skip to content

Commit

Permalink
Support queries with unsorted sparse bool query vecs (#134)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexklibisz authored Aug 11, 2020
1 parent b965186 commit 0b66b64
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 2 deletions.
2 changes: 2 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
- Support sparse bool query vectors with unsorted true indices.
---
- Added new submodules which can be used without Elasticsearch:
- `com.klibisz.elastiknn:models` contains exact and approximate similarity models, all in Java with minimal dependencies.
- `com.klibisz.elastiknn:lucene` contains the custom Lucene queries and some Lucene-related utilities used by Elastiknn.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.klibisz.elastiknn

import scala.annotation.tailrec
import scala.util.Random

package object api {
Expand Down Expand Up @@ -28,10 +29,21 @@ package object api {

final case class SparseBool(trueIndices: Array[Int], totalIndices: Int) extends Vec with KnownDims {
/** Returns a copy of this vector whose `trueIndices` are sorted in ascending order; `totalIndices` is unchanged. */
def sorted(): SparseBool = copy(trueIndices = trueIndices.sorted)

/**
  * Checks whether `trueIndices` is in non-decreasing order.
  *
  * Equal neighbouring values are accepted; only a strictly smaller successor makes the result false.
  * Empty and single-element arrays are trivially sorted.
  *
  * @return true if no element is smaller than the element preceding it.
  */
def isSorted: Boolean = {
  @tailrec
  def check(i: Int): Boolean =
    // Use >= rather than ==: the scan starts at index 1, so for an empty array the index is
    // already past the end and an equality test would fall through to an out-of-bounds read.
    if (i >= trueIndices.length) true
    else if (trueIndices(i) < trueIndices(i - 1)) false
    else check(i + 1)
  check(1)
}

/**
  * Structural equality: two SparseBool vectors are equal when their `trueIndices` hold the same
  * values in the same order and their `totalIndices` match. Anything that is not a SparseBool is unequal.
  *
  * NOTE(review): no matching `hashCode` override is visible in this part of the file - confirm one
  * exists before using instances as hash-map keys or set members.
  */
override def equals(other: Any): Boolean = other match {
  case that: SparseBool =>
    // Element-wise comparison; a plain == on arrays would compare references.
    val sameIndices = trueIndices.sameElements(that.trueIndices)
    sameIndices && totalIndices == that.totalIndices
  case _ => false
}

/** Abbreviated rendering: up to the first three true indices, then the count of true indices over `totalIndices`. */
override def toString: String = {
  val preview = trueIndices.take(3).mkString(",")
  val count = trueIndices.length
  s"SparseBool($preview,...,$count/$totalIndices)"
}

/** Dimensionality of this vector, which is simply `totalIndices`. */
def dims: Int = totalIndices
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,12 @@ object KnnQueryBuilder {
val map = parser.map()
val json: Json = javaMapEncoder(map)
val query = ElasticsearchCodec.decodeJsonGet[NearestNeighborsQuery](json)
new KnnQueryBuilder(query)
// Account for sparse bool vecs which need to be sorted.
val sortedVec = query.vec match {
case v: Vec.SparseBool if !v.isSorted => v.sorted()
case _ => query.vec
}
new KnnQueryBuilder(query.withVec(sortedVec))
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,4 +145,55 @@ class NearestNeighborsQuerySpec extends AsyncFunSpec with Matchers with Inspecto
}
}

// https://gitter.im/elastiknn/community?at=5f3012df65e829425e70ee31
// Regression test: a query built from the first corpus vector, but with its true indices shuffled,
// is expected to come back with that same vector ("v0") as the top hit with a max score of 1.
describe("Sparse bool vectors with unsorted indices") {
// Fixed seed so the shuffle below is reproducible across runs.
implicit val rng: Random = new Random(0)
val indexPrefix = "test-sbv-unsorted"

val dims = 20000
// NOTE(review): presumably 100 random vectors of `dims` dimensions drawn using the implicit rng - confirm against Vec.SparseBool.randoms.
val corpus = Vec.SparseBool.randoms(dims, 100)

// Same contents as the first corpus vector, but with the true indices in shuffled order.
val queryVec = {
val sorted = corpus.head
val shuffled = rng.shuffle(sorted.trueIndices.toVector).toArray
sorted.copy(shuffled)
}

// Test with multiple mappings/queries.
// Each mapping is paired with the queries to run against an index created with that mapping.
val mappingsAndQueries = Seq(
Mapping.SparseBool(dims) -> Seq(
NearestNeighborsQuery.Exact("vec", Similarity.Jaccard, queryVec),
NearestNeighborsQuery.Exact("vec", Similarity.Hamming, queryVec),
),
Mapping.JaccardLsh(dims, 40, 1) -> Seq(
NearestNeighborsQuery.Exact("vec", Similarity.Jaccard, queryVec),
NearestNeighborsQuery.Exact("vec", Similarity.Hamming, queryVec),
NearestNeighborsQuery.JaccardLsh("vec", 100, queryVec)
),
Mapping.HammingLsh(dims, 40, 2) -> Seq(
NearestNeighborsQuery.Exact("vec", Similarity.Jaccard, queryVec),
NearestNeighborsQuery.Exact("vec", Similarity.Hamming, queryVec),
NearestNeighborsQuery.HammingLsh("vec", 100, queryVec)
)
)

// Registers one test case per (mapping, query) combination.
for {
(mapping, queries) <- mappingsAndQueries
query <- queries
} it(s"finds unsorted sparse bool vecs with mapping [${mapping}] and query [${query}]") {
// A random UUID suffix gives every test case its own index name.
val index = s"$indexPrefix-${UUID.randomUUID.toString}"
for {
_ <- deleteIfExists(index)
_ <- eknn.execute(createIndex(index).shards(1).replicas(1))
_ <- eknn.putMapping(index, "vec", "id", mapping)
// Document ids are "v0", "v1", ... following corpus order, so corpus.head is stored as "v0".
_ <- eknn.index(index, "vec", corpus, "id", corpus.indices.map(i => s"v$i"))
_ <- eknn.execute(refreshIndex(index))
res <- eknn.nearestNeighbors(index, query, 5, "id")
} yield {
// The shuffled query should match its unshuffled source vector exactly.
res.result.maxScore shouldBe 1d
res.result.hits.hits.head.id shouldBe "v0"
}
}
}

}
2 changes: 1 addition & 1 deletion version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.1.0-PRE30
0.1.0-PRE31

0 comments on commit 0b66b64

Please sign in to comment.