From 0b66b64e951ae429a5f85debb7e0ada948e65be5 Mon Sep 17 00:00:00 2001
From: Alex Klibisz
Date: Mon, 10 Aug 2020 22:02:34 -0400
Subject: [PATCH] Support queries with unsorted sparse bool query vecs (#134)

---
 changelog.md                                |  2 +
 .../com/klibisz/elastiknn/api/package.scala | 12 +++++
 .../elastiknn/query/KnnQueryBuilder.scala   |  7 ++-
 .../query/NearestNeighborsQuerySpec.scala   | 51 +++++++++++++++++++
 version                                     |  2 +-
 5 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/changelog.md b/changelog.md
index 2530f6577..67dee6862 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,3 +1,5 @@
+- Support sparse bool query vectors with unsorted true indices.
+---
 - Added new submodules which can be used without Elasticsearch:
   - `com.klibisz.elastiknn:models` contains exact and approximate similarity models, all in Java with minimal dependencies.
   - `com.klibisz.elastiknn:lucene` contains the custom Lucene queries and some Lucene-related utilities used by Elastiknn.
diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/package.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/package.scala
index dab5f800d..9ea224031 100644
--- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/package.scala
+++ b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/package.scala
@@ -1,5 +1,6 @@
 package com.klibisz.elastiknn
 
+import scala.annotation.tailrec
 import scala.util.Random
 
 package object api {
@@ -28,10 +29,21 @@ package object api {
 
     final case class SparseBool(trueIndices: Array[Int], totalIndices: Int) extends Vec with KnownDims {
       def sorted(): SparseBool = copy(trueIndices.sorted)
+
+      def isSorted: Boolean = {
+        @tailrec
+        def check(i: Int): Boolean =
+          if (i >= trueIndices.length) true // >= (not ==) so an empty array, whose length 0 is below the start index 1, is sorted.
+          else if (trueIndices(i) < trueIndices(i - 1)) false
+          else check(i + 1)
+        check(1)
+      }
+
       override def equals(other: Any): Boolean = other match {
         case other: SparseBool => trueIndices.deep == other.trueIndices.deep && totalIndices == other.totalIndices
         case _ => false
       }
+
       override def toString: String = s"SparseBool(${trueIndices.take(3).mkString(",")},...,${trueIndices.length}/$totalIndices)"
 
       def dims: Int = totalIndices
diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/KnnQueryBuilder.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/KnnQueryBuilder.scala
index 5bed79168..9ecb4fe42 100644
--- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/KnnQueryBuilder.scala
+++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/KnnQueryBuilder.scala
@@ -35,7 +35,12 @@ object KnnQueryBuilder {
       val map = parser.map()
       val json: Json = javaMapEncoder(map)
       val query = ElasticsearchCodec.decodeJsonGet[NearestNeighborsQuery](json)
-      new KnnQueryBuilder(query)
+      // Account for sparse bool vecs which need to be sorted.
+      val sortedVec = query.vec match {
+        case v: Vec.SparseBool if !v.isSorted => v.sorted()
+        case _ => query.vec
+      }
+      new KnnQueryBuilder(query.withVec(sortedVec))
     }
   }
 
diff --git a/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/NearestNeighborsQuerySpec.scala b/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/NearestNeighborsQuerySpec.scala
index 03b6cdd50..1a1e50269 100644
--- a/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/NearestNeighborsQuerySpec.scala
+++ b/elastiknn-testing/src/test/scala/com/klibisz/elastiknn/query/NearestNeighborsQuerySpec.scala
@@ -145,4 +145,55 @@ class NearestNeighborsQuerySpec extends AsyncFunSpec with Matchers with Inspecto
     }
   }
 
+  // https://gitter.im/elastiknn/community?at=5f3012df65e829425e70ee31
+  describe("Sparse bool vectors with unsorted indices") {
+    implicit val rng: Random = new Random(0)
+    val indexPrefix = "test-sbv-unsorted"
+
+    val dims = 20000
+    val corpus = Vec.SparseBool.randoms(dims, 100)
+
+    val queryVec = {
+      val sorted = corpus.head
+      val shuffled = rng.shuffle(sorted.trueIndices.toVector).toArray
+      sorted.copy(shuffled)
+    }
+
+    // Test with multiple mappings/queries.
+    val mappingsAndQueries = Seq(
+      Mapping.SparseBool(dims) -> Seq(
+        NearestNeighborsQuery.Exact("vec", Similarity.Jaccard, queryVec),
+        NearestNeighborsQuery.Exact("vec", Similarity.Hamming, queryVec),
+      ),
+      Mapping.JaccardLsh(dims, 40, 1) -> Seq(
+        NearestNeighborsQuery.Exact("vec", Similarity.Jaccard, queryVec),
+        NearestNeighborsQuery.Exact("vec", Similarity.Hamming, queryVec),
+        NearestNeighborsQuery.JaccardLsh("vec", 100, queryVec)
+      ),
+      Mapping.HammingLsh(dims, 40, 2) -> Seq(
+        NearestNeighborsQuery.Exact("vec", Similarity.Jaccard, queryVec),
+        NearestNeighborsQuery.Exact("vec", Similarity.Hamming, queryVec),
+        NearestNeighborsQuery.HammingLsh("vec", 100, queryVec)
+      )
+    )
+
+    for {
+      (mapping, queries) <- mappingsAndQueries
+      query <- queries
+    } it(s"finds unsorted sparse bool vecs with mapping [${mapping}] and query [${query}]") {
+      val index = s"$indexPrefix-${UUID.randomUUID.toString}"
+      for {
+        _ <- deleteIfExists(index)
+        _ <- eknn.execute(createIndex(index).shards(1).replicas(1))
+        _ <- eknn.putMapping(index, "vec", "id", mapping)
+        _ <- eknn.index(index, "vec", corpus, "id", corpus.indices.map(i => s"v$i"))
+        _ <- eknn.execute(refreshIndex(index))
+        res <- eknn.nearestNeighbors(index, query, 5, "id")
+      } yield {
+        res.result.maxScore shouldBe 1d
+        res.result.hits.hits.head.id shouldBe "v0"
+      }
+    }
+  }
+
 }
diff --git a/version b/version
index e7270f946..da92a9905 100644
--- a/version
+++ b/version
@@ -1 +1 @@
-0.1.0-PRE30
+0.1.0-PRE31