
Disable norms on fields used to store LSH hashes (#78)
I set up a benchmark for the SparseIndexed mapping/query and noticed a lot of time spent in the Lucene80NormsProducer class. Setting .omitNorms(true) in a few places seems to remove that class from the sampler's output. For the ExecuteLocal benchmark, this cuts the time for 1000 serial queries from ~100 seconds to ~85 seconds, and the time for 1000 parallel queries from 74 seconds to 62 seconds.
alexklibisz authored Jun 17, 2020
1 parent adf8262 commit dfb81fd
Showing 4 changed files with 47 additions and 51 deletions.
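For context, here is a minimal, self-contained sketch (in Scala, against the plain Lucene API) of the pattern this commit applies; the field name "vec" and the hash bytes are placeholders. Hash terms are indexed as single untokenized tokens, only document IDs are recorded, and norms are omitted, so scoring never reads them back through Lucene80NormsProducer:

import org.apache.lucene.document.{Document, Field, FieldType}
import org.apache.lucene.index.IndexOptions

// Field type for opaque hash terms: doc IDs only, no tokenization, no norms.
val hashFieldType: FieldType = new FieldType
hashFieldType.setIndexOptions(IndexOptions.DOCS) // skip term frequencies and positions
hashFieldType.setTokenized(false)                // each hash is one opaque token
hashFieldType.setOmitNorms(true)                 // the change this commit makes
hashFieldType.freeze()

val doc = new Document
doc.add(new Field("vec", Array[Byte](1, 2, 3), hashFieldType)) // placeholder serialized hash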
@@ -102,6 +102,7 @@ object Execute extends App {
} yield ()
}
_ <- eknnClient.execute(refreshIndex(trainIndexName, testIndexName))
+ _ <- eknnClient.execute(forceMerge(trainIndexName, testIndexName))
} yield ()
}

@@ -159,6 +160,7 @@ object Execute extends App {

// Run searches on the holdout vectors.
(singleResults, totalDuration) <- search(testVecs)
+ _ <- log.info(s"Completed ${singleResults.length} searches in ${totalDuration / 1000f} seconds")

} yield BenchmarkResult(dataset, eknnMapping, eknnQuery, k, parallelism, totalDuration, singleResults)
}
@@ -254,17 +256,14 @@ object ExecuteLocal extends App {

override def run(args: List[String]): URIO[Console, ExitCode] = {
val s3Client = S3Utils.minioClient()
- val dataset = Dataset.RandomDenseFloat(1024, 50000)
+ val dataset = Dataset.RandomSparseBool(1000, 10000)
val exp = Experiment(
dataset,
- Mapping.L2Lsh(dataset.dims, 200, 2, 1),
- NearestNeighborsQuery.L2Lsh("vec", Vec.Empty(), 100),
- Mapping.L2Lsh(dataset.dims, 200, 2, 1),
+ Mapping.SparseIndexed(dataset.dims),
+ NearestNeighborsQuery.SparseIndexed("vec", Vec.Empty(), Similarity.Jaccard),
+ Mapping.SparseIndexed(dataset.dims),
Seq(
- Query(NearestNeighborsQuery.L2Lsh("vec", Vec.Empty(), 100), 100),
- Query(NearestNeighborsQuery.L2Lsh("vec", Vec.Empty(), 100), 100),
- Query(NearestNeighborsQuery.L2Lsh("vec", Vec.Empty(), 100), 100),
- Query(NearestNeighborsQuery.L2Lsh("vec", Vec.Empty(), 100), 100)
+ Query(NearestNeighborsQuery.SparseIndexed("vec", Vec.Empty(), Similarity.Jaccard), 100)
)
)
s3Client.putObject("elastiknn-benchmarks", s"experiments/${exp.md5sum}.json", codecs.experimentCodec(exp).noSpaces)
plugin/src/main/scala/com/klibisz/elastiknn/mapper/VectorMapper.scala
@@ -8,9 +8,10 @@ import com.klibisz.elastiknn.query.{ExactQuery, LshQuery, SparseIndexedQuery}
import com.klibisz.elastiknn.{ELASTIKNN_NAME, VectorDimensionException}
import io.circe.syntax._
import io.circe.{Json, JsonObject}
- import org.apache.lucene.index.{IndexableField, Term}
+ import org.apache.lucene.index.{IndexOptions, IndexableField, Term}
import org.apache.lucene.search.similarities.BooleanSimilarity
import org.apache.lucene.search.{DocValuesFieldExistsQuery, Query, TermInSetQuery, TermQuery}
+ import org.apache.lucene.document
import org.apache.lucene.util.BytesRef
import org.elasticsearch.common.xcontent.{ToXContent, XContentBuilder}
import org.elasticsearch.index.mapper.Mapper.TypeParser
@@ -56,14 +57,44 @@ object VectorMapper {
private def incompatible(m: Mapping, v: Vec): Exception = new IllegalArgumentException(
s"Mapping [${nospaces(m)}] is not compatible with vector [${nospaces(v)}]"
)

+ class FieldType(typeName: String) extends MappedFieldType {
+
+   // We generally only care about the presence or absence of terms, not their counts or anything fancier.
+   this.setSimilarity(new SimilarityProvider("boolean", new BooleanSimilarity))
+   this.setOmitNorms(true)
+   this.setBoost(1f)
+   this.setTokenized(false)
+
+   override def typeName(): String = typeName
+   override def clone(): FieldType = new FieldType(typeName)
+   override def termQuery(value: Any, context: QueryShardContext): Query = value match {
+     case b: BytesRef => new TermQuery(new Term(name(), b))
+     case _ =>
+       throw new UnsupportedOperationException(
+         s"Field [${name()}] of type [${typeName()}] doesn't support term queries with value of type [${value.getClass}]")
+   }
+
+   override def existsQuery(context: QueryShardContext): Query = new DocValuesFieldExistsQuery(name())
+ }
+
+ val simpleTokenFieldType: document.FieldType = {
+   val ft = new document.FieldType
+   ft.setIndexOptions(IndexOptions.DOCS)
+   ft.setTokenized(false)
+   ft.setOmitNorms(true)
+   ft.freeze()
+   ft
+ }

}

abstract class VectorMapper[V <: Vec: ElasticsearchCodec] { self =>

val CONTENT_TYPE: String
def checkAndCreateFields(mapping: Mapping, field: String, vec: V): Try[Seq[IndexableField]]

- private val fieldType = new this.FieldType
+ private val fieldType = new VectorMapper.FieldType(CONTENT_TYPE)

import com.klibisz.elastiknn.utils.CirceUtils.javaMapEncoder

@@ -140,21 +171,4 @@ abstract class VectorMapper[V <: Vec: ElasticsearchCodec] { self =>
}
}

- class FieldType extends MappedFieldType {
-
-   // We generally only care about the presence or absence of terms, not their counts or anything fancier.
-   this.setSimilarity(new SimilarityProvider("boolean", new BooleanSimilarity))
-
-   override def typeName(): String = CONTENT_TYPE
-   override def clone(): FieldType = new FieldType
-   override def termQuery(value: Any, context: QueryShardContext): Query = value match {
-     case b: BytesRef => new TermQuery(new Term(name(), b))
-     case _ =>
-       throw new UnsupportedOperationException(
-         s"Field [${name()}] of type [${typeName()}] doesn't support term queries with value of type [${value.getClass}]")
-   }
-
-   override def existsQuery(context: QueryShardContext): Query = new DocValuesFieldExistsQuery(name())
- }

}
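A note on the similarity wired into the new FieldType above: BooleanSimilarity scores every matching term with just the query boost, ignoring term frequency, IDF, and length norms, which matches the comment that only the presence or absence of terms matters. A rough sketch of the wiring, mirroring the constructor call in the diff:

import org.apache.lucene.search.similarities.BooleanSimilarity
import org.elasticsearch.index.similarity.SimilarityProvider

// Constant per-term scoring: score(term, doc) = query boost whenever the term
// matches, so neither term counts nor norms are ever consulted.
val provider = new SimilarityProvider("boolean", new BooleanSimilarity)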
19 changes: 4 additions & 15 deletions plugin/src/main/scala/com/klibisz/elastiknn/query/LshQuery.scala
@@ -5,16 +5,14 @@ import java.util.Objects

import com.google.common.collect.MinMaxPriorityQueue
import com.klibisz.elastiknn.api.{Mapping, Vec}
+ import com.klibisz.elastiknn.mapper.VectorMapper
import com.klibisz.elastiknn.models.LshFunction
import com.klibisz.elastiknn.storage.{StoredVec, UnsafeSerialization}
- import org.apache.lucene.document.{Field, FieldType}
+ import org.apache.lucene.document.Field
import org.apache.lucene.index._
- import org.apache.lucene.queryparser.xml.builders.MatchAllDocsQueryBuilder
import org.apache.lucene.search._
import org.apache.lucene.util.BytesRef
import org.elasticsearch.common.lucene.search.function.{CombineFunction, FunctionScoreQuery, LeafScoreFunction, ScoreFunction}
- import org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder
- import org.elasticsearch.index.query.{BoolQueryBuilder, ConstantScoreQueryBuilder, MoreLikeThisQueryBuilder, QueryBuilder, TermQueryBuilder}

object LshQuery {

@@ -77,8 +75,7 @@ object LshQuery {
lshFunc(queryVec).foreach { h =>
val term = new Term(field, new BytesRef(UnsafeSerialization.writeInt(h)))
val termQuery = new TermQuery(term)
- val constQuery = new ConstantScoreQuery(termQuery) // TODO: is this necessary?
- builder.add(new BooleanClause(constQuery, BooleanClause.Occur.SHOULD))
+ builder.add(new BooleanClause(termQuery, BooleanClause.Occur.SHOULD))
}
builder.setMinimumNumberShouldMatch(1)
builder.build()
@@ -87,18 +84,10 @@
new FunctionScoreQuery(isecQuery, func, CombineFunction.REPLACE, 0f, Float.MaxValue)
}

- private val hashesFieldType: FieldType = {
-   val ft = new FieldType
-   ft.setIndexOptions(IndexOptions.DOCS)
-   ft.setTokenized(false)
-   ft.freeze()
-   ft
- }

def index[M <: Mapping, V <: Vec: StoredVec.Encoder, S <: StoredVec](field: String, vec: V, mapping: M)(
implicit lshFunctionCache: LshFunctionCache[M, V, S]): Seq[IndexableField] = {
ExactQuery.index(field, vec) ++ lshFunctionCache(mapping)(vec).map { h =>
- new Field(field, UnsafeSerialization.writeInt(h), hashesFieldType)
+ new Field(field, UnsafeSerialization.writeInt(h), VectorMapper.simpleTokenFieldType)
}
}
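To make the query-side change concrete, here is a self-contained sketch of how the match query over hash terms is assembled; hashesToQuery and its pre-serialized hash inputs are hypothetical stand-ins for the code above. With BooleanSimilarity on the field, each TermQuery already scores a constant, so the former ConstantScoreQuery wrapper is redundant:

import org.apache.lucene.index.Term
import org.apache.lucene.search.{BooleanClause, BooleanQuery, TermQuery}
import org.apache.lucene.util.BytesRef

// One SHOULD clause per LSH hash; a candidate document must share at least one hash.
def hashesToQuery(field: String, hashes: Seq[Array[Byte]]): BooleanQuery = {
  val builder = new BooleanQuery.Builder
  hashes.foreach { h =>
    val termQuery = new TermQuery(new Term(field, new BytesRef(h)))
    builder.add(new BooleanClause(termQuery, BooleanClause.Occur.SHOULD))
  }
  builder.setMinimumNumberShouldMatch(1)
  builder.build()
}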

plugin/src/main/scala/com/klibisz/elastiknn/query/SparseIndexedQuery.scala
@@ -3,6 +3,7 @@ package com.klibisz.elastiknn.query
import java.util.Objects

import com.klibisz.elastiknn.api._
+ import com.klibisz.elastiknn.mapper.VectorMapper
import com.klibisz.elastiknn.models.SparseIndexedSimilarityFunction
import com.klibisz.elastiknn.storage.UnsafeSerialization
import org.apache.lucene.document.{Field, FieldType, NumericDocValuesField}
@@ -54,6 +55,7 @@ object SparseIndexedQuery {
val clause = new BooleanClause(termQuery, BooleanClause.Occur.SHOULD)
builder.add(clause)
}
+ builder.setMinimumNumberShouldMatch(1)
builder.build()
}
val f = new SparseIndexedScoreFunction(field, queryVec, simFunc)
@@ -62,17 +64,9 @@

def numTrueDocValueField(field: String): String = s"$field.num_true"

- private val trueIndicesFieldType: FieldType = {
-   val ft = new FieldType
-   ft.setIndexOptions(IndexOptions.DOCS)
-   ft.setTokenized(false)
-   ft.freeze()
-   ft
- }

def index(field: String, vec: Vec.SparseBool): Seq[IndexableField] = {
vec.trueIndices.map { ti =>
- new Field(field, UnsafeSerialization.writeInt(ti), trueIndicesFieldType)
+ new Field(field, UnsafeSerialization.writeInt(ti), VectorMapper.simpleTokenFieldType)
} ++ ExactQuery.index(field, vec) :+ new NumericDocValuesField(numTrueDocValueField(field), vec.trueIndices.length)
}
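For the indexing side, a sketch of the layout this file produces for a sparse boolean vector, with writeInt as a placeholder for UnsafeSerialization.writeInt and the field type repeating the norms-free configuration of VectorMapper.simpleTokenFieldType: each true index becomes one term, and the number of true indices is stored as a doc value for the similarity function to read at query time.

import org.apache.lucene.document.{Field, FieldType, NumericDocValuesField}
import org.apache.lucene.index.{IndexOptions, IndexableField}

// Placeholder serialization; the real code uses UnsafeSerialization.writeInt.
def writeInt(i: Int): Array[Byte] = BigInt(i).toByteArray

// Same configuration as VectorMapper.simpleTokenFieldType above.
val tokenFieldType: FieldType = new FieldType
tokenFieldType.setIndexOptions(IndexOptions.DOCS)
tokenFieldType.setTokenized(false)
tokenFieldType.setOmitNorms(true)
tokenFieldType.freeze()

// One untokenized term per true index, plus a "<field>.num_true" doc value.
def indexSparse(field: String, trueIndices: Array[Int]): Seq[IndexableField] =
  trueIndices.toSeq.map(i => new Field(field, writeInt(i), tokenFieldType)) :+
    new NumericDocValuesField(s"$field.num_true", trueIndices.length.toLong)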

