
Disable norms on fields used to store LSH hashes (#78)
I set up a benchmark for the SparseIndexed mapping/query and noticed a lot of time spent in the Lucene80NormsProducer class. Setting .omitNorms(true) in a few places seems to remove that class from the sampler's output. For the ExecuteLocal benchmark, this cuts the time for 1000 serial queries from ~100 seconds to ~85 seconds, and the time for 1000 parallel queries from 74 seconds to 62 seconds.
alexklibisz authored Jun 17, 2020
1 parent adf8262 commit dfb81fd
Showing 4 changed files with 47 additions and 51 deletions.
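For context, here is a minimal, self-contained sketch (in Scala, against the plain Lucene API) of the pattern this commit applies; the field name "vec" and the hash bytes are placeholders. Hash terms are indexed as single untokenized tokens, only document IDs are recorded, and norms are omitted, so scoring never reads them back through Lucene80NormsProducer:

import org.apache.lucene.document.{Document, Field, FieldType}
import org.apache.lucene.index.IndexOptions

// Field type for opaque hash terms: doc IDs only, no tokenization, no norms.
val hashFieldType: FieldType = new FieldType
hashFieldType.setIndexOptions(IndexOptions.DOCS) // skip term frequencies and positions
hashFieldType.setTokenized(false)                // each hash is one opaque token
hashFieldType.setOmitNorms(true)                 // the change this commit makes
hashFieldType.freeze()

val doc = new Document
doc.add(new Field("vec", Array[Byte](1, 2, 3), hashFieldType)) // placeholder serialized hash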
@@ -102,6 +102,7 @@ object Execute extends App {
} yield ()
}
_ <- eknnClient.execute(refreshIndex(trainIndexName, testIndexName))
+ _ <- eknnClient.execute(forceMerge(trainIndexName, testIndexName))
} yield ()
}

@@ -159,6 +160,7 @@ object Execute extends App {

// Run searches on the holdout vectors.
(singleResults, totalDuration) <- search(testVecs)
+ _ <- log.info(s"Completed ${singleResults.length} searches in ${totalDuration / 1000f} seconds")

} yield BenchmarkResult(dataset, eknnMapping, eknnQuery, k, parallelism, totalDuration, singleResults)
}
@@ -254,17 +256,14 @@ object ExecuteLocal extends App {

override def run(args: List[String]): URIO[Console, ExitCode] = {
val s3Client = S3Utils.minioClient()
- val dataset = Dataset.RandomDenseFloat(1024, 50000)
+ val dataset = Dataset.RandomSparseBool(1000, 10000)
val exp = Experiment(
dataset,
- Mapping.L2Lsh(dataset.dims, 200, 2, 1),
- NearestNeighborsQuery.L2Lsh("vec", Vec.Empty(), 100),
- Mapping.L2Lsh(dataset.dims, 200, 2, 1),
+ Mapping.SparseIndexed(dataset.dims),
+ NearestNeighborsQuery.SparseIndexed("vec", Vec.Empty(), Similarity.Jaccard),
+ Mapping.SparseIndexed(dataset.dims),
Seq(
- Query(NearestNeighborsQuery.L2Lsh("vec", Vec.Empty(), 100), 100),
- Query(NearestNeighborsQuery.L2Lsh("vec", Vec.Empty(), 100), 100),
- Query(NearestNeighborsQuery.L2Lsh("vec", Vec.Empty(), 100), 100),
- Query(NearestNeighborsQuery.L2Lsh("vec", Vec.Empty(), 100), 100)
+ Query(NearestNeighborsQuery.SparseIndexed("vec", Vec.Empty(), Similarity.Jaccard), 100)
)
)
s3Client.putObject("elastiknn-benchmarks", s"experiments/${exp.md5sum}.json", codecs.experimentCodec(exp).noSpaces)
plugin/src/main/scala/com/klibisz/elastiknn/mapper/VectorMapper.scala
@@ -8,9 +8,10 @@ import com.klibisz.elastiknn.query.{ExactQuery, LshQuery, SparseIndexedQuery}
import com.klibisz.elastiknn.{ELASTIKNN_NAME, VectorDimensionException}
import io.circe.syntax._
import io.circe.{Json, JsonObject}
- import org.apache.lucene.index.{IndexableField, Term}
+ import org.apache.lucene.index.{IndexOptions, IndexableField, Term}
import org.apache.lucene.search.similarities.BooleanSimilarity
import org.apache.lucene.search.{DocValuesFieldExistsQuery, Query, TermInSetQuery, TermQuery}
+ import org.apache.lucene.document
import org.apache.lucene.util.BytesRef
import org.elasticsearch.common.xcontent.{ToXContent, XContentBuilder}
import org.elasticsearch.index.mapper.Mapper.TypeParser
@@ -56,14 +57,44 @@ object VectorMapper {
private def incompatible(m: Mapping, v: Vec): Exception = new IllegalArgumentException(
s"Mapping [${nospaces(m)}] is not compatible with vector [${nospaces(v)}]"
)

+ class FieldType(typeName: String) extends MappedFieldType {
+
+   // We generally only care about the presence or absence of terms, not their counts or anything fancier.
+   this.setSimilarity(new SimilarityProvider("boolean", new BooleanSimilarity))
+   this.setOmitNorms(true)
+   this.setBoost(1f)
+   this.setTokenized(false)
+
+   override def typeName(): String = typeName
+   override def clone(): FieldType = new FieldType(typeName)
+   override def termQuery(value: Any, context: QueryShardContext): Query = value match {
+     case b: BytesRef => new TermQuery(new Term(name(), b))
+     case _ =>
+       throw new UnsupportedOperationException(
+         s"Field [${name()}] of type [${typeName()}] doesn't support term queries with value of type [${value.getClass}]")
+   }
+
+   override def existsQuery(context: QueryShardContext): Query = new DocValuesFieldExistsQuery(name())
+ }
+
+ val simpleTokenFieldType: document.FieldType = {
+   val ft = new document.FieldType
+   ft.setIndexOptions(IndexOptions.DOCS)
+   ft.setTokenized(false)
+   ft.setOmitNorms(true)
+   ft.freeze()
+   ft
+ }

}

abstract class VectorMapper[V <: Vec: ElasticsearchCodec] { self =>

val CONTENT_TYPE: String
def checkAndCreateFields(mapping: Mapping, field: String, vec: V): Try[Seq[IndexableField]]

- private val fieldType = new this.FieldType
+ private val fieldType = new VectorMapper.FieldType(CONTENT_TYPE)

import com.klibisz.elastiknn.utils.CirceUtils.javaMapEncoder

@@ -140,21 +171,4 @@ abstract class VectorMapper[V <: Vec: ElasticsearchCodec] { self =>
}
}

- class FieldType extends MappedFieldType {
-
-   // We generally only care about the presence or absence of terms, not their counts or anything fancier.
-   this.setSimilarity(new SimilarityProvider("boolean", new BooleanSimilarity))
-
-   override def typeName(): String = CONTENT_TYPE
-   override def clone(): FieldType = new FieldType
-   override def termQuery(value: Any, context: QueryShardContext): Query = value match {
-     case b: BytesRef => new TermQuery(new Term(name(), b))
-     case _ =>
-       throw new UnsupportedOperationException(
-         s"Field [${name()}] of type [${typeName()}] doesn't support term queries with value of type [${value.getClass}]")
-   }
-
-   override def existsQuery(context: QueryShardContext): Query = new DocValuesFieldExistsQuery(name())
- }

}
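A note on the similarity wired into the new FieldType above: BooleanSimilarity scores every matching term with just the query boost, ignoring term frequency, IDF, and length norms, which matches the comment that only the presence or absence of terms matters. A rough sketch of the wiring, mirroring the constructor call in the diff:

import org.apache.lucene.search.similarities.BooleanSimilarity
import org.elasticsearch.index.similarity.SimilarityProvider

// Constant per-term scoring: score(term, doc) = query boost whenever the term
// matches, so neither term counts nor norms are ever consulted.
val provider = new SimilarityProvider("boolean", new BooleanSimilarity)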
19 changes: 4 additions & 15 deletions plugin/src/main/scala/com/klibisz/elastiknn/query/LshQuery.scala
@@ -5,16 +5,14 @@ import java.util.Objects

import com.google.common.collect.MinMaxPriorityQueue
import com.klibisz.elastiknn.api.{Mapping, Vec}
+ import com.klibisz.elastiknn.mapper.VectorMapper
import com.klibisz.elastiknn.models.LshFunction
import com.klibisz.elastiknn.storage.{StoredVec, UnsafeSerialization}
- import org.apache.lucene.document.{Field, FieldType}
+ import org.apache.lucene.document.Field
import org.apache.lucene.index._
- import org.apache.lucene.queryparser.xml.builders.MatchAllDocsQueryBuilder
import org.apache.lucene.search._
import org.apache.lucene.util.BytesRef
import org.elasticsearch.common.lucene.search.function.{CombineFunction, FunctionScoreQuery, LeafScoreFunction, ScoreFunction}
- import org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder
- import org.elasticsearch.index.query.{BoolQueryBuilder, ConstantScoreQueryBuilder, MoreLikeThisQueryBuilder, QueryBuilder, TermQueryBuilder}

object LshQuery {

@@ -77,8 +75,7 @@ object LshQuery {
lshFunc(queryVec).foreach { h =>
val term = new Term(field, new BytesRef(UnsafeSerialization.writeInt(h)))
val termQuery = new TermQuery(term)
- val constQuery = new ConstantScoreQuery(termQuery) // TODO: is this necessary?
- builder.add(new BooleanClause(constQuery, BooleanClause.Occur.SHOULD))
+ builder.add(new BooleanClause(termQuery, BooleanClause.Occur.SHOULD))
}
builder.setMinimumNumberShouldMatch(1)
builder.build()
@@ -87,18 +84,10 @@
new FunctionScoreQuery(isecQuery, func, CombineFunction.REPLACE, 0f, Float.MaxValue)
}

- private val hashesFieldType: FieldType = {
-   val ft = new FieldType
-   ft.setIndexOptions(IndexOptions.DOCS)
-   ft.setTokenized(false)
-   ft.freeze()
-   ft
- }

def index[M <: Mapping, V <: Vec: StoredVec.Encoder, S <: StoredVec](field: String, vec: V, mapping: M)(
implicit lshFunctionCache: LshFunctionCache[M, V, S]): Seq[IndexableField] = {
ExactQuery.index(field, vec) ++ lshFunctionCache(mapping)(vec).map { h =>
- new Field(field, UnsafeSerialization.writeInt(h), hashesFieldType)
+ new Field(field, UnsafeSerialization.writeInt(h), VectorMapper.simpleTokenFieldType)
}
}
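To make the query-side change concrete, here is a self-contained sketch of how the match query over hash terms is assembled; hashesToQuery and its pre-serialized hash inputs are hypothetical stand-ins for the code above. With BooleanSimilarity on the field, each TermQuery already scores a constant, so the former ConstantScoreQuery wrapper is redundant:

import org.apache.lucene.index.Term
import org.apache.lucene.search.{BooleanClause, BooleanQuery, TermQuery}
import org.apache.lucene.util.BytesRef

// One SHOULD clause per LSH hash; a candidate document must share at least one hash.
def hashesToQuery(field: String, hashes: Seq[Array[Byte]]): BooleanQuery = {
  val builder = new BooleanQuery.Builder
  hashes.foreach { h =>
    val termQuery = new TermQuery(new Term(field, new BytesRef(h)))
    builder.add(new BooleanClause(termQuery, BooleanClause.Occur.SHOULD))
  }
  builder.setMinimumNumberShouldMatch(1)
  builder.build()
}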

plugin/src/main/scala/com/klibisz/elastiknn/query/SparseIndexedQuery.scala
@@ -3,6 +3,7 @@ package com.klibisz.elastiknn.query
import java.util.Objects

import com.klibisz.elastiknn.api._
+ import com.klibisz.elastiknn.mapper.VectorMapper
import com.klibisz.elastiknn.models.SparseIndexedSimilarityFunction
import com.klibisz.elastiknn.storage.UnsafeSerialization
import org.apache.lucene.document.{Field, FieldType, NumericDocValuesField}
@@ -54,6 +55,7 @@ object SparseIndexedQuery {
val clause = new BooleanClause(termQuery, BooleanClause.Occur.SHOULD)
builder.add(clause)
}
+ builder.setMinimumNumberShouldMatch(1)
builder.build()
}
val f = new SparseIndexedScoreFunction(field, queryVec, simFunc)
@@ -62,17 +64,9 @@

def numTrueDocValueField(field: String): String = s"$field.num_true"

- private val trueIndicesFieldType: FieldType = {
-   val ft = new FieldType
-   ft.setIndexOptions(IndexOptions.DOCS)
-   ft.setTokenized(false)
-   ft.freeze()
-   ft
- }

def index(field: String, vec: Vec.SparseBool): Seq[IndexableField] = {
vec.trueIndices.map { ti =>
- new Field(field, UnsafeSerialization.writeInt(ti), trueIndicesFieldType)
+ new Field(field, UnsafeSerialization.writeInt(ti), VectorMapper.simpleTokenFieldType)
} ++ ExactQuery.index(field, vec) :+ new NumericDocValuesField(numTrueDocValueField(field), vec.trueIndices.length)
}
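For the indexing side, a sketch of the layout this file produces for a sparse boolean vector, with writeInt as a placeholder for UnsafeSerialization.writeInt and the field type repeating the norms-free configuration of VectorMapper.simpleTokenFieldType: each true index becomes one term, and the number of true indices is stored as a doc value for the similarity function to read at query time.

import org.apache.lucene.document.{Field, FieldType, NumericDocValuesField}
import org.apache.lucene.index.{IndexOptions, IndexableField}

// Placeholder serialization; the real code uses UnsafeSerialization.writeInt.
def writeInt(i: Int): Array[Byte] = BigInt(i).toByteArray

// Same configuration as VectorMapper.simpleTokenFieldType above.
val tokenFieldType: FieldType = new FieldType
tokenFieldType.setIndexOptions(IndexOptions.DOCS)
tokenFieldType.setTokenized(false)
tokenFieldType.setOmitNorms(true)
tokenFieldType.freeze()

// One untokenized term per true index, plus a "<field>.num_true" doc value.
def indexSparse(field: String, trueIndices: Array[Int]): Seq[IndexableField] =
  trueIndices.toSeq.map(i => new Field(field, writeInt(i), tokenFieldType)) :+
    new NumericDocValuesField(s"$field.num_true", trueIndices.length.toLong)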

