Remove getClassLoader permissions in plugin-security.policy #656

Merged 12 commits on Mar 23, 2024
2 changes: 1 addition & 1 deletion docs/pages/performance/fashion-mnist/plot.b64


Binary file modified docs/pages/performance/fashion-mnist/plot.png
16 changes: 8 additions & 8 deletions docs/pages/performance/fashion-mnist/results.md
@@ -1,10 +1,10 @@
|Model|Parameters|Recall|Queries per Second|
|---|---|---|---|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=0|0.378|381.122|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=0|0.447|315.007|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=3|0.635|302.868|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=3|0.716|258.193|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=0|0.768|335.365|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=0|0.846|288.638|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=3|0.921|230.383|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=3|0.960|207.293|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=0|0.378|381.926|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=0|0.447|315.984|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=3|0.635|298.115|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=3|0.716|258.478|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=0|0.767|335.131|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=0|0.846|282.080|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=3|0.921|222.554|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=3|0.960|202.313|
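Net effect of the regenerated benchmark table: recall is essentially unchanged (only the w=2048 candidates=500 probes=0 row moves, from 0.768 to 0.767), and the queries-per-second figures shift by a few percent in both directions, consistent with run-to-run benchmark noise.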
@@ -0,0 +1,44 @@
package com.klibisz.elastiknn.api;

import java.util.Arrays;

public class FloatArrayBuffer {

    // Track the last final capacity to exploit the fact that the current
    // vector length is probably the same as the last vector length.
    // Using a non-atomic because race conditions are unlikely to hurt.
    private static final int minInitialCapacity = 4;
    private static final int maxInitialCapacity = 4096;
    private static int nextInitialCapacity = minInitialCapacity;

    private float[] array;

    private int index = 0;

    public FloatArrayBuffer() {
        this.array = new float[nextInitialCapacity];
    }

    public void append(float f) {
        // I also measured a try/catch approach that attempts to set the index,
        // catches an IndexOutOfBoundsException, and then expands the array.
        // The if statement gets about 557013 ops/s on r6i.4xlarge.
        // The try/catch gets about 523811 ops/s on r6i.4xlarge.
        // Sticking with the if statement because it's simpler and faster.
        if (index == this.array.length) {
            this.array = Arrays.copyOf(this.array, this.array.length * 2);
        }
        this.array[index++] = f;
    }

    public float[] toArray() {
        if (nextInitialCapacity != index) {
            nextInitialCapacity = Math.min(maxInitialCapacity, Math.max(minInitialCapacity, index));
        }
        if (this.array.length == index) {
            return this.array;
        } else {
            return Arrays.copyOf(this.array, index);
        }
    }
}
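For context, a minimal usage sketch (hypothetical, not part of the diff): each parse allocates one buffer, appends values one at a time, and calls toArray exactly once at the end, which is also the point where nextInitialCapacity is updated so the next buffer starts out near the right size.

    // Hypothetical usage; FloatArrayBuffer is the Java class above.
    val buf = new FloatArrayBuffer()
    Array(0.1f, 0.2f, 0.3f).foreach(buf.append)
    // Length 3; the next buffer will start at capacity 4 (minInitialCapacity).
    val arr: Array[Float] = buf.toArray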
@@ -0,0 +1,39 @@
package com.klibisz.elastiknn.api;

import java.util.Arrays;

public class IntArrayBuffer {

    // Track the last final capacity to exploit the fact that the current
    // vector length is probably the same as the last vector length.
    // Using a non-atomic because race conditions are unlikely to hurt.
    private static final int minInitialCapacity = 4;
    private static final int maxInitialCapacity = 4096;
    private static int nextInitialCapacity = minInitialCapacity;

    private int[] array;

    private int index = 0;

    public IntArrayBuffer() {
        this.array = new int[nextInitialCapacity];
    }

    public void append(int i) {
        if (index == this.array.length) {
            this.array = Arrays.copyOf(this.array, this.array.length * 2);
        }
        this.array[index++] = i;
    }

    public int[] toArray() {
        if (nextInitialCapacity != index) {
            nextInitialCapacity = Math.min(maxInitialCapacity, Math.max(minInitialCapacity, index));
        }
        if (this.array.length == index) {
            return this.array;
        } else {
            return Arrays.copyOf(this.array, index);
        }
    }
}
@@ -7,7 +7,6 @@ import org.elasticsearch.xcontent._

import java.io.ByteArrayOutputStream
import scala.collection.immutable.SortedSet
-import scala.collection.mutable.ArrayBuffer

/** JSON codec for Elastiknn API types, implemented using the Elasticsearch XContentBuilder and XContentParser.
  */
@@ -397,8 +396,8 @@ object XContentCodec
  private def assertValue(name: String, text: String, expected: SortedSet[String]): Unit =
    if (expected.contains(text)) () else throw new XContentParseException(unexpectedValue(name, text, expected))

-  private def parseFloatArray(p: XContentParser, expectedLength: Int): Array[Float] = {
-    val b = new ArrayBuffer[Float](expectedLength)
+  private def parseFloatArray(p: XContentParser): Array[Float] = {
+    val b = new FloatArrayBuffer()
    p.currentToken() match {
      case START_ARRAY  => ()
      case VALUE_NUMBER => b.append(p.floatValue())
@@ -411,8 +410,8 @@
    b.toArray
  }

-  private def parseSparseBoolArray(p: XContentParser, expectedLength: Int): Array[Int] = {
-    val b = new ArrayBuffer[Int](expectedLength)
+  private def parseSparseBoolArray(p: XContentParser): Array[Int] = {
+    val b = new IntArrayBuffer()
    p.currentToken() match {
      case START_ARRAY  => ()
      case VALUE_NUMBER => b.append(p.intValue())
@@ -469,13 +468,13 @@
            index = Some(p.text())
          case n @ Names.TRUE_INDICES =>
            assertToken(n, p.nextToken(), START_ARRAY)
-           trueIndices = Some(parseSparseBoolArray(p, 42))
+           trueIndices = Some(parseSparseBoolArray(p))
          case n @ Names.TOTAL_INDICES =>
            assertToken(n, p.nextToken(), VALUE_NUMBER)
            totalIndices = Some(p.intValue())
          case n @ Names.VALUES =>
            assertToken(n, p.nextToken(), START_ARRAY)
-           values = Some(parseFloatArray(p, 42))
+           values = Some(parseFloatArray(p))
          case _ => p.nextToken()
        }
      }
@@ -485,9 +484,9 @@
        case END_ARRAY =>
          values = Some(Array.empty)
        case VALUE_NUMBER =>
-         values = Some(parseFloatArray(p, 42))
+         values = Some(parseFloatArray(p))
        case START_ARRAY =>
-         trueIndices = Some(parseSparseBoolArray(p, 42))
+         trueIndices = Some(parseSparseBoolArray(p))
          assertToken(p.nextToken(), VALUE_NUMBER)
          totalIndices = Some(p.intValue())
        case t =>
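For context, a sketch of the vector encodings the parsing code above appears to accept (the key spellings come from the Names constants, which are not shown in this diff, so treat them as illustrative):

    {"values": [0.1, 0.2, 0.3]}                   // dense float vector, object form
    {"true_indices": [1, 3], "total_indices": 8}  // sparse bool vector, object form
    [0.1, 0.2, 0.3]                               // dense shorthand: bare number array
    [[1, 3], 8]                                   // sparse shorthand: indices array, then total

The second parse branch above handles the shorthand forms: a leading number means a dense float array, while a leading array means sparse indices followed by the total index count.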
@@ -0,0 +1,49 @@
package com.klibisz.elastiknn.jmhbenchmarks

import com.klibisz.elastiknn.api.FloatArrayBuffer
import org.openjdk.jmh.annotations._

import scala.collection.mutable.ArrayBuffer
import scala.util.Random

@State(Scope.Benchmark)
class FloatArrayBufferBenchmarksState {
  implicit private val rng: Random = new Random(0)
  val lst768 = (0 until 768).map(_ => rng.nextFloat()).toList
}

class FloatArrayBufferBenchmarks {

  @Benchmark
  @BenchmarkMode(Array(Mode.Throughput))
  @Fork(value = 1)
  @Warmup(time = 5, iterations = 1)
  @Measurement(time = 5, iterations = 1)
  def scalaAppendFixedInitialSize(state: FloatArrayBufferBenchmarksState): Int = {
    val buf = new ArrayBuffer[Float]()
    state.lst768.foreach(buf.append)
    buf.toArray.length
  }

  @Benchmark
  @BenchmarkMode(Array(Mode.Throughput))
  @Fork(value = 1)
  @Warmup(time = 5, iterations = 1)
  @Measurement(time = 5, iterations = 1)
  def scalaAppendKnownInitialSize(state: FloatArrayBufferBenchmarksState): Int = {
    val buf = new ArrayBuffer[Float](768)
    state.lst768.foreach(buf.append)
    buf.toArray.length
  }

  @Benchmark
  @BenchmarkMode(Array(Mode.Throughput))
  @Fork(value = 1)
  @Warmup(time = 5, iterations = 1)
  @Measurement(time = 5, iterations = 1)
  def customAppend(state: FloatArrayBufferBenchmarksState): Int = {
    val buf = new FloatArrayBuffer()
    state.lst768.foreach(buf.append)
    buf.toArray.length
  }
}
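Assuming these benchmarks are wired up through the sbt-jmh plugin (the module name below is illustrative, not taken from this diff), a run would look something like:

    sbt "jmh-benchmarks / Jmh / run .*FloatArrayBufferBenchmarks.*"

The throughput figures quoted in FloatArrayBuffer.append's comment (~557k ops/s for the if statement vs ~524k ops/s for the try/catch variant, on r6i.4xlarge) presumably came from runs of this kind.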
@@ -0,0 +1,9 @@
package com.klibisz.elastiknn;

import org.elasticsearch.index.mapper.FieldMapper;

public class VectorMapperUtil {

    public static FieldMapper.Parameter<?>[] EMPTY_ARRAY_FIELD_MAPPER_PARAMETER = new FieldMapper.Parameter[0];

}
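For context, a minimal sketch (hypothetical, not part of the diff) of why this constant lives in Java rather than Scala. As the comment in VectorMapper below notes, Scala's generic Array construction goes through a ClassTag, and resolving one can require java.lang.RuntimePermission "getClassLoader" under the security manager:

    import scala.reflect.ClassTag

    // Hypothetical illustration: Array.empty[T] needs a ClassTag[T], whose
    // resolution may touch the class loader and thus the removed permission.
    def emptyParams[T: ClassTag]: Array[T] = Array.empty[T]

    // The plain Java array literal in VectorMapperUtil sidesteps the ClassTag
    // machinery entirely, so no extra permission is needed.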
@@ -1,3 +1,2 @@
grant {
-  permission java.lang.RuntimePermission "getClassLoader";
};
@@ -30,7 +30,7 @@ object VectorMapper
      else {
        val sorted = vec.sorted() // Sort for faster intersections on the query side.
        mapping match {
-         case Mapping.SparseBool(_) => Try(ExactQuery.index(field, sorted))
+         case Mapping.SparseBool(_) => Try(Seq(ExactQuery.index(field, sorted)))
          case m: Mapping.JaccardLsh =>
            Try(HashingQuery.index(field, luceneFieldType, sorted, modelCache(m).hash(vec.trueIndices, vec.totalIndices)))
          case m: Mapping.HammingLsh =>
@@ -51,7 +51,7 @@
        Failure(ElastiknnException.vectorDimensions(vec.values.length, mapping.dims))
      else
        mapping match {
-         case Mapping.DenseFloat(_) => Try(ExactQuery.index(field, vec))
+         case Mapping.DenseFloat(_) => Try(Seq(ExactQuery.index(field, vec)))
          case m: Mapping.CosineLsh => Try(HashingQuery.index(field, luceneFieldType, vec, modelCache(m).hash(vec.values)))
          case m: Mapping.L2Lsh => Try(HashingQuery.index(field, luceneFieldType, vec, modelCache(m).hash(vec.values)))
          case m: Mapping.PermutationLsh => Try(HashingQuery.index(field, luceneFieldType, vec, modelCache(m).hash(vec.values)))
@@ -138,6 +138,9 @@ abstract class VectorMapper[V <: Vec: XContentCodec.Decoder] { self =>
      override def getMergeBuilder: FieldMapper.Builder = new Builder(simpleName(), mapping)
    }

-   override def getParameters: Array[FieldMapper.Parameter[_]] = Array.empty
+   override def getParameters: Array[FieldMapper.Parameter[_]] =
+     // This has to be defined in Java because Scala's Array wrapper uses ClassTag,
+     // which requires the extra permission: java.lang.RuntimePermission "getClassLoader".
+     VectorMapperUtil.EMPTY_ARRAY_FIELD_MAPPER_PARAMETER
  }
}
@@ -53,8 +53,8 @@ final class ExactQuery[V <: Vec, S <: StoredVec](field: String, queryVec: V, sim
}

object ExactQuery {
-  def index[V <: Vec: StoredVec.Encoder](field: String, vec: V): Seq[IndexableField] = {
+  def index[V <: Vec: StoredVec.Encoder](field: String, vec: V): IndexableField = {
    val storedVec = implicitly[StoredVec.Encoder[V]].apply(vec)
-   Seq(new BinaryDocValuesField(field, new BytesRef(storedVec)))
+   new BinaryDocValuesField(field, new BytesRef(storedVec))
  }
}
@@ -11,6 +11,7 @@ import org.apache.lucene.util.BytesRef
import org.elasticsearch.common.lucene.search.function.{CombineFunction, LeafScoreFunction, ScoreFunction}

import java.util.Objects
+import scala.collection.mutable.ListBuffer

final class HashingQuery[V <: Vec, S <: StoredVec: Decoder](
    field: String,
@@ -52,10 +53,15 @@
    private val reader = ctx.reader()
    private val terms = reader.terms(field)
    private val termsEnum = terms.iterator()
-   private val postings = hashes.sorted.flatMap { h =>
-     if (termsEnum.seekExact(new BytesRef(h.hash))) Some(termsEnum.postings(null, PostingsEnum.NONE))
-     else None
+   private val postings: Seq[PostingsEnum] = {
+     val buf = new ListBuffer[PostingsEnum]()
+     hashes.sorted.foreach { h =>
+       // Prepending is constant-time on ListBuffer; the reverse below restores sorted order.
+       if (termsEnum.seekExact(new BytesRef(h.hash))) buf.prepend(termsEnum.postings(null, PostingsEnum.NONE))
+     }
+     buf.toList.reverse
    }

    override def score(docId: Int, subQueryScore: Float): Double = {
      val intersection = postings.count { p => p.docID() != DocIdSetIterator.NO_MORE_DOCS && p.advance(docId) == docId }
      simFunc.maxScore * (intersection * 1d / hashes.length)
@@ -84,8 +90,11 @@ object HashingQuery
      fieldType: FieldType,
      vec: V,
      hashes: Array[HashAndFreq]
-  ): Seq[IndexableField] = ExactQuery.index(field, vec) ++ hashes.flatMap { h =>
-    val f = new Field(field, h.hash, fieldType)
-    (0 until h.freq).map(_ => f)
+  ): Seq[IndexableField] = {
+    val buffer = ListBuffer.empty[IndexableField]
+    hashes.foreach { h =>
+      (0 until h.freq).foreach(_ => buffer.prepend(new Field(field, h.hash, fieldType)))
+    }
+    buffer.prepend(ExactQuery.index(field, vec)).toList
  }
}
@@ -15,6 +15,10 @@ import scala.util.Random

class PermutationLshModelSuite extends AnyFunSuite with Matchers with LuceneSupport {

+  // For some unknown reason the exact score values started to slightly differ around March 2024.
+  def round(f: Float): Float =
+    BigDecimal(f).setScale(6, BigDecimal.RoundingMode.HALF_UP).floatValue

test("lucene example where counting matters") {

// This example demonstrates a tricky condition: 0 appears once in the query vector and three times in corpus vector
@@ -62,7 +66,7 @@
    } { case (r, s) =>
      queryVecs.map { v =>
        val q = new HashingQuery("vec", v, 200, lsh.hash(v.values), cosine)
-       s.search(q.toLuceneQuery(r), 100).scoreDocs.map(sd => (sd.doc, sd.score)).toVector
+       s.search(q.toLuceneQuery(r), 100).scoreDocs.map(sd => (sd.doc, round(sd.score))).toVector
      }
    }
    queryResults