Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Label Filter Cases (only compatible with Milvus) #455

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ dependencies = [
"psutil",
"polars",
"plotly",
"environs",
"environs<14.1.0",
"pydantic<v2",
"scikit-learn",
"pymilvus", # with pandas, numpy, ujson
Expand Down
28 changes: 1 addition & 27 deletions vectordb_bench/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,33 +27,7 @@ class config:
DROP_OLD = env.bool("DROP_OLD", True)
USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", True)

NUM_CONCURRENCY = env.list(
"NUM_CONCURRENCY",
[
1,
5,
10,
15,
20,
25,
30,
35,
40,
45,
50,
55,
60,
65,
70,
75,
80,
85,
90,
95,
100,
],
subcast=int,
)
NUM_CONCURRENCY = env.list("NUM_CONCURRENCY", [1, 5, 10, 20, 30, 40, 60, 80], subcast=int)

CONCURRENCY_DURATION = 30

Expand Down
3 changes: 2 additions & 1 deletion vectordb_bench/backend/assembler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from vectordb_bench.backend.clients import EmptyDBCaseConfig
from vectordb_bench.backend.data_source import DatasetSource
from vectordb_bench.backend.filter import FilterOp
from vectordb_bench.models import TaskConfig

from .cases import CaseLabel
Expand Down Expand Up @@ -55,7 +56,7 @@ def assemble_all(

# sort by dataset size
for _, runner in db2runner.items():
runner.sort(key=lambda x: x.ca.dataset.data.size)
runner.sort(key=lambda x: (x.ca.dataset.data.size, 0 if x.ca.filters.type == FilterOp.StrEqual else 1))

all_runners = []
all_runners.extend(load_runners)
Expand Down
91 changes: 72 additions & 19 deletions vectordb_bench/backend/cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@

from vectordb_bench import config
from vectordb_bench.backend.clients.api import MetricType
from vectordb_bench.backend.filter import Filter, FilterOp, IntFilter, LabelFilter, non_filter
from vectordb_bench.base import BaseModel
from vectordb_bench.frontend.components.custom.getCustomConfig import (
CustomDatasetConfig,
)
from vectordb_bench.frontend.components.custom.getCustomConfig import CustomDatasetConfig

from .dataset import CustomDataset, Dataset, DatasetManager, DatasetWithSizeType

Expand Down Expand Up @@ -50,6 +49,8 @@ class CaseType(Enum):

StreamingPerformanceCase = 200

LabelFilterPerformanceCase = 300

def case_cls(self, custom_configs: dict | None = None) -> type["Case"]:
if custom_configs is None:
return type2case.get(self)()
Expand Down Expand Up @@ -97,15 +98,21 @@ class Case(BaseModel):
filter_rate: float | None = None

@property
def filters(self) -> dict | None:
if self.filter_rate is not None:
target_id = round(self.filter_rate * self.dataset.data.size)
return {
"metadata": f">={target_id}",
"id": target_id,
}
def filters(self) -> Filter:
"""Default filter for a case: no filtering. Filtering subclasses override this."""
return non_filter

@property
def with_scalar_labels(self) -> bool:
"""True when this case filters by string-label equality and so needs scalar-label data."""
return self.filters.type == FilterOp.StrEqual

def check_scalar_labels(self) -> None:
"""Raise ValueError if a label-filter case is paired with a dataset lacking scalar labels."""
if self.with_scalar_labels and not self.dataset.data.with_scalar_labels:
msg = f"Case init failed: no scalar_labels data in current dataset ({self.dataset.data.full_name})"
raise ValueError(msg)

return None
def __init__(self, **kwargs):
super().__init__(**kwargs)
# Fail fast at construction time rather than mid-benchmark if the
# dataset cannot serve this case's filter type.
self.check_scalar_labels()


class CapacityCase(Case):
Expand Down Expand Up @@ -151,6 +158,14 @@ class Performance768D10M(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_10M


class IntFilterPerformanceCase(PerformanceCase):
    """Performance case whose search is restricted by an integer-range filter.

    The threshold is ``filter_rate`` of the dataset size, applied to the
    dataset's train-id field.
    """

    @property
    def filters(self) -> Filter:
        data = self.dataset.data
        id_field = data.train_id_field
        threshold = int(data.size * self.filter_rate)
        return IntFilter(
            filter_rate=self.filter_rate,
            int_field=id_field,
            int_value=threshold,
        )


class Performance768D1M(PerformanceCase):
case_id: CaseType = CaseType.Performance768D1M
dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
Expand All @@ -162,7 +177,7 @@ class Performance768D1M(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_1M


class Performance768D10M1P(PerformanceCase):
class Performance768D10M1P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance768D10M1P
filter_rate: float | int | None = 0.01
dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
Expand All @@ -174,7 +189,7 @@ class Performance768D10M1P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_10M


class Performance768D1M1P(PerformanceCase):
class Performance768D1M1P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance768D1M1P
filter_rate: float | int | None = 0.01
dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
Expand All @@ -186,7 +201,7 @@ class Performance768D1M1P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_1M


class Performance768D10M99P(PerformanceCase):
class Performance768D10M99P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance768D10M99P
filter_rate: float | int | None = 0.99
dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
Expand All @@ -198,7 +213,7 @@ class Performance768D10M99P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_10M


class Performance768D1M99P(PerformanceCase):
class Performance768D1M99P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance768D1M99P
filter_rate: float | int | None = 0.99
dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
Expand Down Expand Up @@ -246,7 +261,7 @@ class Performance1536D5M(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_5M


class Performance1536D500K1P(PerformanceCase):
class Performance1536D500K1P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance1536D500K1P
filter_rate: float | int | None = 0.01
dataset: DatasetManager = Dataset.OPENAI.manager(500_000)
Expand All @@ -258,7 +273,7 @@ class Performance1536D500K1P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_500K


class Performance1536D5M1P(PerformanceCase):
class Performance1536D5M1P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance1536D5M1P
filter_rate: float | int | None = 0.01
dataset: DatasetManager = Dataset.OPENAI.manager(5_000_000)
Expand All @@ -270,7 +285,7 @@ class Performance1536D5M1P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_5M


class Performance1536D500K99P(PerformanceCase):
class Performance1536D500K99P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance1536D500K99P
filter_rate: float | int | None = 0.99
dataset: DatasetManager = Dataset.OPENAI.manager(500_000)
Expand All @@ -282,7 +297,7 @@ class Performance1536D500K99P(PerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_500K


class Performance1536D5M99P(PerformanceCase):
class Performance1536D5M99P(IntFilterPerformanceCase):
case_id: CaseType = CaseType.Performance1536D5M99P
filter_rate: float | int | None = 0.99
dataset: DatasetManager = Dataset.OPENAI.manager(5_000_000)
Expand Down Expand Up @@ -408,6 +423,43 @@ def __init__(
)


class LabelFilterPerformanceCase(PerformanceCase):
    """Performance case restricted by a scalar-label equality filter.

    ``label_percentage`` selects which label value is filtered on (by its
    share of the dataset); dataset, timeouts, and filter rate are all
    derived from ``dataset_with_size_type``.
    """

    case_id: CaseType = CaseType.LabelFilterPerformanceCase
    dataset_with_size_type: DatasetWithSizeType
    label_percentage: float

    def __init__(
        self,
        dataset_with_size_type: DatasetWithSizeType | str,
        label_percentage: float,
        **kwargs,
    ):
        # Accept either the enum member or its string value.
        if not isinstance(dataset_with_size_type, DatasetWithSizeType):
            dataset_with_size_type = DatasetWithSizeType(dataset_with_size_type)
        case_name = f"Label-Filter-{label_percentage*100:.1f}% - {dataset_with_size_type.value}"
        case_description = f"Label-Filter-{label_percentage*100:.1f}% Performance Test ({dataset_with_size_type.value})"
        manager = dataset_with_size_type.get_manager()
        load_timeout = dataset_with_size_type.get_load_timeout()
        optimize_timeout = dataset_with_size_type.get_optimize_timeout()
        # Built once here only to derive the effective filter rate.
        label_filter = LabelFilter(label_percentage=label_percentage)
        super().__init__(
            name=case_name,
            description=case_description,
            dataset=manager,
            load_timeout=load_timeout,
            optimize_timeout=optimize_timeout,
            filter_rate=label_filter.filter_rate,
            dataset_with_size_type=dataset_with_size_type,
            label_percentage=label_percentage,
            **kwargs,
        )

    @property
    def filters(self) -> Filter:
        return LabelFilter(label_percentage=self.label_percentage)


type2case = {
CaseType.CapacityDim960: CapacityDim960,
CaseType.CapacityDim128: CapacityDim128,
Expand All @@ -427,4 +479,5 @@ def __init__(
CaseType.Performance1536D50K: Performance1536D50K,
CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
CaseType.StreamingPerformanceCase: StreamingPerformanceCase,
CaseType.LabelFilterPerformanceCase: LabelFilterPerformanceCase,
}
21 changes: 19 additions & 2 deletions vectordb_bench/backend/clients/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from pydantic import BaseModel, SecretStr, validator

from vectordb_bench.backend.filter import Filter, FilterOp


class MetricType(str, Enum):
L2 = "L2"
Expand Down Expand Up @@ -110,6 +112,21 @@ class VectorDB(ABC):
>>> milvus.search_embedding()
"""

# The filtering types supported by the VectorDB client; by default only
# non-filtered search is supported. (Was a bare string literal, which is a
# no-op expression statement, not a docstring — attribute docstrings would
# have to follow the attribute, and a comment is the idiomatic form here.)
supported_filter_types: list[FilterOp] = [FilterOp.NonFilter]

@classmethod
def filter_supported(cls, filters: Filter) -> bool:
"""Ensure that the filters are supported before testing filtering cases.

Returns True when the filter's type appears in this client's
``supported_filter_types`` declaration.
"""
return filters.type in cls.supported_filter_types

def prepare_filter(self, filters: Filter) -> None:
"""Optional hook: pre-build filter conditions once per case to reduce
redundant work during the testing process.

All search tests within a case use the same filtering conditions, so a
client may cache its translated filter expression here. Default: no-op.
"""
return

@abstractmethod
def __init__(
self,
Expand Down Expand Up @@ -160,8 +177,9 @@ def insert_embeddings(
self,
embeddings: list[list[float]],
metadata: list[int],
labels_data: list[str] | None = None,
**kwargs,
) -> (int, Exception):
) -> tuple[int, Exception]:
"""Insert the embeddings to the vector database. The default number of embeddings for
each insert_embeddings is 5000.

Expand All @@ -180,7 +198,6 @@ def search_embedding(
self,
query: list[float],
k: int = 100,
filters: dict | None = None,
) -> list[int]:
"""Get k most similar embeddings to query vector.

Expand Down
3 changes: 2 additions & 1 deletion vectordb_bench/backend/clients/milvus/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


class MilvusConfig(DBConfig):
uri: SecretStr = "http://10.102.7.230:19530"
uri: SecretStr = "http://localhost:19530"
user: str | None = None
password: SecretStr | None = None

Expand Down Expand Up @@ -33,6 +33,7 @@ class MilvusIndexConfig(BaseModel):

index: IndexType
metric_type: MetricType | None = None
use_partition_key: bool = True # for label-filter

@property
def is_gpu_index(self) -> bool:
Expand Down
Loading