Commit

A candidate that uses Faiss 1.7.4
Signed-off-by: Alexandr Guzhva <[email protected]>
alexanderguzhva committed Nov 1, 2023
1 parent 14a1406 commit 609c978
Showing 24 changed files with 565 additions and 179 deletions.
2 changes: 2 additions & 0 deletions benchmark/CMakeLists.txt
@@ -51,3 +51,5 @@ benchmark_test(benchmark_float_bitset hdf5/benchmark_float_bitset.cpp)
benchmark_test(benchmark_float_qps hdf5/benchmark_float_qps.cpp)
benchmark_test(benchmark_float_range hdf5/benchmark_float_range.cpp)
benchmark_test(benchmark_float_range_bitset hdf5/benchmark_float_range_bitset.cpp)

benchmark_test(gen_hdf5_file hdf5/gen_hdf5_file.cpp)
105 changes: 18 additions & 87 deletions benchmark/hdf5/benchmark_hdf5.h
@@ -330,6 +330,20 @@ class Benchmark_hdf5 : public Benchmark_base {
return data_out;
}

void
write_hdf5_dataset(hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
const void* data) {
hsize_t dims[2];
dims[0] = rows;
dims[1] = cols;
auto dataspace = H5Screate_simple(2, dims, NULL);
auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
assert(err == 0);
H5Dclose(dataset);
H5Sclose(dataspace);
}

// For binary vector, dim should be divided by 32, since we use int32 to store binary vector data */
template <bool is_binary>
void
@@ -338,31 +352,18 @@ class Benchmark_hdf5 : public Benchmark_base {
/* Open the file and the dataset. */
hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
const void* data) {
hsize_t dims[2];
dims[0] = rows;
dims[1] = cols;
auto dataspace = H5Screate_simple(2, dims, NULL);
auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
assert(err == 0);
H5Dclose(dataset);
H5Sclose(dataspace);
};

/* write train dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim / 32, xb);
}

/* write test dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim / 32, xq);
}

/* write ground-truth labels dataset */
@@ -388,31 +389,18 @@ class Benchmark_hdf5 : public Benchmark_base {
/* Open the file and the dataset. */
hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
const void* data) {
hsize_t dims[2];
dims[0] = rows;
dims[1] = cols;
auto dataspace = H5Screate_simple(2, dims, NULL);
auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
assert(err == 0);
H5Dclose(dataset);
H5Sclose(dataspace);
};

/* write train dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim / 32, xb);
}

/* write test dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim / 32, xq);
}

/* write ground-truth radius */
@@ -431,63 +419,6 @@ class Benchmark_hdf5 : public Benchmark_base {
H5Fclose(file);
}

// For binary vector, dim should be divided by 32, since we use int32 to store binary vector data */
// Write HDF5 file with following dataset:
// HDF5_DATASET_RADIUS - H5T_NATIVE_FLOAT, [1, nq]
// HDF5_DATASET_LIMS - H5T_NATIVE_INT32, [1, nq+1]
// HDF5_DATASET_NEIGHBORS - H5T_NATIVE_INT32, [1, lims[nq]]
// HDF5_DATASET_DISTANCES - H5T_NATIVE_FLOAT, [1, lims[nq]]
template <bool is_binary>
void
hdf5_write_range(const char* file_name, const int32_t dim, const void* xb, const int32_t nb, const void* xq,
const int32_t nq, const float* g_radius, const void* g_lims, const void* g_ids,
const void* g_dist) {
/* Open the file and the dataset. */
hid_t file = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

auto write_hdf5_dataset = [](hid_t file, const char* dataset_name, hid_t type_id, int32_t rows, int32_t cols,
const void* data) {
hsize_t dims[2];
dims[0] = rows;
dims[1] = cols;
auto dataspace = H5Screate_simple(2, dims, NULL);
auto dataset = H5Dcreate2(file, dataset_name, type_id, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
auto err = H5Dwrite(dataset, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
assert(err == 0);
H5Dclose(dataset);
H5Sclose(dataspace);
};

/* write train dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_FLOAT, nb, dim, xb);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TRAIN, H5T_NATIVE_INT32, nb, dim, xb);
}

/* write test dataset */
if (!is_binary) {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_FLOAT, nq, dim, xq);
} else {
write_hdf5_dataset(file, HDF5_DATASET_TEST, H5T_NATIVE_INT32, nq, dim, xq);
}

/* write ground-truth radius */
write_hdf5_dataset(file, HDF5_DATASET_RADIUS, H5T_NATIVE_FLOAT, 1, nq, g_radius);

/* write ground-truth lims dataset */
write_hdf5_dataset(file, HDF5_DATASET_LIMS, H5T_NATIVE_INT32, 1, nq + 1, g_lims);

/* write ground-truth labels dataset */
write_hdf5_dataset(file, HDF5_DATASET_NEIGHBORS, H5T_NATIVE_INT32, 1, ((int32_t*)g_lims)[nq], g_ids);

/* write ground-truth distance dataset */
write_hdf5_dataset(file, HDF5_DATASET_DISTANCES, H5T_NATIVE_FLOAT, 1, ((int32_t*)g_lims)[nq], g_dist);

/* Close/release resources. */
H5Fclose(file);
}

protected:
std::string ann_test_name_ = "";
std::string metric_str_;
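
The substantive change in benchmark_hdf5.h, besides hoisting the write_hdf5_dataset lambda into a shared member helper, is that binary train/test datasets are now written with dim / 32 columns instead of dim, matching the int32 packing described in the comment above; with dim columns, H5Dwrite would have been asked for 32x more data than the packed buffer holds. A standalone sketch (not part of the commit) of that packing assumption:

// Standalone sketch, not from the commit: a binary vector of `dim` bits is
// packed into dim / 32 int32 words, so its HDF5 dataset gets dim / 32 columns.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    const int32_t dim = 1024;                  // bits per binary vector
    const int32_t cols = dim / 32;             // int32 words per vector
    std::vector<int32_t> vec(cols, 0);         // packed storage for one vector

    const int32_t bit = 777;                   // set an arbitrary bit
    vec[bit / 32] |= int32_t(1) << (bit % 32);

    assert(static_cast<int32_t>(vec.size()) == cols);
    return 0;
}
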
178 changes: 178 additions & 0 deletions benchmark/hdf5/gen_hdf5_file.cpp
@@ -0,0 +1,178 @@
// Copyright (C) 2019-2023 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.

#include <gtest/gtest.h>

#include <algorithm>
#include <random>
#include <vector>

#include "benchmark_hdf5.h"
#include "knowhere/comp/brute_force.h"
#include "knowhere/comp/index_param.h"
#include "knowhere/comp/knowhere_config.h"
#include "knowhere/dataset.h"

knowhere::DataSetPtr
GenDataSet(int rows, int dim) {
std::mt19937 rng(42);
std::uniform_real_distribution<> distrib(-1.0, 1.0);
float* ts = new float[rows * dim];
for (int i = 0; i < rows * dim; ++i) {
ts[i] = (float)distrib(rng);
}
auto ds = knowhere::GenDataSet(rows, dim, ts);
ds->SetIsOwner(true);
return ds;
}

knowhere::DataSetPtr
GenBinDataSet(int rows, int dim) {
std::mt19937 rng(42);
std::uniform_int_distribution<> distrib(0, 255);
int uint8_num = dim / 8;
uint8_t* ts = new uint8_t[rows * uint8_num];
for (int i = 0; i < rows * uint8_num; ++i) {
ts[i] = (uint8_t)distrib(rng);
}
auto ds = knowhere::GenDataSet(rows, dim, ts);
ds->SetIsOwner(true);
return ds;
}

class Create_HDF5 : public Benchmark_hdf5, public ::testing::Test {
protected:
void
SetUp() override {
}

void
TearDown() override {
}

template <bool is_binary>
void
create_hdf5_file(const knowhere::MetricType& metric_type, const int64_t nb, const int64_t nq, const int64_t dim,
const int64_t topk) {
std::string metric_str = metric_type;
transform(metric_str.begin(), metric_str.end(), metric_str.begin(), ::tolower);
std::string fn = "rand-" + std::to_string(dim) + "-" + metric_str + ".hdf5";

knowhere::Json json;
json[knowhere::meta::DIM] = dim;
json[knowhere::meta::METRIC_TYPE] = metric_type;
json[knowhere::meta::TOPK] = topk;

knowhere::DataSetPtr xb_ds, xq_ds;
if (is_binary) {
xb_ds = GenBinDataSet(nb, dim);
xq_ds = GenBinDataSet(nq, dim);
} else {
xb_ds = GenDataSet(nb, dim);
xq_ds = GenDataSet(nq, dim);
}

auto result = knowhere::BruteForce::Search(xb_ds, xq_ds, json, nullptr);
assert(result.has_value());

// convert golden_ids to int32
auto elem_cnt = nq * topk;
std::vector<int32_t> gt_ids_int(elem_cnt);
for (int32_t i = 0; i < elem_cnt; i++) {
gt_ids_int[i] = result.value()->GetIds()[i];
}

hdf5_write<is_binary>(fn.c_str(), dim, topk, xb_ds->GetTensor(), nb, xq_ds->GetTensor(), nq, gt_ids_int.data(),
result.value()->GetDistance());
}

template <bool is_binary>
void
create_range_hdf5_file(const knowhere::MetricType& metric_type, const int64_t nb, const int64_t nq,
const int64_t dim, const float radius) {
std::string metric_str = metric_type;
transform(metric_str.begin(), metric_str.end(), metric_str.begin(), ::tolower);
std::string fn = "rand-" + std::to_string(dim) + "-" + metric_str + "-range.hdf5";

knowhere::Json json;
json[knowhere::meta::DIM] = dim;
json[knowhere::meta::METRIC_TYPE] = metric_type;
json[knowhere::meta::RADIUS] = radius;

knowhere::DataSetPtr xb_ds, xq_ds;
if (is_binary) {
xb_ds = GenBinDataSet(nb, dim);
xq_ds = GenBinDataSet(nq, dim);
} else {
xb_ds = GenDataSet(nb, dim);
xq_ds = GenDataSet(nq, dim);
}

auto result = knowhere::BruteForce::RangeSearch(xb_ds, xq_ds, json, nullptr);
assert(result.has_value());

// convert golden_lims to int32
std::vector<int32_t> gt_lims_int(nq + 1);
for (int32_t i = 0; i <= nq; i++) {
gt_lims_int[i] = result.value()->GetLims()[i];
}

// convert golden_ids to int32
auto elem_cnt = result.value()->GetLims()[nq];
std::vector<int32_t> gt_ids_int(elem_cnt);
for (int32_t i = 0; i < elem_cnt; i++) {
gt_ids_int[i] = result.value()->GetIds()[i];
}

hdf5_write_range<is_binary>(fn.c_str(), dim, xb_ds->GetTensor(), nb, xq_ds->GetTensor(), nq, radius,
gt_lims_int.data(), gt_ids_int.data(), result.value()->GetDistance());
}
};

TEST_F(Create_HDF5, CREATE_FLOAT) {
int64_t nb = 10000;
int64_t nq = 100;
int64_t dim = 128;
int64_t topk = 100;

create_hdf5_file<false>(knowhere::metric::L2, nb, nq, dim, topk);
create_hdf5_file<false>(knowhere::metric::IP, nb, nq, dim, topk);
create_hdf5_file<false>(knowhere::metric::COSINE, nb, nq, dim, topk);
}

TEST_F(Create_HDF5, CREATE_FLOAT_RANGE) {
int64_t nb = 10000;
int64_t nq = 100;
int64_t dim = 128;

create_range_hdf5_file<false>(knowhere::metric::L2, nb, nq, dim, 65.0);
create_range_hdf5_file<false>(knowhere::metric::IP, nb, nq, dim, 8.7);
create_range_hdf5_file<false>(knowhere::metric::COSINE, nb, nq, dim, 0.2);
}

TEST_F(Create_HDF5, CREATE_BINARY) {
int64_t nb = 10000;
int64_t nq = 100;
int64_t dim = 1024;
int64_t topk = 100;

create_hdf5_file<true>(knowhere::metric::HAMMING, nb, nq, dim, topk);
create_hdf5_file<true>(knowhere::metric::JACCARD, nb, nq, dim, topk);
}

TEST_F(Create_HDF5, CREATE_BINARY_RANGE) {
int64_t nb = 10000;
int64_t nq = 100;
int64_t dim = 1024;

create_range_hdf5_file<true>(knowhere::metric::HAMMING, nb, nq, dim, 476);
create_range_hdf5_file<true>(knowhere::metric::JACCARD, nb, nq, dim, 0.63);
}
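
The new generator is registered in benchmark/CMakeLists.txt above via benchmark_test(gen_hdf5_file hdf5/gen_hdf5_file.cpp). Assuming that macro names the executable after its first argument (it is defined earlier in that file and not shown in this diff), individual datasets can be produced with the standard gtest filter flag, e.g.:

./gen_hdf5_file --gtest_filter=Create_HDF5.CREATE_FLOAT        # writes rand-<dim>-<metric>.hdf5 per create_hdf5_file's fn pattern
./gen_hdf5_file --gtest_filter=Create_HDF5.CREATE_BINARY_RANGE # writes rand-<dim>-<metric>-range.hdf5 per create_range_hdf5_file
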
1 change: 1 addition & 0 deletions include/knowhere/comp/index_param.h
@@ -75,6 +75,7 @@ namespace indexparam {
// IVF Params
constexpr const char* NPROBE = "nprobe";
constexpr const char* NLIST = "nlist";
constexpr const char* USE_ELKAN = "use_elkan";
constexpr const char* NBITS = "nbits"; // PQ/SQ
constexpr const char* M = "m"; // PQ param for IVFPQ
constexpr const char* SSIZE = "ssize";
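
The only change here is the new IVF key use_elkan, which src/common/config.cc below also whitelists as a legal JSON key. A hedged sketch of how such a knob would flow through the knowhere::Json configs used elsewhere in this commit; the boolean value type, the knowhere::indexparam namespace path, and the Elkan k-means semantics are assumptions, not confirmed by this diff:

// Hypothetical usage sketch, not from the commit.
knowhere::Json json;
json[knowhere::meta::DIM] = 128;
json[knowhere::meta::METRIC_TYPE] = knowhere::metric::L2;
json[knowhere::indexparam::NLIST] = 1024;
json[knowhere::indexparam::USE_ELKAN] = true;  // assumed boolean: toggle Elkan k-means during IVF training
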
8 changes: 8 additions & 0 deletions python/knowhere/__init__.py
@@ -87,3 +87,11 @@ def GetVectorDataSetToArray(ans):
data = np.zeros([rows, dim]).astype(np.float32)
swigknowhere.DataSetTensor2Array(ans, data)
return data


def GetBinaryVectorDataSetToArray(ans):
dim = int(swigknowhere.DataSet_Dim(ans) / 32)
rows = swigknowhere.DataSet_Rows(ans)
data = np.zeros([rows, dim]).astype(np.int32)
swigknowhere.BinaryDataSetTensor2Array(ans, data)
return data
12 changes: 12 additions & 0 deletions python/knowhere/knowhere.i
@@ -69,6 +69,7 @@ import_array();
%apply (float* INPLACE_ARRAY2, int DIM1, int DIM2){(float *dis,int nq_1,int k_1)}
%apply (int *INPLACE_ARRAY2, int DIM1, int DIM2){(int *ids,int nq_2,int k_2)}
%apply (float* INPLACE_ARRAY2, int DIM1, int DIM2){(float *data,int rows,int dim)}
%apply (int32_t *INPLACE_ARRAY2, int DIM1, int DIM2){(int32_t *data,int rows,int dim)}

%typemap(in, numinputs=0) knowhere::Status& status(knowhere::Status tmp) %{
$1 = &tmp;
@@ -329,6 +330,17 @@ DataSetTensor2Array(knowhere::DataSetPtr result, float* data, int rows, int dim)
}
}

void
BinaryDataSetTensor2Array(knowhere::DataSetPtr result, int32_t* data, int rows, int dim) {
GILReleaser rel;
auto data_ = result->GetTensor();
for (int i = 0; i < rows; i++) {
for (int j = 0; j < dim; ++j) {
*(data + i * dim + j) = *((int32_t*)(data_) + i * dim + j);
}
}
}

void
DumpRangeResultIds(knowhere::DataSetPtr result, int* ids, int len) {
GILReleaser rel;
1 change: 1 addition & 0 deletions src/common/config.cc
@@ -18,6 +18,7 @@ static const std::unordered_set<std::string> ext_legal_json_keys = {"metric_type",
"dim",
"nlist", // IVF param
"nprobe", // IVF param
"use_elkan", // IVF param
"ssize", // IVF_FLAT_CC param
"nbits", // IVF_PQ param
"m", // IVF_PQ param