diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9235e6e92..329c5c0b6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,9 @@ include(cmake/libs/libhnsw.cmake)
 
 include_directories(thirdparty/faiss)
 
+find_package(Boost REQUIRED)
+include_directories(${Boost_INCLUDE_DIRS})
+
 find_package(OpenMP REQUIRED)
 
 find_package(folly REQUIRED)
@@ -177,6 +180,7 @@ endif()
 
 include_directories(src)
 include_directories(include)
+list(APPEND KNOWHERE_LINKER_LIBS Boost::boost)
 list(APPEND KNOWHERE_LINKER_LIBS faiss)
 list(APPEND KNOWHERE_LINKER_LIBS glog::glog)
 list(APPEND KNOWHERE_LINKER_LIBS nlohmann_json::nlohmann_json)
diff --git a/include/knowhere/sparse_utils.h b/include/knowhere/sparse_utils.h
index ba582a5a2..56fbfc70c 100644
--- a/include/knowhere/sparse_utils.h
+++ b/include/knowhere/sparse_utils.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <algorithm>
+#include <boost/iterator/iterator_facade.hpp>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
@@ -59,6 +60,31 @@ GetDocValueBM25Computer(float k1, float b, float avgdl) {
     };
 }
 
+class DocIdFilterByVector {
+ public:
+    DocIdFilterByVector(std::vector<table_t>&& docids) : docids_(std::move(docids)) {
+        std::sort(docids_.begin(), docids_.end());
+    }
+
+    [[nodiscard]] bool
+    test(const table_t id) {
+        // find the first id that is greater than or equal to the specific id
+        while (pos_ < docids_.size() && docids_[pos_] < id) {
+            ++pos_;
+        }
+        return !(pos_ < docids_.size() && docids_[pos_] == id);
+    }
+
+    [[nodiscard]] bool
+    empty() const {
+        return docids_.empty();
+    }
+
+ private:
+    std::vector<table_t> docids_;
+    size_t pos_ = 0;
+};
+
 template <typename T>
 class SparseRow {
     static_assert(std::is_same_v<T, fp32>, "SparseRow supports float only");
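The new DocIdFilterByVector mirrors the implicit interface of knowhere's BitsetView (empty() plus test(id), where test() returning true means "skip this doc"), but with the opposite intent: it skips every doc except the listed ones. Because test() only ever advances pos_, ids must be probed in non-decreasing order, which holds for the sorted posting lists it is applied to. A minimal sketch, assuming table_t lives in knowhere::sparse and the usual header path (not part of the patch):

    #include <cassert>
    #include <vector>

    #include "knowhere/sparse_utils.h"

    int main() {
        using knowhere::sparse::DocIdFilterByVector;
        using knowhere::sparse::table_t;
        // Keep only docs 3 and 7; the constructor sorts, so input order is free.
        DocIdFilterByVector filter(std::vector<table_t>{7, 3});
        assert(filter.test(2));   // true: 2 is not in the set -> skipped
        assert(!filter.test(3));  // false: 3 passes the filter
        assert(!filter.test(7));  // false: 7 passes the filter
        // pos_ never moves backwards, so probing ids out of order is invalid.
        return 0;
    }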
@@ -72,6 +98,15 @@ class SparseRow {
     SparseRow(size_t count, uint8_t* data, bool own_data) : data_(data), count_(count), own_data_(own_data) {
     }
 
+    SparseRow(const std::vector<std::pair<table_t, T>>& data) : count_(data.size()), own_data_(true) {
+        data_ = new uint8_t[count_ * element_size()];
+        for (size_t i = 0; i < count_; ++i) {
+            auto* elem = reinterpret_cast<ElementProxy*>(data_) + i;
+            elem->index = data[i].first;
+            elem->value = data[i].second;
+        }
+    }
+
     // copy constructor and copy assignment operator perform deep copy
     SparseRow(const SparseRow<T>& other) : SparseRow(other.count_) {
         std::memcpy(data_, other.data_, data_byte_size());
@@ -147,6 +182,9 @@ class SparseRow {
 
     void
     set_at(size_t i, table_t index, T value) {
+        if (i >= count_) {
+            throw std::out_of_range("set_at on a SparseRow with invalid index");
+        }
         auto* elem = reinterpret_cast<ElementProxy*>(data_) + i;
         elem->index = index;
         elem->value = value;
@@ -300,12 +338,12 @@ class GrowableVectorView {
         mmap_element_count_ = 0;
     }
 
-    size_type
+    [[nodiscard]] size_type
     capacity() const {
         return mmap_byte_size_ / sizeof(T);
     }
 
-    size_type
+    [[nodiscard]] size_type
     size() const {
         return mmap_element_count_;
     }
@@ -346,6 +384,59 @@ class GrowableVectorView {
         return reinterpret_cast<const T*>(mmap_data_)[i];
     }
 
+    class iterator : public boost::iterator_facade<iterator, T, boost::random_access_traversal_tag> {
+     public:
+        iterator() = default;
+        explicit iterator(T* ptr) : ptr_(ptr) {
+        }
+
+        friend class GrowableVectorView;
+        friend class boost::iterator_core_access;
+
+        T&
+        dereference() const {
+            return *ptr_;
+        }
+
+        void
+        increment() {
+            ++ptr_;
+        }
+
+        void
+        decrement() {
+            --ptr_;
+        }
+
+        void
+        advance(std::ptrdiff_t n) {
+            ptr_ += n;
+        }
+
+        std::ptrdiff_t
+        distance_to(const iterator& other) const {
+            return other.ptr_ - ptr_;
+        }
+
+        bool
+        equal(const iterator& other) const {
+            return ptr_ == other.ptr_;
+        }
+
+     private:
+        T* ptr_ = nullptr;
+    };
+
+    iterator
+    begin() const {
+        return iterator(reinterpret_cast<T*>(mmap_data_));
+    }
+
+    iterator
+    end() const {
+        return iterator(reinterpret_cast<T*>(mmap_data_) + mmap_element_count_);
+    }
+
  private:
     void* mmap_data_ = nullptr;
     size_type mmap_byte_size_ = 0;
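GrowableVectorView gains begin()/end() via boost::iterator_facade with random-access traversal, so mmap-backed posting lists accept the same standard algorithms as std::vector ones; GetRawDistance further down relies on exactly this to binary-search either container. A sketch of the pattern (the generic helper is hypothetical, not part of the patch):

    #include <algorithm>

    #include "knowhere/sparse_utils.h"

    // LUT may be a std::vector<SparseIdVal<T>> or the mmap-backed
    // GrowableVectorView<SparseIdVal<T>>; both now expose random-access iterators.
    template <typename LUT>
    bool
    contains_doc(const LUT& lut, knowhere::sparse::table_t vec_id) {
        // posting lists are sorted by ascending doc id, so lower_bound applies
        auto it = std::lower_bound(lut.begin(), lut.end(), vec_id,
                                   [](const auto& x, knowhere::sparse::table_t y) { return x.id < y; });
        return it != lut.end() && it->id == vec_id;
    }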
diff --git a/python/setup.py b/python/setup.py
index 8893901fe..8a0f32140 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -51,6 +51,7 @@ def get_readme():
         get_numpy_include(),
         os.path.join("..", "include"),
         os.path.join("..", "thirdparty"),
+        get_thirdparty_prefix("boost-headers") + "/include",
         get_thirdparty_prefix("nlohmann_json") + "/include",
         get_thirdparty_prefix("libglog") + "/include",
         get_thirdparty_prefix("gflags") + "/include"
diff --git a/src/index/sparse/sparse_index_node.cc b/src/index/sparse/sparse_index_node.cc
index 7e2e3e4b5..61cf373c9 100644
--- a/src/index/sparse/sparse_index_node.cc
+++ b/src/index/sparse/sparse_index_node.cc
@@ -54,14 +54,12 @@ class SparseInvertedIndexNode : public IndexNode {
             LOG_KNOWHERE_ERROR_ << Type() << " only support metric_type IP or BM25";
             return Status::invalid_metric_type;
         }
-        auto drop_ratio_build = cfg.drop_ratio_build.value_or(0.0f);
         auto index_or = CreateIndex(cfg);
         if (!index_or.has_value()) {
             return index_or.error();
         }
         auto index = index_or.value();
-        index->Train(static_cast<const sparse::SparseRow<T>*>(dataset->GetTensor()), dataset->GetRows(),
-                     drop_ratio_build);
+        index->Train(static_cast<const sparse::SparseRow<T>*>(dataset->GetTensor()), dataset->GetRows());
         if (index_ != nullptr) {
             LOG_KNOWHERE_WARNING_ << Type() << " has already been created, deleting old";
             DeleteExistingIndex();
@@ -209,37 +207,17 @@ class SparseInvertedIndexNode : public IndexNode {
 
     [[nodiscard]] expected<DataSetPtr>
     GetVectorByIds(const DataSetPtr dataset) const override {
-        if (!index_) {
-            return expected<DataSetPtr>::Err(Status::empty_index, "index not loaded");
-        }
-
-        auto rows = dataset->GetRows();
-        auto ids = dataset->GetIds();
-
-        auto data = std::make_unique<sparse::SparseRow<T>[]>(rows);
-        int64_t dim = 0;
-        try {
-            for (int64_t i = 0; i < rows; ++i) {
-                auto& target = data[i];
-                index_->GetVectorById(ids[i], target);
-                dim = std::max(dim, target.dim());
-            }
-        } catch (std::exception& e) {
-            return expected<DataSetPtr>::Err(Status::invalid_args, "GetVectorByIds failed");
-        }
-        auto res = GenResultDataSet(rows, dim, data.release());
-        res->SetIsSparse(true);
-        return res;
+        return expected<DataSetPtr>::Err(Status::not_implemented, "GetVectorByIds not implemented");
     }
 
     [[nodiscard]] bool
     HasRawData(const std::string& metric_type) const override {
-        return true;
+        return false;
     }
 
     [[nodiscard]] expected<DataSetPtr>
     GetIndexMeta(std::unique_ptr<Config> cfg) const override {
-        throw std::runtime_error("GetIndexMeta not supported for current index type");
+        return expected<DataSetPtr>::Err(Status::not_implemented, "GetIndexMeta not supported for current index type");
     }
 
     Status
@@ -284,19 +262,19 @@ class SparseInvertedIndexNode : public IndexNode {
         }
         auto cfg = static_cast<const SparseInvertedIndexConfig&>(*config);
         auto reader = knowhere::FileReader(filename);
-        map_size_ = reader.size();
+        size_t map_size = reader.size();
         int map_flags = MAP_SHARED;
 #ifdef MAP_POPULATE
         if (cfg.enable_mmap_pop.has_value() && cfg.enable_mmap_pop.value()) {
             map_flags |= MAP_POPULATE;
         }
 #endif
-        map_ = static_cast<char*>(mmap(nullptr, map_size_, PROT_READ, map_flags, reader.descriptor(), 0));
-        if (map_ == MAP_FAILED) {
+        char* map = static_cast<char*>(mmap(nullptr, map_size, PROT_READ, map_flags, reader.descriptor(), 0));
+        if (map == MAP_FAILED) {
             LOG_KNOWHERE_ERROR_ << "Failed to mmap file: " << strerror(errno);
             return Status::disk_file_error;
         }
-        if (madvise(map_, map_size_, MADV_RANDOM) != 0) {
+        if (madvise(map, map_size, MADV_RANDOM) != 0) {
             LOG_KNOWHERE_WARNING_ << "Failed to madvise file: " << strerror(errno);
         }
         auto index_or = CreateIndex(cfg);
@@ -304,9 +282,13 @@ class SparseInvertedIndexNode : public IndexNode {
             return index_or.error();
         }
         index_ = index_or.value();
-        MemoryIOReader map_reader((uint8_t*)map_, map_size_);
+        MemoryIOReader map_reader((uint8_t*)map, map_size);
         auto supplement_target_filename = filename + ".knowhere_sparse_index_supplement";
-        return index_->Load(map_reader, map_flags, supplement_target_filename);
+        auto status = index_->Load(map_reader, map_flags, supplement_target_filename);
+        if (munmap(map, map_size) != 0) {
+            LOG_KNOWHERE_ERROR_ << "Failed to munmap when trying to delete index: " << strerror(errno);
+        }
+        return status;
     }
 
     static std::unique_ptr<BaseConfig>
@@ -367,23 +349,11 @@ class SparseInvertedIndexNode : public IndexNode {
             delete index_;
             index_ = nullptr;
         }
-        if (map_ != nullptr) {
-            auto res = munmap(map_, map_size_);
-            if (res != 0) {
-                LOG_KNOWHERE_ERROR_ << "Failed to munmap when trying to delete index: " << strerror(errno);
-            }
-            map_ = nullptr;
-            map_size_ = 0;
-        }
     }
 
     sparse::BaseInvertedIndex<T>* index_{};
     std::shared_ptr<ThreadPool> search_pool_;
     std::shared_ptr<ThreadPool> build_pool_;
-
-    // if map_ is not nullptr, it means the index is mmapped from disk.
-    char* map_ = nullptr;
-    size_t map_size_ = 0;
 };  // class SparseInvertedIndexNode
 
 // Concurrent version of SparseInvertedIndexNode
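Since the index no longer keeps raw rows, HasRawData() is now false and GetVectorByIds() fails with Status::not_implemented; the mmap'd file can also be unmapped as soon as Load() has copied or re-mapped what it needs, which is why the map_/map_size_ members disappear. Callers that used to read vectors back out of this index should branch on HasRawData() first (sketch only; variable names are illustrative):

    // idx: a built knowhere index handle; ids_ds: a dataset holding the requested ids.
    if (idx.HasRawData(metric)) {
        auto results = idx.GetVectorByIds(ids_ds);
        // ... consume the returned rows ...
    } else {
        // Sparse inverted indexes no longer store raw rows; fetch them from
        // the upstream data store instead of the index.
    }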
diff --git a/src/index/sparse/sparse_inverted_index.h b/src/index/sparse/sparse_inverted_index.h
index 06665894a..c696fd1dc 100644
--- a/src/index/sparse/sparse_inverted_index.h
+++ b/src/index/sparse/sparse_inverted_index.h
@@ -20,7 +20,6 @@
 #include <algorithm>
 #include <cmath>
 #include <memory>
-#include <queue>
 #include <unordered_map>
 #include <vector>
@@ -47,7 +46,7 @@ class BaseInvertedIndex {
     Load(MemoryIOReader& reader, int map_flags = MAP_SHARED, const std::string& supplement_target_filename = "") = 0;
 
     virtual Status
-    Train(const SparseRow<T>* data, size_t rows, float drop_ratio_build) = 0;
+    Train(const SparseRow<T>* data, size_t rows) = 0;
 
     virtual Status
     Add(const SparseRow<T>* data, size_t rows, int64_t dim) = 0;
@@ -61,10 +60,7 @@ class BaseInvertedIndex {
                     const DocValueComputer<T>& computer) const = 0;
 
     virtual float
-    GetRawDistance(const label_t id, const SparseRow<T>& query, const DocValueComputer<T>& computer) const = 0;
-
-    virtual void
-    GetVectorById(const label_t id, SparseRow<T>& output) const = 0;
+    GetRawDistance(const label_t vec_id, const SparseRow<T>& query, const DocValueComputer<T>& computer) const = 0;
 
     virtual expected<DocValueComputer<T>>
     GetDocValueComputer(const SparseInvertedIndexConfig& cfg) const = 0;
@@ -156,7 +152,7 @@ class InvertedIndex : public BaseInvertedIndex<T> {
      *
      * 1. size_t rows
      * 2. size_t cols
-     * 3. T value_threshold_
+     * 3. T value_threshold_ (deprecated)
      * 4. for each row:
      *     1. size_t len
      *     2. for each non-zero value:
@@ -168,23 +164,42 @@ class InvertedIndex : public BaseInvertedIndex<T> {
      *
      * Data are densely packed in serialized bytes and no padding is added.
      */
-        writeBinaryPOD(writer, n_rows_internal());
-        writeBinaryPOD(writer, n_cols_internal());
-        writeBinaryPOD(writer, value_threshold_);
-        for (size_t i = 0; i < n_rows_internal(); ++i) {
-            auto& row = raw_data_[i];
-            writeBinaryPOD(writer, row.size());
-            if (row.size() == 0) {
+        T deprecated_value_threshold = 0;
+        writeBinaryPOD(writer, n_rows_internal_);
+        writeBinaryPOD(writer, max_dim_);
+        writeBinaryPOD(writer, deprecated_value_threshold);
+        BitsetView bitset(nullptr, 0);
+
+        std::vector<Cursor<Vector<SparseIdVal<T>>, BitsetView>> cursors;
+
+        for (size_t i = 0; i < inverted_lut_.size(); ++i) {
+            cursors.emplace_back(inverted_lut_[i], n_rows_internal_, 0, 0, bitset);
+        }
+
+        for (table_t vec_id = 0; vec_id < n_rows_internal_; ++vec_id) {
+            std::vector<std::pair<table_t, T>> vec_row;
+            for (size_t i = 0; i < inverted_lut_.size(); ++i) {
+                if (cursors[i].cur_vec_id_ == vec_id) {
+                    vec_row.emplace_back(dim_map_reverse_[i], cursors[i].cur_vec_val());
+                    cursors[i].next();
+                }
+            }
+
+            SparseRow<T> raw_row(vec_row);
+            writeBinaryPOD(writer, raw_row.size());
+            if (raw_row.size() == 0) {
                 continue;
             }
-            writer.write(row.data(), row.size() * SparseRow<T>::element_size());
+            writer.write(raw_row.data(), raw_row.size() * SparseRow<T>::element_size());
         }
+
         return Status::success;
     }
 
     Status
     Load(MemoryIOReader& reader, int map_flags = MAP_SHARED,
          const std::string& supplement_target_filename = "") override {
+        T deprecated_value_threshold;
         int64_t rows;
         readBinaryPOD(reader, rows);
         // previous versions used the signness of rows to indicate whether to
         // use wand. now we simply take the absolute value of rows.
         rows = std::abs(rows);
         readBinaryPOD(reader, max_dim_);
-        readBinaryPOD(reader, value_threshold_);
-        if (value_threshold_ > 0) {
-            drop_during_build_ = true;
-        }
+        readBinaryPOD(reader, deprecated_value_threshold);
 
         if constexpr (mmapped) {
             RETURN_IF_ERROR(PrepareMmap(reader, rows, map_flags, supplement_target_filename));
         } else {
-            raw_data_.reserve(rows);
             if constexpr (bm25) {
                 bm25_params_->row_sums.reserve(rows);
             }
         }
 
         for (int64_t i = 0; i < rows; ++i) {
             size_t count;
             readBinaryPOD(reader, count);
+            SparseRow<T> raw_row;
             if constexpr (mmapped) {
-                raw_data_.emplace_back(count, reader.data() + reader.tellg(), false);
+                raw_row = std::move(SparseRow<T>(count, reader.data() + reader.tellg(), false));
                 reader.advance(count * SparseRow<T>::element_size());
             } else {
-                raw_data_.emplace_back(count);
+                raw_row = std::move(SparseRow<T>(count));
                 if (count > 0) {
-                    reader.read(raw_data_[i].data(), count * SparseRow<T>::element_size());
+                    reader.read(raw_row.data(), count * SparseRow<T>::element_size());
                 }
             }
-            add_row_to_index(raw_data_[i], i);
+            add_row_to_index(raw_row, i);
         }
+
+        n_rows_internal_ = rows;
+
         return Status::success;
     }
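Serialize() can no longer dump raw_data_, so it rebuilds every row on the fly: one Cursor per posting list, all swept in lockstep over vec_id, with the new dim_map_reverse_ translating internal dim ids back to the original dimensions. The on-disk layout is unchanged (value_threshold_ is written as a constant 0 placeholder), so index files remain compatible across versions. A tiny worked example with made-up data:

    // Illustrative data, not from the patch:
    //   row 0 = {dim 2: 0.5, dim 7: 1.0}
    //   row 1 = {dim 7: 2.0}
    // After Add(), with dim_map_ = {2 -> 0, 7 -> 1}:
    //   inverted_lut_[0] = [(0, 0.5)]            // postings of dim 2
    //   inverted_lut_[1] = [(0, 1.0), (1, 2.0)]  // postings of dim 7
    // Save() sweeps vec_id = 0 .. n_rows_internal_ - 1: every cursor whose
    // cur_vec_id_ equals vec_id contributes (dim_map_reverse_[i], cur_vec_val())
    // and advances, re-assembling row 0 as {2: 0.5, 7: 1.0} and row 1 as {7: 2.0}.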
@@ -248,16 +263,12 @@ class InvertedIndex : public BaseInvertedIndex<T> {
 
         // reset reader to the beginning
         reader.seekg(initial_reader_location);
 
-        auto raw_data_byte_size = rows * sizeof(typename decltype(raw_data_)::value_type);
         auto inverted_lut_byte_size = idx_counts.size() * sizeof(typename decltype(inverted_lut_)::value_type);
-        // actually due to drop_ratio_build, the number of non-zero values that will be added to the luts is
-        // less than nnz. but since the memory is mmapped, it is ok to still allocate some extra space for those
-        // dropped values.
         auto luts_byte_size = nnz * sizeof(typename decltype(inverted_lut_)::value_type::value_type);
         auto max_score_in_dim_byte_size = idx_counts.size() * sizeof(typename decltype(max_score_in_dim_)::value_type);
         size_t row_sums_byte_size = 0;
 
-        map_byte_size_ = raw_data_byte_size + inverted_lut_byte_size + luts_byte_size;
+        map_byte_size_ = inverted_lut_byte_size + luts_byte_size;
         if constexpr (use_wand) {
             map_byte_size_ += max_score_in_dim_byte_size;
         }
@@ -302,8 +313,6 @@ class InvertedIndex : public BaseInvertedIndex<T> {
         char* ptr = map_;
 
         // initialize containers memory.
-        raw_data_.initialize(ptr, raw_data_byte_size);
-        ptr += raw_data_byte_size;
         inverted_lut_.initialize(ptr, inverted_lut_byte_size);
         ptr += inverted_lut_byte_size;
@@ -338,30 +347,10 @@ class InvertedIndex : public BaseInvertedIndex<T> {
     // Non zero drop ratio is only supported for static index, i.e. data should
     // include all rows that'll be added to the index.
     Status
-    Train(const SparseRow<T>* data, size_t rows, float drop_ratio_build) override {
+    Train(const SparseRow<T>* data, size_t rows) override {
         if constexpr (mmapped) {
             throw std::invalid_argument("mmapped InvertedIndex does not support Train");
         } else {
-            if (drop_ratio_build == 0.0f) {
-                return Status::success;
-            }
-            // TODO: maybe i += 10 to down sample to speed up.
-            size_t amount = 0;
-            for (size_t i = 0; i < rows; ++i) {
-                amount += data[i].size();
-            }
-            if (amount == 0) {
-                return Status::success;
-            }
-            std::vector<T> vals;
-            vals.reserve(amount);
-            for (size_t i = 0; i < rows; ++i) {
-                for (size_t j = 0; j < data[i].size(); ++j) {
-                    vals.push_back(fabs(data[i][j].val));
-                }
-            }
-            value_threshold_ = get_threshold(vals, drop_ratio_build);
-            drop_during_build_ = true;
             return Status::success;
         }
     }
@@ -371,22 +360,19 @@ class InvertedIndex : public BaseInvertedIndex<T> {
         if constexpr (mmapped) {
             throw std::invalid_argument("mmapped InvertedIndex does not support Add");
         } else {
-            auto current_rows = n_rows_internal();
-            if (current_rows > 0 && drop_during_build_) {
-                LOG_KNOWHERE_ERROR_ << "Not allowed to add data to a built index with drop_ratio_build > 0.";
-                return Status::invalid_args;
-            }
+            auto current_rows = n_rows_internal_;
             if ((size_t)dim > max_dim_) {
                 max_dim_ = dim;
             }
-            raw_data_.insert(raw_data_.end(), data, data + rows);
             if constexpr (bm25) {
                 bm25_params_->row_sums.reserve(current_rows + rows);
             }
             for (size_t i = 0; i < rows; ++i) {
                 add_row_to_index(data[i], current_rows + i);
             }
+            n_rows_internal_ += rows;
+
             return Status::success;
         }
     }
@@ -407,9 +393,8 @@ class InvertedIndex : public BaseInvertedIndex<T> {
         }
 
         auto q_threshold = get_threshold(values, drop_ratio_search);
-        // if no data was dropped during both build and search, no refinement is
-        // needed.
-        if (!drop_during_build_ && drop_ratio_search == 0) {
+        // if no data was dropped during search, no refinement is needed.
+        if (drop_ratio_search == 0) {
            refine_factor = 1;
         }
         MaxMinHeap<T> heap(k * refine_factor);
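With drop_ratio_build removed, Train() degenerates to a no-op and nothing is dropped at build time; the only remaining approximation is the query-side drop_ratio_search, so refinement is needed exactly when it is non-zero. Roughly (illustrative numbers): a query with values {0.9, 0.4, 0.1, 0.05} and drop_ratio_search = 0.5 ignores the smallest half {0.1, 0.05} in the first pass, collects k * refine_factor candidates, and then re-scores them against the full query in refine_and_collect below.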
@@ -439,40 +424,47 @@ class InvertedIndex : public BaseInvertedIndex<T> {
         }
         auto q_threshold = get_threshold(values, drop_ratio_search);
         auto distances = compute_all_distances(query, q_threshold, computer);
-        for (size_t i = 0; i < distances.size(); ++i) {
-            if (bitset.empty() || !bitset.test(i)) {
-                continue;
+        if (!bitset.empty()) {
+            for (size_t i = 0; i < distances.size(); ++i) {
+                if (bitset.test(i)) {
+                    distances[i] = 0.0f;
+                }
             }
-            distances[i] = 0.0f;
         }
         return distances;
     }
 
     float
-    GetRawDistance(const label_t id, const SparseRow<T>& query, const DocValueComputer<T>& computer) const override {
-        T doc_sum = bm25 ? bm25_params_->row_sums.at(id) : 0;
-        return query.dot(raw_data_[id], computer, doc_sum);
-    }
+    GetRawDistance(const label_t vec_id, const SparseRow<T>& query,
+                   const DocValueComputer<T>& computer) const override {
+        float distance = 0.0f;
 
-    void
-    GetVectorById(const label_t id, SparseRow<T>& output) const override {
-        output = raw_data_[id];
+        for (size_t i = 0; i < query.size(); ++i) {
+            auto [idx, val] = query[i];
+            auto dim_id = dim_map_.find(idx);
+            if (dim_id == dim_map_.end()) {
+                continue;
+            }
+            auto& lut = inverted_lut_[dim_id->second];
+            auto it =
+                std::lower_bound(lut.begin(), lut.end(), vec_id, [](const auto& x, table_t y) { return x.id < y; });
+            if (it != lut.end() && it->id == vec_id) {
+                distance += val * computer(it->val, bm25 ? bm25_params_->row_sums.at(vec_id) : 0);
+            }
+        }
+
+        return distance;
     }
 
     [[nodiscard]] size_t
     size() const override {
         size_t res = sizeof(*this);
-        for (size_t i = 0; i < raw_data_.size(); ++i) {
-            res += raw_data_[i].memory_usage();
-        }
         res += dim_map_.size() *
                (sizeof(typename decltype(dim_map_)::key_type) + sizeof(typename decltype(dim_map_)::mapped_type));
 
         if constexpr (mmapped) {
             return res + map_byte_size_;
         } else {
-            res += sizeof(typename decltype(raw_data_)::value_type) * raw_data_.capacity();
-            res += sizeof(typename decltype(inverted_lut_)::value_type) * inverted_lut_.capacity();
             for (size_t i = 0; i < inverted_lut_.size(); ++i) {
                 res += sizeof(typename decltype(inverted_lut_)::value_type::value_type) * inverted_lut_[i].capacity();
@@ -486,17 +478,17 @@ class InvertedIndex : public BaseInvertedIndex<T> {
 
     [[nodiscard]] size_t
     n_rows() const override {
-        return n_rows_internal();
+        return n_rows_internal_;
     }
 
     [[nodiscard]] size_t
     n_cols() const override {
-        return n_cols_internal();
+        return max_dim_;
     }
 
-    [[nodiscard]] virtual bool
+    [[nodiscard]] bool
     IsApproximated() const override {
-        return drop_during_build_;
+        return false;
     }
 
  private:
@@ -516,22 +508,12 @@ class InvertedIndex : public BaseInvertedIndex<T> {
         return *pos;
     }
 
-    size_t
-    n_rows_internal() const {
-        return raw_data_.size();
-    }
-
-    size_t
-    n_cols_internal() const {
-        return max_dim_;
-    }
-
     std::vector<float>
     compute_all_distances(const SparseRow<T>& q_vec, T q_threshold, const DocValueComputer<T>& computer) const {
-        std::vector<float> scores(n_rows_internal(), 0.0f);
+        std::vector<float> scores(n_rows_internal_, 0.0f);
         for (size_t idx = 0; idx < q_vec.size(); ++idx) {
             auto [i, v] = q_vec[idx];
-            if (v < q_threshold || i >= n_cols_internal()) {
+            if (v < q_threshold || i >= max_dim_) {
                 continue;
             }
             auto dim_id = dim_map_.find(i);
@@ -549,41 +531,27 @@ class InvertedIndex : public BaseInvertedIndex<T> {
         return scores;
     }
 
-    // find the top-k candidates using brute force search, k as specified by the capacity of the heap.
-    // any value in q_vec that is smaller than q_threshold and any value with dimension >= n_cols() will be ignored.
-    // TODO: may switch to row-wise brute force if filter rate is high. Benchmark needed.
-    void
-    search_brute_force(const SparseRow<T>& q_vec, T q_threshold, MaxMinHeap<T>& heap, const BitsetView& bitset,
-                       const DocValueComputer<T>& computer) const {
-        auto scores = compute_all_distances(q_vec, q_threshold, computer);
-        for (size_t i = 0; i < n_rows_internal(); ++i) {
-            if ((bitset.empty() || !bitset.test(i)) && scores[i] != 0) {
-                heap.push(i, scores[i]);
-            }
-        }
-    }
-
     // LUT supports size() and operator[] which returns an SparseIdVal.
-    template <typename LUT>
+    template <typename LUT, typename DocIdFilter>
     struct Cursor {
      public:
-        Cursor(const LUT& lut, size_t num_vec, float max_score, float q_value, const BitsetView bitset)
+        Cursor(const LUT& lut, size_t num_vec, float max_score, float q_value, DocIdFilter filter)
             : lut_(lut),
               lut_size_(lut.size()),
               total_num_vec_(num_vec),
               max_score_(max_score),
               q_value_(q_value),
-              bitset_(bitset) {
-            while (loc_ < lut_size_ && !bitset_.empty() && bitset_.test(lut_[loc_].id)) {
-                loc_++;
-            }
+              filter_(filter) {
+            skip_filtered_ids();
             update_cur_vec_id();
         }
 
         Cursor(const Cursor& rhs) = delete;
+        Cursor(Cursor&& rhs) noexcept = default;
 
         void
         next() {
-            next_internal();
+            ++loc_;
+            skip_filtered_ids();
             update_cur_vec_id();
         }
@@ -591,8 +559,9 @@ class InvertedIndex : public BaseInvertedIndex<T> {
 
         void
         seek(table_t vec_id) {
             while (loc_ < lut_size_ && lut_[loc_].id < vec_id) {
-                next_internal();
+                ++loc_;
             }
+            skip_filtered_ids();
             update_cur_vec_id();
         }
@@ -607,7 +576,7 @@ class InvertedIndex : public BaseInvertedIndex<T> {
         size_t total_num_vec_ = 0;
         float max_score_ = 0.0f;
         float q_value_ = 0.0f;
-        const BitsetView bitset_;
+        DocIdFilter filter_;
         table_t cur_vec_id_ = 0;
 
      private:
@@ -621,20 +590,36 @@ class InvertedIndex : public BaseInvertedIndex<T> {
         }
 
         inline void
-        next_internal() {
-            loc_++;
-            while (loc_ < lut_size_ && !bitset_.empty() && bitset_.test(lut_[loc_].id)) {
-                loc_++;
+        skip_filtered_ids() {
+            while (loc_ < lut_size_ && !filter_.empty() && filter_.test(lut_[loc_].id)) {
+                ++loc_;
             }
         }
     };  // struct Cursor
 
+    // find the top-k candidates using brute force search, k as specified by the capacity of the heap.
+    // any value in q_vec that is smaller than q_threshold and any value with dimension >= n_cols() will be ignored.
+    // TODO: may switch to row-wise brute force if filter rate is high. Benchmark needed.
+    template <typename DocIdFilter>
+    void
+    search_brute_force(const SparseRow<T>& q_vec, T q_threshold, MaxMinHeap<T>& heap, DocIdFilter& filter,
+                       const DocValueComputer<T>& computer) const {
+        auto scores = compute_all_distances(q_vec, q_threshold, computer);
+        for (size_t i = 0; i < n_rows_internal_; ++i) {
+            if ((filter.empty() || !filter.test(i)) && scores[i] != 0) {
+                heap.push(i, scores[i]);
+            }
+        }
+    }
+
     // any value in q_vec that is smaller than q_threshold will be ignored.
+    template <typename DocIdFilter>
     void
-    search_wand(const SparseRow<T>& q_vec, T q_threshold, MaxMinHeap<T>& heap, const BitsetView& bitset,
+    search_wand(const SparseRow<T>& q_vec, T q_threshold, MaxMinHeap<T>& heap, DocIdFilter& filter,
                 const DocValueComputer<T>& computer) const {
         auto q_dim = q_vec.size();
-        std::vector<std::shared_ptr<Cursor<Vector<SparseIdVal<T>>>>> cursors(q_dim);
+        std::vector<std::shared_ptr<Cursor<Vector<SparseIdVal<T>>, DocIdFilter>>> cursors(
+            q_dim);
         size_t valid_q_dim = 0;
         for (size_t i = 0; i < q_dim; ++i) {
             auto [idx, val] = q_vec[i];
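Cursor and both search paths are now templated on a DocIdFilter type instead of hard-wiring BitsetView. The filter is a duck-typed concept: anything providing empty() and test(id), with true meaning "skip this doc", works, which is how BitsetView and the new DocIdFilterByVector share one code path. A hypothetical third filter, purely to show the shape of the interface (not in the patch):

    // Sketch only: skips every even doc id. Any type with this shape can be
    // passed to the templated search_brute_force/search_wand.
    struct DropEvenDocs {
        bool
        empty() const {
            return false;  // never trivially empty, so test() is always consulted
        }
        bool
        test(knowhere::sparse::table_t id) const {
            return id % 2 == 0;  // true -> filter the doc out
        }
    };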
@@ -643,8 +628,8 @@ class InvertedIndex : public BaseInvertedIndex<T> {
                 continue;
             }
             auto& lut = inverted_lut_[dim_id->second];
-            cursors[valid_q_dim++] = std::make_shared<Cursor<Vector<SparseIdVal<T>>>>(
-                lut, n_rows_internal(), max_score_in_dim_[dim_id->second] * val, val, bitset);
+            cursors[valid_q_dim++] = std::make_shared<Cursor<Vector<SparseIdVal<T>>, DocIdFilter>>(
+                lut, n_rows_internal_, max_score_in_dim_[dim_id->second] * val, val, filter);
         }
         if (valid_q_dim == 0) {
             return;
         }
@@ -701,23 +686,22 @@ class InvertedIndex : public BaseInvertedIndex<T> {
     }
 
     void
-    refine_and_collect(const SparseRow<T>& q_vec, MaxMinHeap<T>& inaccurate, size_t k, float* distances,
+    refine_and_collect(const SparseRow<T>& q_vec, MaxMinHeap<T>& inacc_heap, size_t k, float* distances,
                        label_t* labels, const DocValueComputer<T>& computer) const {
-        std::priority_queue<SparseIdVal<T>, std::vector<SparseIdVal<T>>, std::greater<SparseIdVal<T>>> heap;
+        MaxMinHeap<T> heap(k);
+        std::vector<table_t> docids;
 
-        while (!inaccurate.empty()) {
-            auto [u, d] = inaccurate.top();
-            inaccurate.pop();
-
-            T u_sum = bm25 ? bm25_params_->row_sums.at(u) : 0;
-
-            auto dist_acc = q_vec.dot(raw_data_[u], computer, u_sum);
-            if (heap.size() < k) {
-                heap.emplace(u, dist_acc);
-            } else if (heap.top().val < dist_acc) {
-                heap.pop();
-                heap.emplace(u, dist_acc);
-            }
+        while (!inacc_heap.empty()) {
+            auto [u, d] = inacc_heap.top();
+            docids.push_back(u);
+            inacc_heap.pop();
+        }
+
+        DocIdFilterByVector filter(std::move(docids));
+        if (use_wand) {
+            search_wand(q_vec, 0, heap, filter, computer);
+        } else {
+            search_brute_force(q_vec, 0, heap, filter, computer);
         }
         collect_result(heap, distances, labels);
     }
@@ -734,7 +718,7 @@ class InvertedIndex : public BaseInvertedIndex<T> {
     }
 
     inline void
-    add_row_to_index(const SparseRow<T>& row, table_t id) {
+    add_row_to_index(const SparseRow<T>& row, table_t vec_id) {
         [[maybe_unused]] T row_sum = 0;
         for (size_t j = 0; j < row.size(); ++j) {
             auto [idx, val] = row[j];
@@ -743,7 +727,7 @@ class InvertedIndex : public BaseInvertedIndex<T> {
             }
             // Skip values equals to or close enough to zero(which contributes
             // little to the total IP score).
-            if (val == 0 || (drop_during_build_ && fabs(val) < value_threshold_)) {
+            if (val == 0) {
                 continue;
             }
             auto dim_it = dim_map_.find(idx);
@@ -752,12 +736,13 @@ class InvertedIndex : public BaseInvertedIndex<T> {
                     throw std::runtime_error("unexpected vector dimension in mmaped InvertedIndex");
                 }
                 dim_it = dim_map_.insert({idx, next_dim_id_++}).first;
+                dim_map_reverse_[next_dim_id_ - 1] = idx;
                 inverted_lut_.emplace_back();
                 if constexpr (use_wand) {
                     max_score_in_dim_.emplace_back(0);
                 }
             }
-            inverted_lut_[dim_it->second].emplace_back(id, val);
+            inverted_lut_[dim_it->second].emplace_back(vec_id, val);
             if constexpr (use_wand) {
                 auto score = val;
                 if constexpr (bm25) {
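The refine pass is where these pieces meet: refine_and_collect used to re-score candidates with q_vec.dot(raw_data_[u], ...), which is impossible without stored rows. It now drains the first-pass heap into a DocIdFilterByVector and re-runs the regular index search with a zero query threshold restricted to those candidate ids, so the exact scores are recovered from the inverted lists alone, at the cost of a second (filtered) traversal instead of one O(nnz(row)) dot product per candidate.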
@@ -773,22 +758,16 @@ class InvertedIndex : public BaseInvertedIndex<T> {
 
     // key is raw sparse vector dim/idx, value is the mapped dim/idx id in the index.
     std::unordered_map<table_t, uint32_t> dim_map_;
+    std::unordered_map<uint32_t, table_t> dim_map_reverse_;
 
     template <typename U>
     using Vector = std::conditional_t<mmapped, GrowableVectorView<U>, std::vector<U>>;
 
     // reserve, [], size, emplace_back
-    Vector<SparseRow<T>> raw_data_;
     Vector<Vector<SparseIdVal<T>>> inverted_lut_;
-    // If we want to drop small values during build, we must first train the
-    // index with all the data to compute value_threshold_.
-    bool drop_during_build_ = false;
-    // when drop_during_build_ is true, any value smaller than value_threshold_
-    // will not be added to inverted_lut_. value_threshold_ is set to the
-    // drop_ratio_build-th percentile of all absolute values in the index.
-    T value_threshold_ = 0.0f;
     Vector<T> max_score_in_dim_;
+
+    size_t n_rows_internal_ = 0;
     size_t max_dim_ = 0;
     uint32_t next_dim_id_ = 0;
diff --git a/tests/ut/test_sparse.cc b/tests/ut/test_sparse.cc
index 85341bb05..83a73eb29 100644
--- a/tests/ut/test_sparse.cc
+++ b/tests/ut/test_sparse.cc
@@ -45,10 +45,7 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") {
     auto topk = 5;
     int64_t nq = 10;
 
-    auto [drop_ratio_build, drop_ratio_search] = GENERATE(table<float, float>({
-        {0.0, 0.0},
-        {0.15, 0.3},
-    }));
+    auto drop_ratio_search = GENERATE(0.0, 0.3);
 
     auto metric = GENERATE(knowhere::metric::IP, knowhere::metric::BM25);
     auto version = GenTestVersionList();
@@ -64,10 +61,8 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") {
         return json;
     };
 
-    auto sparse_inverted_index_gen = [base_gen, drop_ratio_build = drop_ratio_build,
-                                      drop_ratio_search = drop_ratio_search]() {
+    auto sparse_inverted_index_gen = [base_gen, drop_ratio_search = drop_ratio_search]() {
         knowhere::Json json = base_gen();
-        json[knowhere::indexparam::DROP_RATIO_BUILD] = drop_ratio_build;
         json[knowhere::indexparam::DROP_RATIO_SEARCH] = drop_ratio_search;
         return json;
     };
@@ -144,9 +139,8 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") {
             REQUIRE(results.has_value());
             float recall = GetKNNRecall(*gt.value(), *results.value());
             check_distance_decreasing(*results.value());
-            auto drop_ratio_build = json[knowhere::indexparam::DROP_RATIO_BUILD].get<float>();
             auto drop_ratio_search = json[knowhere::indexparam::DROP_RATIO_SEARCH].get<float>();
-            if (drop_ratio_build == 0 && drop_ratio_search == 0) {
+            if (drop_ratio_search == 0) {
                 REQUIRE(recall == 1);
             } else {
                 // most test cases are above 0.95, only a few between 0.9 and 0.95
@@ -189,9 +183,8 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") {
 
             float recall = GetKNNRecall(*filter_gt.value(), *results.value());
             check_distance_decreasing(*results.value());
-            auto drop_ratio_build = json[knowhere::indexparam::DROP_RATIO_BUILD].get<float>();
             auto drop_ratio_search = json[knowhere::indexparam::DROP_RATIO_SEARCH].get<float>();
-            if (drop_ratio_build == 0 && drop_ratio_search == 0) {
+            if (drop_ratio_search == 0) {
                 REQUIRE(recall == 1);
             } else {
                 REQUIRE(recall >= 0.8);
@@ -311,108 +304,6 @@
     }
 }
 
-TEST_CASE("Test Mem Sparse Index GetVectorByIds", "[float metrics]") {
-    auto [nb, dim, doc_sparsity, query_sparsity] = GENERATE(table<int32_t, int32_t, float, float>({
-        // 300 dim, avg doc nnz 12, avg query nnz 9
-        {2000, 300, 0.95, 0.97},
-        // 300 dim, avg doc nnz 9, avg query nnz 3
-        {2000, 300, 0.97, 0.99},
-        // 3000 dim, avg doc nnz 90, avg query nnz 30
-        {20000, 3000, 0.97, 0.99},
-    }));
-    int64_t nq = GENERATE(10, 100);
-
-    auto [drop_ratio_build, drop_ratio_search] = GENERATE(table<float, float>({
-        {0.0, 0.0},
-        {0.32, 0.0},
-    }));
-
-    auto metric = knowhere::metric::IP;
-    auto version = GenTestVersionList();
-
-    auto base_gen = [=, dim = dim]() {
-        knowhere::Json json;
-        json[knowhere::meta::DIM] = dim;
-        json[knowhere::meta::METRIC_TYPE] = metric;
-        json[knowhere::meta::TOPK] = 1;
-        return json;
-    };
-
-    auto sparse_inverted_index_gen = [base_gen, drop_ratio_build = drop_ratio_build,
-                                      drop_ratio_search = drop_ratio_search]() {
-        knowhere::Json json = base_gen();
-        json[knowhere::indexparam::DROP_RATIO_BUILD] = drop_ratio_build;
-        json[knowhere::indexparam::DROP_RATIO_SEARCH] = drop_ratio_search;
-        return json;
-    };
-
-    const auto train_ds = GenSparseDataSet(nb, dim, doc_sparsity);
-
-    SECTION("Test GetVectorByIds") {
-        using std::make_tuple;
-        auto [name, gen] = GENERATE_REF(table<std::string, std::function<knowhere::Json()>>({
-            make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX, sparse_inverted_index_gen),
-            make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND, sparse_inverted_index_gen),
-        }));
-        auto use_mmap = GENERATE(true, false);
-        auto tmp_file = "/tmp/knowhere_sparse_inverted_index_test";
-        {
-            auto idx = knowhere::IndexFactory::Instance().Create<knowhere::fp32>(name, version).value();
-            auto cfg_json = gen().dump();
-            CAPTURE(name, cfg_json);
-            knowhere::Json json = knowhere::Json::parse(cfg_json);
-
-            auto ids_ds = GenIdsDataSet(nb, nq);
-            REQUIRE(idx.Type() == name);
-            auto res = idx.Build(train_ds, json);
-            REQUIRE(idx.HasRawData(metric) ==
-                    knowhere::IndexStaticFaced<knowhere::fp32>::HasRawData(name, version, json));
-            if (!idx.HasRawData(metric)) {
-                return;
-            }
-            REQUIRE(res == knowhere::Status::success);
-            knowhere::BinarySet bs;
-            idx.Serialize(bs);
-
-            auto idx_new = knowhere::IndexFactory::Instance().Create<knowhere::fp32>(name, version).value();
-            if (use_mmap) {
-                WriteBinaryToFile(tmp_file, bs.GetByName(idx.Type()));
-                REQUIRE(idx_new.DeserializeFromFile(tmp_file, json) == knowhere::Status::success);
-            } else {
-                REQUIRE(idx_new.Deserialize(bs, json) == knowhere::Status::success);
-            }
-
-            auto retrieve_task = [&]() {
-                auto results = idx_new.GetVectorByIds(ids_ds);
-                REQUIRE(results.has_value());
-                auto xb = (knowhere::sparse::SparseRow<float>*)train_ds->GetTensor();
-                auto res_data = (knowhere::sparse::SparseRow<float>*)results.value()->GetTensor();
-                for (int i = 0; i < nq; ++i) {
-                    const auto id = ids_ds->GetIds()[i];
-                    const auto& truth_row = xb[id];
-                    const auto& res_row = res_data[i];
-                    REQUIRE(truth_row.size() == res_row.size());
-                    for (size_t j = 0; j < truth_row.size(); ++j) {
-                        REQUIRE(truth_row[j] == res_row[j]);
-                    }
-                }
-            };
-
-            std::vector<std::future<void>> retrieve_task_list;
-            for (int i = 0; i < 20; i++) {
-                retrieve_task_list.push_back(std::async(std::launch::async, [&] { return retrieve_task(); }));
-            }
-            for (auto& task : retrieve_task_list) {
-                task.wait();
-            }
-            // idx/idx_new to destroy and munmap
-        }
-        if (use_mmap) {
-            REQUIRE(std::remove(tmp_file) == 0);
-        }
-    }
-}
-
 TEST_CASE("Test Mem Sparse Index Handle Empty Vector", "[float metrics]") {
     auto [base_data, has_first_result] = GENERATE(table<std::vector<std::vector<std::pair<int32_t, float>>>, bool>(
         {{std::vector<std::vector<std::pair<int32_t, float>>>{
@@ -431,14 +322,9 @@ TEST_CASE("Test Mem Sparse Index Handle Empty Vector", "[float metrics]") {
     auto metric = GENERATE(knowhere::metric::IP, knowhere::metric::BM25);
     auto version = GenTestVersionList();
 
-    auto [drop_ratio_build, drop_ratio_search] = GENERATE(table<float, float>({
-        {0.0, 0.0},
-        {0.32, 0.0},
-        {0.32, 0.6},
-        {0.0, 0.6},
-    }));
+    auto drop_ratio_search = GENERATE(0.0, 0.6);
 
-    auto base_gen = [=, dim = dim, drop_ratio_build = drop_ratio_build, drop_ratio_search = drop_ratio_search]() {
+    auto base_gen = [=, dim = dim, drop_ratio_search = drop_ratio_search]() {
         knowhere::Json json;
         json[knowhere::meta::DIM] = dim;
         json[knowhere::meta::METRIC_TYPE] = metric;
@@ -446,7 +332,6 @@ TEST_CASE("Test Mem Sparse Index Handle Empty Vector", "[float metrics]") {
         json[knowhere::meta::BM25_K1] = 1.2;
         json[knowhere::meta::BM25_B] = 0.75;
         json[knowhere::meta::BM25_AVGDL] = 100;
-        json[knowhere::indexparam::DROP_RATIO_BUILD] = drop_ratio_build;
         json[knowhere::indexparam::DROP_RATIO_SEARCH] = drop_ratio_search;
         return json;
     };
@@ -537,22 +422,6 @@ TEST_CASE("Test Mem Sparse Index Handle Empty Vector", "[float metrics]") {
         REQUIRE(results.has_value());
         check_result(*results.value());
     }
-
-    SECTION("Test GetVectorByIds") {
-        std::vector<int64_t> ids = {0, 1, 2};
-        auto results = idx.GetVectorByIds(GenIdsDataSet(3, ids));
-        REQUIRE(results.has_value());
-        auto xb = (knowhere::sparse::SparseRow<float>*)train_ds->GetTensor();
-        auto res_data = (knowhere::sparse::SparseRow<float>*)results.value()->GetTensor();
-        for (int i = 0; i < 3; ++i) {
-            const auto& truth_row = xb[i];
-            const auto& res_row = res_data[i];
-            REQUIRE(truth_row.size() == res_row.size());
-            for (size_t j = 0; j < truth_row.size(); ++j) {
-                REQUIRE(truth_row[j] == res_row[j]);
-            }
-        }
-    }
 }
 
 TEST_CASE("Test Mem Sparse Index CC", "[float metrics]") {
@@ -578,8 +447,6 @@ TEST_CASE("Test Mem Sparse Index CC", "[float metrics]") {
 
     auto query_ds = doc_vector_gen(nq, dim);
 
-    // drop ratio build is not supported in CC index
-    auto drop_ratio_build = 0.0;
     auto drop_ratio_search = GENERATE(0.0, 0.3);
 
     auto metric = GENERATE(knowhere::metric::IP);
@@ -596,10 +463,8 @@ TEST_CASE("Test Mem Sparse Index CC", "[float metrics]") {
         return json;
     };
 
-    auto sparse_inverted_index_gen = [base_gen, drop_ratio_build = drop_ratio_build,
-                                      drop_ratio_search = drop_ratio_search]() {
+    auto sparse_inverted_index_gen = [base_gen, drop_ratio_search = drop_ratio_search]() {
        knowhere::Json json = base_gen();
-        json[knowhere::indexparam::DROP_RATIO_BUILD] = drop_ratio_build;
         json[knowhere::indexparam::DROP_RATIO_SEARCH] = drop_ratio_search;
         return json;
     };