-
Notifications
You must be signed in to change notification settings - Fork 140
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Heemin Kim <[email protected]>
- Loading branch information
Showing
19 changed files
with
933 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
jni/include/knn_extension/faiss/MultiVectorResultCollector.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <faiss/impl/ResultCollector.h> | ||
#include <faiss/MetricType.h> | ||
#include "knn_extension/faiss/utils/BitSet.h" | ||
#include <unordered_map> | ||
|
||
namespace os_faiss { | ||
|
||
using idx_t = faiss::idx_t; | ||
/** | ||
 * Implementation of ResultCollector to support multi vector | ||
 * | ||
 * By using parent_bit_set, it converts a doc id to its parent doc id and stores the parent doc id | ||
 * while collecting search results. Using group_id_to_index, it de-duplicates results from the same | ||
 * parent doc. Once all results are collected, the post_process method is called, where it converts | ||
 * each parent doc id back to its original id using group_id_to_id. | ||
 * | ||
 * NOTE(review): std::vector is used below but <vector> is not included directly by this header; | ||
 * presumably it arrives transitively through the faiss headers - confirm. | ||
 */ | ||
struct MultiVectorResultCollector:faiss::ResultCollector { | ||
// parent (group) doc id -> original id; used by post_process to restore the original ids | ||
std::unordered_map<idx_t, idx_t> group_id_to_id; | ||
// parent (group) doc id -> index used to de-duplicate results that share a parent doc | ||
// (presumably the position of that group's entry in the result heap - TODO confirm in the .cpp) | ||
std::unordered_map<idx_t, size_t> group_id_to_index; | ||
// bit set used to convert a doc id to its parent doc id | ||
// (stored as a raw pointer; presumably non-owning - TODO confirm) | ||
BitSet* parent_bit_set; | ||
// mapping data from Faiss ID to Lucene ID | ||
const std::vector<int64_t>* id_map; | ||
// stores the given bit set and id map pointers for use while collecting (see members above) | ||
MultiVectorResultCollector(BitSet* parent_bit_set, const std::vector<int64_t>* id_map); | ||
// Collects one candidate (val, ids) into the result arrays bh_val / bh_ids. | ||
// presumably k is the requested result count and nres the number of entries currently held, | ||
// updated in place - confirm against faiss::ResultCollector | ||
void collect( | ||
int k, | ||
int& nres, | ||
float* bh_val, | ||
int64_t* bh_ids, | ||
float val, | ||
int64_t ids) override; | ||
// Called once all results are collected; converts the parent doc ids in bh_ids back to | ||
// their original ids using group_id_to_id. | ||
void post_process(int64_t nres, int64_t* bh_ids) override; | ||
}; | ||
|
||
} |
25 changes: 25 additions & 0 deletions
25
jni/include/knn_extension/faiss/MultiVectorResultCollectorFactory.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <faiss/impl/ResultCollectorFactory.h> | ||
#include "knn_extension/faiss/utils/BitSet.h" | ||
|
||
namespace os_faiss { | ||
/** | ||
 * Creates a MultiVectorResultCollector for a single query request | ||
 * | ||
 * Creating a new collector is required because MultiVectorResultCollector has instance variables | ||
 * which should be isolated for each query. | ||
 * | ||
 * NOTE(review): MultiVectorResultCollector's constructor also takes an id_map, which is not | ||
 * declared here; presumably it comes from the faiss::ResultCollectorFactory base - confirm. | ||
 */ | ||
struct MultiVectorResultCollectorFactory:faiss::ResultCollectorFactory { | ||
// bit set of parent doc ids, kept as a raw pointer (presumably non-owning - TODO confirm) | ||
BitSet* parent_bit_set; | ||
|
||
// NOTE(review): single-argument constructor is not marked explicit, so a BitSet* converts | ||
// implicitly to a factory - confirm that this is intended | ||
MultiVectorResultCollectorFactory(BitSet* parent_bit_set); | ||
// returns a collector for one query, as a pointer to the faiss::ResultCollector base type | ||
faiss::ResultCollector* new_collector() override; | ||
// presumably releases a collector previously returned by new_collector - confirm in the .cpp | ||
void delete_collector(faiss::ResultCollector* resultCollector) override; | ||
}; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <faiss/MetricType.h> | ||
#include <faiss/impl/platform_macros.h> | ||
#include <limits> | ||
|
||
using idx_t = faiss::idx_t; | ||
/** | ||
 * This class is used to store the parent and child doc id mapping | ||
 * | ||
 * For example, let's say there are two documents with 3 nested fields each. Lucene then stores each | ||
 * nested field as an individual document with its own doc id. The document ids are assigned as follows. | ||
 * | ||
 * 0, 1, 2, 3(parent doc for 0, 1, 2), 4, 5, 6, 7(parent doc for 4, 5, 6) | ||
 * | ||
 * Therefore, we can represent the value in a BitSet like 10001000, where each parent doc id position | ||
 * is set to 1 and each child doc id position is set to 0. Finally, by using the nextSetBit method, | ||
 * we can find the parent ID of a given document ID. | ||
 */ | ||
class BitSet { | ||
protected: | ||
    // Largest int value, shared by all instances. Declared static constexpr so that it is a | ||
    // class-level constant: it takes no per-object storage and does not implicitly delete | ||
    // copy-assignment the way a non-static const data member does. | ||
    // (presumably the value nextSetBit returns when no set bit remains - confirm in implementations) | ||
    static constexpr int NO_MORE_DOCS = std::numeric_limits<int>::max(); | ||
public: | ||
    // Finds a set bit relative to `index`; used to look up the parent doc id of a doc id. | ||
    // (whether `index` itself is included in the search is defined by the implementation) | ||
    virtual idx_t nextSetBit(idx_t index) = 0; | ||
    // virtual so that implementations can be destroyed through a BitSet pointer | ||
    virtual ~BitSet() = default; | ||
}; | ||
|
||
|
||
/** | ||
 * BitSet implementation using an array of uint64_t | ||
 * | ||
 * The class holds a raw pointer and declares its own destructor, so copying is disabled: | ||
 * an implicit member-wise copy would leave two objects sharing the same bitmap pointer. | ||
 */ | ||
class FixedBitSet : public BitSet { | ||
public: | ||
    // presumably the number of uint64_t words in bitmap - TODO confirm in the .cpp | ||
    size_t n; | ||
    // using uint64_t to leverage function __builtin_ctzll which is defined in faiss/impl/platform_macros.h | ||
    uint64_t* bitmap; | ||
|
||
    // Builds the bit set from `length` ints in intArray | ||
    // (presumably the packed bits of the source bit set - confirm against the caller) | ||
    FixedBitSet(const int* intArray, const int length); | ||
    // Non-copyable: see the class comment (user-declared destructor + raw pointer member) | ||
    FixedBitSet(const FixedBitSet&) = delete; | ||
    FixedBitSet& operator=(const FixedBitSet&) = delete; | ||
    idx_t nextSetBit(idx_t index) override; | ||
    ~FixedBitSet(); | ||
}; |
Oops, something went wrong.