forked from facebookresearch/faiss
-
Notifications
You must be signed in to change notification settings - Fork 5
/
AuxIndexStructures.h
217 lines (149 loc) · 5.19 KB
/
AuxIndexStructures.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
// Auxiliary index structures, that are used in indexes but that can
// be forward-declared
#ifndef FAISS_AUX_INDEX_STRUCTURES_H
#define FAISS_AUX_INDEX_STRUCTURES_H
#include <stdint.h>
#include <vector>
#include <unordered_set>
#include "Index.h"
namespace faiss {
/** The objective is to have a simple result structure while
* minimizing the number of mem copies in the result. The method
* do_allocation can be overloaded to allocate the result tables in
* the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult {
size_t nq; ///< nb of queries
size_t *lims; ///< size (nq + 1)
typedef Index::idx_t idx_t;
idx_t *labels; ///< result for query i is labels[lims[i]:lims[i+1]]
float *distances; ///< corresponding distances (not sorted)
size_t buffer_size; ///< size of the result buffers used
/// lims must be allocated on input to range_search.
explicit RangeSearchResult (idx_t nq, bool alloc_lims=true);
/// called when lims contains the nb of elements result entries
/// for each query
virtual void do_allocation ();
virtual ~RangeSearchResult ();
};
/** Encapsulates a set of ids to remove. */
struct IDSelector {
typedef Index::idx_t idx_t;
virtual bool is_member (idx_t id) const = 0;
virtual ~IDSelector() {}
};
/** remove ids between [imni, imax) */
struct IDSelectorRange: IDSelector {
idx_t imin, imax;
IDSelectorRange (idx_t imin, idx_t imax);
bool is_member(idx_t id) const override;
~IDSelectorRange() override {}
};
/** Remove ids from a set. Repetitions of ids in the indices set
* passed to the constructor does not hurt performance. The hash
* function used for the bloom filter and GCC's implementation of
* unordered_set are just the least significant bits of the id. This
* works fine for random ids or ids in sequences but will produce many
* hash collisions if lsb's are always the same */
struct IDSelectorBatch: IDSelector {
std::unordered_set<idx_t> set;
typedef unsigned char uint8_t;
std::vector<uint8_t> bloom; // assumes low bits of id are a good hash value
int nbits;
idx_t mask;
IDSelectorBatch (long n, const idx_t *indices);
bool is_member(idx_t id) const override;
~IDSelectorBatch() override {}
};
// Below are structures used only by Index implementations
/** List of temporary buffers used to store results before they are
* copied to the RangeSearchResult object. */
struct BufferList {
typedef Index::idx_t idx_t;
// buffer sizes in # entries
size_t buffer_size;
struct Buffer {
idx_t *ids;
float *dis;
};
std::vector<Buffer> buffers;
size_t wp; ///< write pointer in the last buffer.
explicit BufferList (size_t buffer_size);
~BufferList ();
// create a new buffer
void append_buffer ();
inline void add (idx_t id, float dis)
{
if (wp == buffer_size) { // need new buffer
append_buffer();
}
Buffer & buf = buffers.back();
buf.ids [wp] = id;
buf.dis [wp] = dis;
wp++;
}
/// copy elemnts ofs:ofs+n-1 seen as linear data in the buffers to
/// tables dest_ids, dest_dis
void copy_range (size_t ofs, size_t n,
idx_t * dest_ids, float *dest_dis);
};
/// the entries in the buffers are split per query
struct RangeSearchPartialResult: BufferList {
RangeSearchResult * res;
explicit RangeSearchPartialResult (RangeSearchResult * res_in);
/// result structure for a single query
struct QueryResult {
idx_t qno;
size_t nres;
RangeSearchPartialResult * pres;
inline void add (float dis, idx_t id) {
nres++;
pres->add (id, dis);
}
};
std::vector<QueryResult> queries;
/// begin a new result
QueryResult & new_result (idx_t qno);
void finalize ();
/// called by range_search before do_allocation
void set_lims ();
/// called by range_search after do_allocation
void set_result (bool incremental = false);
};
/***********************************************************
* Abstract I/O objects
***********************************************************/
struct IOReader {
// fread
virtual size_t operator()(
void *ptr, size_t size, size_t nitems) = 0;
// return a file number that can be memory-mapped
virtual int fileno ();
virtual ~IOReader() {}
};
struct IOWriter {
// fwrite
virtual size_t operator()(
const void *ptr, size_t size, size_t nitems) = 0;
// return a file number that can be memory-mapped
virtual int fileno ();
virtual ~IOWriter() {}
};
struct VectorIOReader:IOReader {
std::vector<uint8_t> data;
size_t rp = 0;
size_t operator()(void *ptr, size_t size, size_t nitems) override;
};
struct VectorIOWriter:IOWriter {
std::vector<uint8_t> data;
size_t operator()(const void *ptr, size_t size, size_t nitems) override;
};
}; // namespace faiss
#endif