Skip to content

Commit

Permalink
Add cpp examples (#435)
Browse files Browse the repository at this point in the history
* Add cpp examples
* Add multithreaded cpp examples
  • Loading branch information
dyashuni authored Jan 30, 2023
1 parent 68a3387 commit 488ab52
Show file tree
Hide file tree
Showing 18 changed files with 743 additions and 5 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
- name: Test
timeout-minutes: 15
run: |
python -m unittest discover -v --start-directory examples --pattern "example*.py"
python -m unittest discover -v --start-directory examples/python --pattern "example*.py"
python -m unittest discover -v --start-directory tests/python --pattern "bindings_test*.py"
test_cpp:
Expand Down Expand Up @@ -61,6 +61,12 @@ jobs:
if [ "$RUNNER_OS" == "Windows" ]; then
cp ./Release/* ./
fi
./example_search
./example_filter
./example_replace_deleted
./example_mt_search
./example_mt_filter
./example_mt_replace_deleted
./searchKnnCloserFirst_test
./searchKnnWithFilter_test
./multiThreadLoad_test
Expand Down
2 changes: 1 addition & 1 deletion ALGO_PARAMS.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,5 @@ ef_construction leads to longer construction, but better index quality. At some
not improve the quality of the index. One way to check if the selection of ef_construction was ok is to measure a recall
for M nearest neighbor search when ```ef``` =```ef_construction```: if the recall is lower than 0.9, then there is room
for improvement.
* ```num_elements``` - defines the maximum number of elements in the index. The index can be extened by saving/loading(load_index
* ```num_elements``` - defines the maximum number of elements in the index. The index can be extended by saving/loading (load_index
function has a parameter which defines the new maximum number of elements).
20 changes: 20 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,26 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
endif()

# examples
add_executable(example_search examples/cpp/example_search.cpp)
target_link_libraries(example_search hnswlib)

add_executable(example_filter examples/cpp/example_filter.cpp)
target_link_libraries(example_filter hnswlib)

add_executable(example_replace_deleted examples/cpp/example_replace_deleted.cpp)
target_link_libraries(example_replace_deleted hnswlib)

add_executable(example_mt_search examples/cpp/example_mt_search.cpp)
target_link_libraries(example_mt_search hnswlib)

add_executable(example_mt_filter examples/cpp/example_mt_filter.cpp)
target_link_libraries(example_mt_filter hnswlib)

add_executable(example_mt_replace_deleted examples/cpp/example_mt_replace_deleted.cpp)
target_link_libraries(example_mt_replace_deleted hnswlib)

# tests
add_executable(test_updates tests/cpp/updates_test.cpp)
target_link_libraries(test_updates hnswlib)

Expand Down
15 changes: 14 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,12 @@ Properties of `hnswlib.Index` that support reading and writing:

#### Python bindings examples
[See more examples here](examples/EXAMPLES.md)
[See more examples here](examples/python/EXAMPLES.md):
* Creating index, inserting elements, searching, serialization/deserialization
* Filtering during the search with a boolean function
* Deleting the elements and reusing the memory of the deleted elements for newly added elements

An example of creating an index, inserting elements, searching, and pickle serialization:
```python
import hnswlib
import numpy as np
Expand Down Expand Up @@ -218,6 +223,14 @@ labels, distances = p.knn_query(data, k=1)
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
```

#### C++ examples
[See examples here](examples/cpp/EXAMPLES.md):
* creating index, inserting elements, searching, serialization/deserialization
* filtering during the search with a boolean function
* deleting the elements and reusing the memory of the deleted elements for newly added elements
* multithreaded usage


### Bindings installation

You can install from sources:
Expand Down
185 changes: 185 additions & 0 deletions examples/cpp/EXAMPLES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# C++ examples

An example of creating an index, inserting elements, searching, and serialization:
```cpp
#include "../../hnswlib/hnswlib.h"


int main() {
int dim = 16; // Dimension of the elements
int max_elements = 10000; // Maximum number of elements, should be known beforehand
int M = 16; // Tightly connected with internal dimensionality of the data
// strongly affects the memory consumption
int ef_construction = 200; // Controls index search speed/build speed tradeoff

// Initing index
hnswlib::L2Space space(dim);
hnswlib::HierarchicalNSW<float>* alg_hnsw = new hnswlib::HierarchicalNSW<float>(&space, max_elements, M, ef_construction);

// Generate random data
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib_real;
float* data = new float[dim * max_elements];
for (int i = 0; i < dim * max_elements; i++) {
data[i] = distrib_real(rng);
}

// Add data to index
for (int i = 0; i < max_elements; i++) {
alg_hnsw->addPoint(data + i * dim, i);
}

// Query the elements for themselves and measure recall
float correct = 0;
for (int i = 0; i < max_elements; i++) {
std::priority_queue<std::pair<float, hnswlib::labeltype>> result = alg_hnsw->searchKnn(data + i * dim, 1);
hnswlib::labeltype label = result.top().second;
if (label == i) correct++;
}
float recall = correct / max_elements;
std::cout << "Recall: " << recall << "\n";

// Serialize index
std::string hnsw_path = "hnsw.bin";
alg_hnsw->saveIndex(hnsw_path);
delete alg_hnsw;

// Deserialize index and check recall
alg_hnsw = new hnswlib::HierarchicalNSW<float>(&space, hnsw_path);
correct = 0;
for (int i = 0; i < max_elements; i++) {
std::priority_queue<std::pair<float, hnswlib::labeltype>> result = alg_hnsw->searchKnn(data + i * dim, 1);
hnswlib::labeltype label = result.top().second;
if (label == i) correct++;
}
recall = (float)correct / max_elements;
std::cout << "Recall of deserialized index: " << recall << "\n";

delete[] data;
delete alg_hnsw;
return 0;
}
```

An example of filtering with a boolean function during the search:
```cpp
#include "../../hnswlib/hnswlib.h"


// Filter functor that accepts only the labels that are exact multiples of a
// given divisor. An instance is handed by pointer to the search call, which
// uses it to decide which labels may appear in the results.
class PickDivisibleIds: public hnswlib::BaseFilterFunctor {
    unsigned int divisor = 1;
 public:
    // divisor must be non-zero. NOTE: the check is an assert, so it vanishes
    // when the code is compiled with NDEBUG; a zero divisor then leads to a
    // division by zero in operator().
    PickDivisibleIds(unsigned int divisor): divisor(divisor) {
        assert(divisor != 0);
    }
    // Returns true when label_id is divisible by the divisor.
    // `override` makes the compiler verify that this really replaces the base
    // class call operator instead of silently declaring an unrelated overload.
    bool operator()(hnswlib::labeltype label_id) override {
        return label_id % divisor == 0;
    }
};


int main() {
int dim = 16; // Dimension of the elements
int max_elements = 10000; // Maximum number of elements, should be known beforehand
int M = 16; // Tightly connected with internal dimensionality of the data
// strongly affects the memory consumption
int ef_construction = 200; // Controls index search speed/build speed tradeoff

// Initing index
hnswlib::L2Space space(dim);
hnswlib::HierarchicalNSW<float>* alg_hnsw = new hnswlib::HierarchicalNSW<float>(&space, max_elements, M, ef_construction);

// Generate random data
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib_real;
float* data = new float[dim * max_elements];
for (int i = 0; i < dim * max_elements; i++) {
data[i] = distrib_real(rng);
}

// Add data to index
for (int i = 0; i < max_elements; i++) {
alg_hnsw->addPoint(data + i * dim, i);
}

// Create filter that allows only even labels
PickDivisibleIds pickIdsDivisibleByTwo(2);

// Query the elements for themselves with filter and check returned labels
int k = 10;
for (int i = 0; i < max_elements; i++) {
std::vector<std::pair<float, hnswlib::labeltype>> result = alg_hnsw->searchKnnCloserFirst(data + i * dim, k, &pickIdsDivisibleByTwo);
for (auto item: result) {
if (item.second % 2 == 1) std::cout << "Error: found odd label\n";
}
}

delete[] data;
delete alg_hnsw;
return 0;
}
```

An example of reusing the memory of deleted elements when new elements are added (via the `allow_replace_deleted` flag):
```cpp
#include "../../hnswlib/hnswlib.h"


int main() {
int dim = 16; // Dimension of the elements
int max_elements = 10000; // Maximum number of elements, should be known beforehand
int M = 16; // Tightly connected with internal dimensionality of the data
// strongly affects the memory consumption
int ef_construction = 200; // Controls index search speed/build speed tradeoff

// Initing index
hnswlib::L2Space space(dim);
hnswlib::HierarchicalNSW<float>* alg_hnsw = new hnswlib::HierarchicalNSW<float>(&space, max_elements, M, ef_construction, 100, true);

// Generate random data
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib_real;
float* data = new float[dim * max_elements];
for (int i = 0; i < dim * max_elements; i++) {
data[i] = distrib_real(rng);
}

// Add data to index
for (int i = 0; i < max_elements; i++) {
alg_hnsw->addPoint(data + i * dim, i);
}

// Mark first half of elements as deleted
int num_deleted = max_elements / 2;
for (int i = 0; i < num_deleted; i++) {
alg_hnsw->markDelete(i);
}

float* add_data = new float[dim * num_deleted];
for (int i = 0; i < dim * num_deleted; i++) {
add_data[i] = distrib_real(rng);
}

// Replace deleted data with new elements
// Maximum number of elements is reached therefore we cannot add new items,
// but we can replace the deleted ones by using replace_deleted=true
for (int i = 0; i < num_deleted; i++) {
int label = max_elements + i;
alg_hnsw->addPoint(add_data + i * dim, label, true);
}

delete[] data;
delete[] add_data;
delete alg_hnsw;
return 0;
}
```

Multithreaded examples:
* Creating index, inserting elements, searching [example_mt_search.cpp](example_mt_search.cpp)
* Filtering during the search with a boolean function [example_mt_filter.cpp](example_mt_filter.cpp)
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)
57 changes: 57 additions & 0 deletions examples/cpp/example_filter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#include "../../hnswlib/hnswlib.h"


// Filter functor that accepts only the labels that are exact multiples of a
// given divisor. An instance is handed by pointer to the search call, which
// uses it to decide which labels may appear in the results.
class PickDivisibleIds: public hnswlib::BaseFilterFunctor {
    unsigned int divisor = 1;
 public:
    // divisor must be non-zero. NOTE: the check is an assert, so it vanishes
    // when the code is compiled with NDEBUG; a zero divisor then leads to a
    // division by zero in operator().
    PickDivisibleIds(unsigned int divisor): divisor(divisor) {
        assert(divisor != 0);
    }
    // Returns true when label_id is divisible by the divisor.
    // `override` makes the compiler verify that this really replaces the base
    // class call operator instead of silently declaring an unrelated overload.
    bool operator()(hnswlib::labeltype label_id) override {
        return label_id % divisor == 0;
    }
};


int main() {
int dim = 16; // Dimension of the elements
int max_elements = 10000; // Maximum number of elements, should be known beforehand
int M = 16; // Tightly connected with internal dimensionality of the data
// strongly affects the memory consumption
int ef_construction = 200; // Controls index search speed/build speed tradeoff

// Initing index
hnswlib::L2Space space(dim);
hnswlib::HierarchicalNSW<float>* alg_hnsw = new hnswlib::HierarchicalNSW<float>(&space, max_elements, M, ef_construction);

// Generate random data
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib_real;
float* data = new float[dim * max_elements];
for (int i = 0; i < dim * max_elements; i++) {
data[i] = distrib_real(rng);
}

// Add data to index
for (int i = 0; i < max_elements; i++) {
alg_hnsw->addPoint(data + i * dim, i);
}

// Create filter that allows only even labels
PickDivisibleIds pickIdsDivisibleByTwo(2);

// Query the elements for themselves with filter and check returned labels
int k = 10;
for (int i = 0; i < max_elements; i++) {
std::vector<std::pair<float, hnswlib::labeltype>> result = alg_hnsw->searchKnnCloserFirst(data + i * dim, k, &pickIdsDivisibleByTwo);
for (auto item: result) {
if (item.second % 2 == 1) std::cout << "Error: found odd label\n";
}
}

delete[] data;
delete alg_hnsw;
return 0;
}
Loading

0 comments on commit 488ab52

Please sign in to comment.