diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000000..9d35e3f97f
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,37 @@
+# syntax=docker/dockerfile:1.5
+
+# BASE is the image that both candidate stages below start from.
+# PYTHON_PACKAGE_MANAGER selects which candidate stage becomes the final image.
+ARG BASE
+ARG PYTHON_PACKAGE_MANAGER=conda
+
+# Candidate stage used when PYTHON_PACKAGE_MANAGER=pip.
+FROM ${BASE} AS pip-base
+
+ENV DEFAULT_VIRTUAL_ENV=rapids
+
+# Candidate stage used when PYTHON_PACKAGE_MANAGER=conda (the default).
+FROM ${BASE} AS conda-base
+
+ENV DEFAULT_CONDA_ENV=rapids
+
+# Final stage: the stage name expands to either "pip-base" or "conda-base".
+FROM ${PYTHON_PACKAGE_MANAGER}-base
+
+# Keep a non-empty CUDA_VERSION inherited from the base image; otherwise use the CUDA build arg.
+ARG CUDA
+ENV CUDAARCHS="RAPIDS"
+ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}"
+
+# An ARG declared before the first FROM is not visible inside a stage until it is redeclared.
+ARG PYTHON_PACKAGE_MANAGER
+ENV PYTHON_PACKAGE_MANAGER="${PYTHON_PACKAGE_MANAGER}"
+
+ENV PYTHONSAFEPATH="1"
+ENV PYTHONUNBUFFERED="1"
+ENV PYTHONDONTWRITEBYTECODE="1"
+
+ENV SCCACHE_REGION="us-east-2"
+ENV SCCACHE_BUCKET="rapids-sccache-devs"
+ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai"
+ENV HISTFILE="/home/coder/.cache/._bash_history"
diff --git a/.devcontainer/README.md b/.devcontainer/README.md
new file mode 100644
index 0000000000..3c76b8963d
--- /dev/null
+++ b/.devcontainer/README.md
@@ -0,0 +1,65 @@
+# RAFT Development Containers
+
+This directory contains [devcontainer configurations](https://containers.dev/implementors/json_reference/) for using VSCode to [develop in a container](https://code.visualstudio.com/docs/devcontainers/containers) via the `Remote Containers` [extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) or [GitHub Codespaces](https://github.com/codespaces).
+
+This container is a turnkey development environment for building and testing the RAFT C++ and Python libraries.
+
+## Table of Contents
+
+* [Prerequisites](#prerequisites)
+* [Host bind mounts](#host-bind-mounts)
+* [Launch a Dev Container](#launch-a-dev-container)
+* [Using the devcontainer](#using-the-devcontainer)
+
+## Prerequisites
+
+* [VSCode](https://code.visualstudio.com/download)
+* [VSCode Remote Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
+
+## Host bind mounts
+
+By default, the following directories are bind-mounted into the devcontainer:
+
+* `${repo}:/home/coder/raft`
+* `${repo}/../.aws:/home/coder/.aws`
+* `${repo}/../.local:/home/coder/.local`
+* `${repo}/../.cache:/home/coder/.cache`
+* `${repo}/../.conda:/home/coder/.conda`
+* `${repo}/../.config:/home/coder/.config`
+
+This ensures caches, configurations, dependencies, and your commits are persisted on the host across container runs.
+
+## Launch a Dev Container
+
+To launch a devcontainer from VSCode, open the RAFT repo and select the "Reopen in Container" button in the bottom right:<br/><img src="https://user-images.githubusercontent.com/178183/221771999-97ab29d5-e718-4e5f-b32f-2cdd51bba25c.png"/>
+
+Alternatively, open the VSCode command palette (typically `cmd/ctrl + shift + P`) and run the "Rebuild and Reopen in Container" command.
+
+## Using the devcontainer
+
+On startup, the devcontainer creates or updates the conda/pip environment using `raft/dependencies.yaml`.
+
+The container includes convenience functions to clean, configure, and build the various RAFT components:
+
+```shell
+$ clean-raft-cpp # only cleans the C++ build dir
+$ clean-pylibraft-python # only cleans the Python build dir
+$ clean-raft # cleans both C++ and Python build dirs
+
+$ configure-raft-cpp # only configures raft C++ lib
+
+$ build-raft-cpp # only builds raft C++ lib
+$ build-pylibraft-python # only builds raft Python lib
+$ build-raft # builds both C++ and Python libs
+```
+
+* The C++ build script is a small wrapper around `cmake -S ~/raft/cpp -B ~/raft/cpp/build` and `cmake --build ~/raft/cpp/build`
+* The Python build script is a small wrapper around `pip install --editable ~/raft/python/pylibraft`
+
+Unlike `build.sh`, these convenience scripts *don't* install the libraries after building them. Instead, they automatically inject the correct arguments to build the C++ libraries from source and use their build dirs as package roots:
+
+```shell
+$ cmake -S ~/raft/cpp -B ~/raft/cpp/build
+$ CMAKE_ARGS="-Draft_ROOT=~/raft/cpp/build" \ # <-- this argument is automatic
+  pip install -e ~/raft/python/pylibraft
+```
diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
new file mode 100644
index 0000000000..8da9b5428a
--- /dev/null
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -0,0 +1,37 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "11.8",
+      "PYTHON_PACKAGE_MANAGER": "conda",
+      "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda11.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
new file mode 100644
index 0000000000..0b3ec79e37
--- /dev/null
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -0,0 +1,38 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "11.8",
+      "PYTHON_PACKAGE_MANAGER": "pip",
+      "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/ucx:23.10": {"version": "1.14.1"},
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/ucx",
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json
new file mode 100644
index 0000000000..f5af166b46
--- /dev/null
+++ b/.devcontainer/cuda12.0-conda/devcontainer.json
@@ -0,0 +1,37 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "12.0",
+      "PYTHON_PACKAGE_MANAGER": "conda",
+      "BASE": "rapidsai/devcontainers:23.10-cpp-mambaforge-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.0-envs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json
new file mode 100644
index 0000000000..9f28002d38
--- /dev/null
+++ b/.devcontainer/cuda12.0-pip/devcontainer.json
@@ -0,0 +1,38 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "12.0",
+      "PYTHON_PACKAGE_MANAGER": "pip",
+      "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda12.0-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/ucx:23.10": {"version": "1.14.1"},
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/ucx",
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 00004c4e4d..107823d5ee 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -62,7 +62,7 @@ jobs:
       arch: "amd64"
       branch: ${{ inputs.branch }}
       build_type: ${{ inputs.build_type || 'branch' }}
-      container_image: "rapidsai/ci:latest"
+      container_image: "rapidsai/ci-conda:latest"
       date: ${{ inputs.date }}
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 4437e0dc85..e539877851 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -22,6 +22,7 @@ jobs:
       - wheel-tests-pylibraft
       - wheel-build-raft-dask
       - wheel-tests-raft-dask
+      - devcontainer
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10
   checks:
@@ -62,7 +63,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci:latest"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
   wheel-build-pylibraft:
     needs: checks
@@ -92,3 +93,11 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel_raft_dask.sh
+  devcontainer:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.10
+    with:
+      build_command: |
+        sccache -z;
+        build-all -DBUILD_PRIMS_BENCH=ON -DBUILD_ANN_BENCH=ON --verbose;
+        sccache -s;
diff --git a/.gitignore b/.gitignore
index 7939fc1622..11b7bc3eba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -62,3 +62,7 @@ _xml
 # sphinx
 _html
 _text
+
+# clang tooling
+compile_commands.json
+.clangd/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2a70632497..66862ada5e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -43,7 +43,7 @@ repos:
                 additional_dependencies: [toml]
                 args: ["--config=pyproject.toml"]
       - repo: https://github.com/pre-commit/mirrors-clang-format
-        rev: v16.0.1
+        rev: v16.0.6
         hooks:
               - id: clang-format
                 types_or: [c, c++, cuda]
diff --git a/build.sh b/build.sh
index 071820ba93..6200e6a2fa 100755
--- a/build.sh
+++ b/build.sh
@@ -78,8 +78,8 @@ INSTALL_TARGET=install
 BUILD_REPORT_METRICS=""
 BUILD_REPORT_INCL_CACHE_STATS=OFF
 
-TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;NEIGHBORS_TEST;NEIGHBORS_ANN_CAGRA_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;STATS_TEST;UTILS_TEST"
-BENCH_TARGETS="CLUSTER_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_BENCH;SPARSE_BENCH;RANDOM_BENCH"
+TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;NEIGHBORS_TEST;NEIGHBORS_ANN_CAGRA_TEST;NEIGHBORS_ANN_NN_DESCENT_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;STATS_TEST;UTILS_TEST"
+BENCH_TARGETS="CLUSTER_BENCH;CORE_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_BENCH;SPARSE_BENCH;RANDOM_BENCH"
 
 CACHE_ARGS=""
 NVTX=ON
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index 853ae095d3..a41f81152d 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -11,6 +11,6 @@ rapids-print-env
 
 rapids-logger "Begin cpp build"
 
-rapids-mamba-retry mambabuild conda/recipes/libraft
+rapids-conda-retry mambabuild conda/recipes/libraft
 
 rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 2a31deb46a..c49677e78c 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -15,19 +15,19 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 
 # TODO: Remove `--no-test` flags once importing on a CPU
 # node works correctly
-rapids-mamba-retry mambabuild \
+rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   conda/recipes/pylibraft
 
-rapids-mamba-retry mambabuild \
+rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/raft-dask
 
 # Build ann-bench for each cuda and python version
-rapids-mamba-retry mambabuild \
+rapids-conda-retry mambabuild \
 --no-test \
 --channel "${CPP_CHANNEL}" \
 --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
@@ -37,7 +37,7 @@ conda/recipes/raft-ann-bench
 # version
 RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
 if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then
-  rapids-mamba-retry mambabuild \
+  rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index a9f7f64294..662a11ad0e 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -5,6 +5,7 @@ set -euo pipefail
 
 package_name=$1
 package_dir=$2
+underscore_package_name=$(echo "${package_name}" | tr "-" "_")
 
 source rapids-configure-sccache
 source rapids-date-string
@@ -15,9 +16,36 @@ version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-ci/release/apply_wheel_modifications.sh ${version_override} "-${RAPIDS_PY_CUDA_SUFFIX}"
-echo "The package name and/or version was modified in the package source. The git diff is:"
-git diff
+# This is the version of the suffix with a preceding hyphen. It's used
+# everywhere except in the final wheel name.
+PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
+
+# Patch only the metadata of the package being built (${package_dir}) with the CUDA suffix and version override.
+pyproject_file="${package_dir}/pyproject.toml"
+
+sed -i "s/^version = .*/version = \"${version_override}\"/g" "${pyproject_file}"
+sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" "${pyproject_file}"
+
+# For nightlies we want to ensure that we're pulling in alphas as well. The
+# easiest way to do so is to augment the spec with a constraint containing a
+# min alpha version that doesn't affect the version bounds but does allow usage
+# of alpha versions for that dependency without --pre
+alpha_spec=''
+if ! rapids-is-release-build; then
+    alpha_spec=',>=0.0.0a0'
+fi
+
+if [[ "${package_name}" == "raft-dask" ]]; then
+    sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" "${pyproject_file}"
+    sed -i "s/ucx-py/ucx-py${PACKAGE_CUDA_SUFFIX}/g" "${pyproject_file}"
+else
+    sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" "${pyproject_file}"
+fi
+
+if [[ "${PACKAGE_CUDA_SUFFIX}" == "-cu12" ]]; then
+    sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" "${pyproject_file}"
+    sed -i "s/cupy-cuda11x/cupy-cuda12x/g" "${pyproject_file}"
+fi
 
 cd "${package_dir}"
 
@@ -27,4 +55,4 @@ python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
 mkdir -p final_dist
 python -m auditwheel repair -w final_dist dist/*
 
-RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
+RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh
index f0204d45c0..ff89f4da23 100755
--- a/ci/build_wheel_raft_dask.sh
+++ b/ci/build_wheel_raft_dask.sh
@@ -6,9 +6,4 @@ set -euo pipefail
 # Set up skbuild options. Enable sccache in skbuild config options
 export SKBUILD_CONFIGURE_OPTIONS="-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
 
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-
-RAPIDS_PY_WHEEL_NAME=pylibraft_${RAPIDS_PY_CUDA_SUFFIX} rapids-download-wheels-from-s3 ./local-pylibraft
-python -m pip install --no-deps ./local-pylibraft/pylibraft*.whl
-
-ci/build_wheel.sh raft_dask python/raft-dask
+ci/build_wheel.sh raft-dask python/raft-dask
diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh
deleted file mode 100755
index fd6c2f929e..0000000000
--- a/ci/release/apply_wheel_modifications.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
-#
-# Usage: bash apply_wheel_modifications.sh <new_version> <cuda_suffix>
-
-VERSION=${1}
-CUDA_SUFFIX=${2}
-
-# pyproject.toml versions
-sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/pylibraft/pyproject.toml
-sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/raft-dask/pyproject.toml
-
-# pylibraft pyproject.toml cuda suffixes
-sed -i "s/^name = \"pylibraft\"/name = \"pylibraft${CUDA_SUFFIX}\"/g" python/pylibraft/pyproject.toml
-sed -i "s/rmm/rmm${CUDA_SUFFIX}/g" python/pylibraft/pyproject.toml
-
-# raft-dask pyproject.toml cuda suffixes
-sed -i "s/^name = \"raft-dask\"/name = \"raft-dask${CUDA_SUFFIX}\"/g" python/raft-dask/pyproject.toml
-sed -i "s/pylibraft/pylibraft${CUDA_SUFFIX}/g" python/raft-dask/pyproject.toml
-sed -i "s/ucx-py/ucx-py${CUDA_SUFFIX}/g" python/raft-dask/pyproject.toml
-
-if [[ $CUDA_SUFFIX == "-cu12" ]]; then
-    sed -i "s/cuda-python[<=>\.,0-9]*/cuda-python>=12.0,<13.0/g" python/pylibraft/pyproject.toml
-    sed -i "s/cupy-cuda11x/cupy-cuda12x/g" python/pylibraft/pyproject.toml
-fi
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 6a7e319f5d..a867a71f68 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -47,10 +47,6 @@ sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cma
 sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/pylibraft/pylibraft/__init__.py
 sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/raft-dask/raft_dask/__init__.py
 
-# Python pyproject.toml updates
-sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/pylibraft/pyproject.toml
-sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/raft-dask/pyproject.toml
-
 # Wheel testing script
 sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_raft_dask.sh
 
@@ -74,6 +70,7 @@ for FILE in python/*/pyproject.toml; do
   for DEP in "${DEPENDENCIES[@]}"; do
     sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE}
   done
+  sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" "${FILE}"
   sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\"/g" ${FILE}
 done
 
@@ -94,3 +91,10 @@ sed_runner "/^PROJECT_NUMBER/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" cpp/doxygen/Doxy
 sed_runner "/^set(RAFT_VERSION/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" docs/source/build.md
 sed_runner "/GIT_TAG.*branch-/ s|branch-.*|branch-${NEXT_SHORT_TAG}|g" docs/source/build.md
 sed_runner "/rapidsai\/raft/ s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" docs/source/developer_guide.md
+
+# .devcontainer files
+find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
+    sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
+    sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
+    sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
+done
diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index 676d642de9..fd9668e968 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -12,7 +12,7 @@ RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels
 python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl
 
-# Always install latest dask for testing
-python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10
+# Install the pinned dask and distributed releases (plus the dask-cuda release branch) for testing
+python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 git+https://github.com/rapidsai/dask-cuda.git@branch-23.10
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/raft_dask*.whl)[test]
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 7e921decd5..739e1e9785 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -9,8 +9,8 @@ channels:
 dependencies:
 - breathe
 - c-compiler
-- clang-tools=16.0.1
-- clang=16.0.1
+- clang-tools=16.0.6
+- clang==16.0.6
 - cmake>=3.26.4
 - cuda-profiler-api=11.8.86
 - cuda-python>=11.7.1,<12.0a0
@@ -19,10 +19,10 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-core>=2023.7.1
+- dask-core==2023.9.2
 - dask-cuda==23.10.*
-- dask>=2023.7.1
-- distributed>=2023.7.1
+- dask==2023.9.2
+- distributed==2023.9.2
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - gmock>=1.13.0
@@ -43,6 +43,8 @@ dependencies:
 - numba>=0.57
 - numpy>=1.21
 - numpydoc
+- nvcc_linux-64=11.8
+- pre-commit
 - pydata-sphinx-theme
 - pytest
 - pytest-cov
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 2ea685b529..321c17bf4f 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -9,20 +9,21 @@ channels:
 dependencies:
 - breathe
 - c-compiler
-- clang-tools=16.0.1
-- clang=16.0.1
+- clang-tools=16.0.6
+- clang==16.0.6
 - cmake>=3.26.4
 - cuda-cudart-dev
+- cuda-nvcc
 - cuda-profiler-api
 - cuda-python>=12.0,<13.0a0
 - cuda-version=12.0
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-core>=2023.7.1
+- dask-core==2023.9.2
 - dask-cuda==23.10.*
-- dask>=2023.7.1
-- distributed>=2023.7.1
+- dask==2023.9.2
+- distributed==2023.9.2
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - gmock>=1.13.0
@@ -39,6 +40,7 @@ dependencies:
 - numba>=0.57
 - numpy>=1.21
 - numpydoc
+- pre-commit
 - pydata-sphinx-theme
 - pytest
 - pytest-cov
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 742040ad50..4f1df12dfa 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -9,8 +9,8 @@ channels:
 dependencies:
 - benchmark>=1.8.2
 - c-compiler
-- clang-tools=16.0.1
-- clang=16.0.1
+- clang-tools=16.0.6
+- clang==16.0.6
 - cmake>=3.26.4
 - cuda-profiler-api=11.8.86
 - cuda-version=11.8
@@ -34,6 +34,7 @@ dependencies:
 - nccl>=2.9.9
 - ninja
 - nlohmann_json>=3.11.2
+- nvcc_linux-64=11.8
 - scikit-build>=0.13.1
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/conda/recipes/raft-ann-bench-cpu/meta.yaml b/conda/recipes/raft-ann-bench-cpu/meta.yaml
index 355ea640ff..06737b0497 100644
--- a/conda/recipes/raft-ann-bench-cpu/meta.yaml
+++ b/conda/recipes/raft-ann-bench-cpu/meta.yaml
@@ -50,6 +50,7 @@ requirements:
     - nlohmann_json {{ nlohmann_json_version }}
     - python
     - pyyaml
+    - pandas
 
   run:
     - glog {{ glog_version }}
@@ -57,6 +58,8 @@ requirements:
     - matplotlib
     - python
     - pyyaml
+    - pandas
+    - benchmark
 
 about:
   home: https://rapids.ai/
diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml
index 882ff6cc49..a2ab0af643 100644
--- a/conda/recipes/raft-ann-bench/meta.yaml
+++ b/conda/recipes/raft-ann-bench/meta.yaml
@@ -75,6 +75,14 @@ requirements:
     - faiss-proc=*=cuda
     - libfaiss {{ faiss_version }}
     {% endif %}
+    - h5py {{ h5py_version }}
+    - benchmark
+    - matplotlib
+    - python
+    - pandas
+    - pyyaml
+    # rmm is needed to determine if package is gpu-enabled
+    - rmm ={{ minor_version }}
 
   run:
     - python
@@ -90,7 +98,14 @@ requirements:
     - libfaiss {{ faiss_version }}
     {% endif %}
     - h5py {{ h5py_version }}
-
+    - benchmark
+    - glog {{ glog_version }}
+    - matplotlib
+    - pandas
+    - pyyaml
+    # rmm is needed to determine if package is gpu-enabled
+    - rmm ={{ minor_version }}
+
 about:
   home: https://rapids.ai/
   license: Apache-2.0
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index c9caa4dd9b..04dfef5063 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -60,10 +60,10 @@ requirements:
     - cudatoolkit
     {% endif %}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
-    - dask >=2023.7.1
-    - dask-core >=2023.7.1
+    - dask ==2023.9.2
+    - dask-core ==2023.9.2
     - dask-cuda ={{ minor_version }}
-    - distributed >=2023.7.1
+    - distributed ==2023.9.2
     - joblib >=0.11
     - nccl >=2.9.9
     - pylibraft {{ version }}
diff --git a/cpp/.clangd b/cpp/.clangd
new file mode 100644
index 0000000000..7c4fe036dd
--- /dev/null
+++ b/cpp/.clangd
@@ -0,0 +1,65 @@
+# https://clangd.llvm.org/config
+
+# Apply a config conditionally to all C files
+If:
+  PathMatch: .*\.(c|h)$
+
+---
+
+# Apply a config conditionally to all C++ files
+If:
+  PathMatch: .*\.(c|h)pp
+
+---
+
+# Apply a config conditionally to all CUDA files
+If:
+  PathMatch: .*\.cuh?
+CompileFlags:
+  Add:
+    - "-x"
+    - "cuda"
+    # No error on unknown CUDA versions
+    - "-Wno-unknown-cuda-version"
+    # Allow variadic CUDA functions
+    - "-Xclang=-fcuda-allow-variadic-functions"
+Diagnostics:
+  Suppress:
+    - "variadic_device_fn"
+    - "attributes_not_allowed"
+
+---
+
+# Tweak the clangd parse settings for all files
+CompileFlags:
+  Add:
+    # report all errors
+    - "-ferror-limit=0"
+    - "-fmacro-backtrace-limit=0"
+    - "-ftemplate-backtrace-limit=0"
+    # Skip the CUDA version check
+    - "--no-cuda-version-check"
+  Remove:
+    # remove gcc's -fcoroutines
+    - -fcoroutines
+    # remove nvc++ flags unknown to clang
+    - "-gpu=*"
+    - "-stdpar*"
+    # remove nvcc flags unknown to clang
+    - "-arch*"
+    - "-gencode*"
+    - "--generate-code*"
+    - "-ccbin*"
+    - "-t=*"
+    - "--threads*"
+    - "-Xptxas*"
+    - "-Xcudafe*"
+    - "-Xfatbin*"
+    - "-Xcompiler*"
+    - "--diag-suppress*"
+    - "--diag_suppress*"
+    - "--compiler-options*"
+    - "--expt-extended-lambda"
+    - "--expt-relaxed-constexpr"
+    - "-forward-unknown-to-host-compiler"
+    - "-Werror=cross-execution-space-call"
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d93b19f784..7d63751906 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -22,7 +22,8 @@ include(rapids-find)
 
 option(BUILD_CPU_ONLY "Build CPU only components. Applies to RAFT ANN benchmarks currently" OFF)
 
-# workaround for rapids_cuda_init_architectures not working for arch detection with enable_language(CUDA)
+# workaround for rapids_cuda_init_architectures not working for arch detection with
+# enable_language(CUDA)
 set(lang_list "CXX")
 
 if(NOT BUILD_CPU_ONLY)
@@ -286,7 +287,8 @@ endif()
 set_target_properties(raft_compiled PROPERTIES EXPORT_NAME compiled)
 
 if(RAFT_COMPILE_LIBRARY)
-  add_library(raft_objs OBJECT
+  add_library(
+    raft_objs OBJECT
     src/core/logger.cpp
     src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
@@ -331,6 +333,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
     src/neighbors/brute_force_knn_int_float_int.cu
     src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
+    src/neighbors/brute_force_knn_index_float.cu
     src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
     src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
     src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
@@ -452,18 +455,21 @@ if(RAFT_COMPILE_LIBRARY)
     src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
     src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
     src/util/memory_pool.cpp
-    )
+  )
   set_target_properties(
     raft_objs
     PROPERTIES CXX_STANDARD 17
                CXX_STANDARD_REQUIRED ON
                CUDA_STANDARD 17
                CUDA_STANDARD_REQUIRED ON
-               POSITION_INDEPENDENT_CODE ON)
+               POSITION_INDEPENDENT_CODE ON
+  )
 
   target_compile_definitions(raft_objs PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")
-  target_compile_options(raft_objs PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
-                                           "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>")
+  target_compile_options(
+    raft_objs PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
+                      "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
+  )
 
   add_library(raft_lib SHARED $<TARGET_OBJECTS:raft_objs>)
   add_library(raft_lib_static STATIC $<TARGET_OBJECTS:raft_objs>)
@@ -477,13 +483,15 @@ if(RAFT_COMPILE_LIBRARY)
   )
 
   foreach(target raft_lib raft_lib_static raft_objs)
-    target_link_libraries(${target} PUBLIC
-      raft::raft
-      ${RAFT_CTK_MATH_DEPENDENCIES} # TODO: Once `raft::resources` is used everywhere, this
-                                    # will just be cublas
-      $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>)
+    target_link_libraries(
+      ${target}
+      PUBLIC raft::raft
+             ${RAFT_CTK_MATH_DEPENDENCIES} # TODO: Once `raft::resources` is used everywhere, this
+                                           # will just be cublas
+             $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+    )
 
-    #So consumers know when using libraft.so/libraft.a
+    # So consumers know when using libraft.so/libraft.a
     target_compile_definitions(${target} PUBLIC "RAFT_COMPILED")
     # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
     target_link_options(${target} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 4e91ee0690..4ec977700d 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -211,9 +211,10 @@ void bench_search(::benchmark::State& state,
     try {
       algo->set_search_dataset(dataset->base_set(algo_property.dataset_memory_type),
                                dataset->base_set_size());
-    } catch (const std::exception&) {
+    } catch (const std::exception& ex) {
       state.SkipWithError("The algorithm '" + index.name +
-                          "' requires the base set, but it's not available.");
+                          "' requires the base set, but it's not available. " +
+                          "Exception: " + std::string(ex.what()));
       return;
     }
   }
diff --git a/cpp/bench/ann/src/faiss/faiss_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_benchmark.cu
index 231154ccfd..56885cce5c 100644
--- a/cpp/bench/ann/src/faiss/faiss_benchmark.cu
+++ b/cpp/bench/ann/src/faiss/faiss_benchmark.cu
@@ -30,19 +30,27 @@
 
 namespace raft::bench::ann {
 
+template <typename T>
+void parse_base_build_param(const nlohmann::json& conf,
+                            typename raft::bench::ann::FaissGpu<T>::BuildParam& param)
+{
+  param.nlist = conf.at("nlist");
+  if (conf.contains("ratio")) { param.ratio = conf.at("ratio"); }
+}
+
 template <typename T>
 void parse_build_param(const nlohmann::json& conf,
                        typename raft::bench::ann::FaissGpuIVFFlat<T>::BuildParam& param)
 {
-  param.nlist = conf.at("nlist");
+  parse_base_build_param<T>(conf, param);
 }
 
 template <typename T>
 void parse_build_param(const nlohmann::json& conf,
                        typename raft::bench::ann::FaissGpuIVFPQ<T>::BuildParam& param)
 {
-  param.nlist = conf.at("nlist");
-  param.M     = conf.at("M");
+  parse_base_build_param<T>(conf, param);
+  param.M = conf.at("M");
   if (conf.contains("usePrecomputed")) {
     param.usePrecomputed = conf.at("usePrecomputed");
   } else {
@@ -59,7 +67,7 @@ template <typename T>
 void parse_build_param(const nlohmann::json& conf,
                        typename raft::bench::ann::FaissGpuIVFSQ<T>::BuildParam& param)
 {
-  param.nlist          = conf.at("nlist");
+  parse_base_build_param<T>(conf, param);
   param.quantizer_type = conf.at("quantizer_type");
 }
 
diff --git a/cpp/bench/ann/src/faiss/faiss_wrapper.h b/cpp/bench/ann/src/faiss/faiss_wrapper.h
index ec80e6cbfd..672c685b1f 100644
--- a/cpp/bench/ann/src/faiss/faiss_wrapper.h
+++ b/cpp/bench/ann/src/faiss/faiss_wrapper.h
@@ -18,6 +18,7 @@
 
 #include "../common/ann_types.hpp"
 
+#include <raft/core/logger.hpp>
 #include <raft/util/cudart_utils.hpp>
 
 #include <faiss/IndexFlat.h>
@@ -85,7 +86,23 @@ class FaissGpu : public ANN<T> {
     float refine_ratio = 1.0;
   };
 
-  FaissGpu(Metric metric, int dim, int nlist);
+  struct BuildParam {
+    int nlist = 1;
+    int ratio = 2;
+  };
+
+  FaissGpu(Metric metric, int dim, const BuildParam& param)
+    : ANN<T>(metric, dim),
+      metric_type_(parse_metric_type(metric)),
+      nlist_{param.nlist},
+      training_sample_fraction_{1.0 / double(param.ratio)}
+  {
+    static_assert(std::is_same_v<T, float>, "faiss support only float type");
+    RAFT_CUDA_TRY(cudaGetDevice(&device_));
+    RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming));
+    faiss_default_stream_ = gpu_resource_.getDefaultStream(device_);
+  }
+
   virtual ~FaissGpu() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); }
 
   void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final;
@@ -131,23 +148,35 @@ class FaissGpu : public ANN<T> {
   int device_;
   cudaEvent_t sync_{nullptr};
   cudaStream_t faiss_default_stream_{nullptr};
+  double training_sample_fraction_;
 };
 
-template <typename T>
-FaissGpu<T>::FaissGpu(Metric metric, int dim, int nlist)
-  : ANN<T>(metric, dim), metric_type_(parse_metric_type(metric)), nlist_(nlist)
-{
-  static_assert(std::is_same_v<T, float>, "faiss support only float type");
-  RAFT_CUDA_TRY(cudaGetDevice(&device_));
-  RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming));
-  faiss_default_stream_ = gpu_resource_.getDefaultStream(device_);
-}
-
 template <typename T>
 void FaissGpu<T>::build(const T* dataset, size_t nrow, cudaStream_t stream)
 {
   OmpSingleThreadScope omp_single_thread;
-
+  auto index_ivf = dynamic_cast<faiss::gpu::GpuIndexIVF*>(index_.get());
+  if (index_ivf != nullptr) {
+    // set the min/max training size for clustering to use the whole provided training set.
+    double trainset_size       = training_sample_fraction_ * static_cast<double>(nrow);
+    double points_per_centroid = trainset_size / static_cast<double>(nlist_);
+    int max_ppc                = std::ceil(points_per_centroid);
+    int min_ppc                = std::floor(points_per_centroid);
+    if (min_ppc < index_ivf->cp.min_points_per_centroid) {
+      RAFT_LOG_WARN(
+        "The suggested training set size %zu (data size %zu, training sample ratio %f) yields %d "
+        "points per cluster (n_lists = %d). This is smaller than the FAISS default "
+        "min_points_per_centroid = %d.",
+        static_cast<size_t>(trainset_size),
+        nrow,
+        training_sample_fraction_,
+        min_ppc,
+        nlist_,
+        index_ivf->cp.min_points_per_centroid);
+    }
+    index_ivf->cp.max_points_per_centroid = max_ppc;
+    index_ivf->cp.min_points_per_centroid = min_ppc;
+  }
   index_->train(nrow, dataset);  // faiss::gpu::GpuIndexFlat::train() will do nothing
   assert(index_->is_trained);
   index_->add(nrow, dataset);
@@ -208,12 +237,9 @@ void FaissGpu<T>::load_(const std::string& file)
 template <typename T>
 class FaissGpuIVFFlat : public FaissGpu<T> {
  public:
-  struct BuildParam {
-    int nlist;
-  };
+  using typename FaissGpu<T>::BuildParam;
 
-  FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param)
-    : FaissGpu<T>(metric, dim, param.nlist)
+  FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param) : FaissGpu<T>(metric, dim, param)
   {
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = this->device_;
@@ -234,15 +260,13 @@ class FaissGpuIVFFlat : public FaissGpu<T> {
 template <typename T>
 class FaissGpuIVFPQ : public FaissGpu<T> {
  public:
-  struct BuildParam {
-    int nlist;
+  struct BuildParam : public FaissGpu<T>::BuildParam {
     int M;
     bool useFloat16;
     bool usePrecomputed;
   };
 
-  FaissGpuIVFPQ(Metric metric, int dim, const BuildParam& param)
-    : FaissGpu<T>(metric, dim, param.nlist)
+  FaissGpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissGpu<T>(metric, dim, param)
   {
     faiss::gpu::GpuIndexIVFPQConfig config;
     config.useFloat16LookupTables = param.useFloat16;
@@ -271,13 +295,11 @@ class FaissGpuIVFPQ : public FaissGpu<T> {
 template <typename T>
 class FaissGpuIVFSQ : public FaissGpu<T> {
  public:
-  struct BuildParam {
-    int nlist;
+  struct BuildParam : public FaissGpu<T>::BuildParam {
     std::string quantizer_type;
   };
 
-  FaissGpuIVFSQ(Metric metric, int dim, const BuildParam& param)
-    : FaissGpu<T>(metric, dim, param.nlist)
+  FaissGpuIVFSQ(Metric metric, int dim, const BuildParam& param) : FaissGpu<T>(metric, dim, param)
   {
     faiss::ScalarQuantizer::QuantizerType qtype;
     if (param.quantizer_type == "fp16") {
@@ -310,7 +332,8 @@ class FaissGpuIVFSQ : public FaissGpu<T> {
 template <typename T>
 class FaissGpuFlat : public FaissGpu<T> {
  public:
-  FaissGpuFlat(Metric metric, int dim) : FaissGpu<T>(metric, dim, 0)
+  FaissGpuFlat(Metric metric, int dim)
+    : FaissGpu<T>(metric, dim, typename FaissGpu<T>::BuildParam{})
   {
     faiss::gpu::GpuIndexFlatConfig config;
     config.device = this->device_;
diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu
index 99481c2921..3b2e97062f 100644
--- a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu
+++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu
@@ -33,8 +33,7 @@ template <typename T>
 void parse_build_param(const nlohmann::json& conf,
                        typename raft::bench::ann::Ggnn<T>::BuildParam& param)
 {
-  param.dataset_size = conf.at("dataset_size");
-  param.k            = conf.at("k");
+  param.k = conf.at("k");
 
   if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); }
   if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); }
diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh
index 74c7cddc3c..664ec511dd 100644
--- a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh
+++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh
@@ -38,8 +38,6 @@ class Ggnn : public ANN<T> {
     int num_layers{4};     // L
     float tau{0.5};
     int refine_iterations{2};
-
-    size_t dataset_size;
     int k;  // GGNN requires to know k during building
   };
 
@@ -182,12 +180,6 @@ GgnnImpl<T, measure, D, KBuild, KQuery, S>::GgnnImpl(Metric metric,
   }
 
   if (dim != D) { throw std::runtime_error("mis-matched dim"); }
-
-  int device;
-  RAFT_CUDA_TRY(cudaGetDevice(&device));
-
-  ggnn_ = std::make_unique<GGNNGPUInstance>(
-    device, build_param_.dataset_size, build_param_.num_layers, true, build_param_.tau);
 }
 
 template <typename T, DistanceMeasure measure, int D, int KBuild, int KQuery, int S>
@@ -195,11 +187,10 @@ void GgnnImpl<T, measure, D, KBuild, KQuery, S>::build(const T* dataset,
                                                        size_t nrow,
                                                        cudaStream_t stream)
 {
-  if (nrow != build_param_.dataset_size) {
-    throw std::runtime_error(
-      "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) +
-      " , but nrow = " + std::to_string(nrow));
-  }
+  int device;
+  RAFT_CUDA_TRY(cudaGetDevice(&device));
+  ggnn_ = std::make_unique<GGNNGPUInstance>(
+    device, nrow, build_param_.num_layers, true, build_param_.tau);
 
   ggnn_->set_base_data(dataset);
   ggnn_->set_stream(stream);
@@ -212,11 +203,6 @@ void GgnnImpl<T, measure, D, KBuild, KQuery, S>::build(const T* dataset,
 template <typename T, DistanceMeasure measure, int D, int KBuild, int KQuery, int S>
 void GgnnImpl<T, measure, D, KBuild, KQuery, S>::set_search_dataset(const T* dataset, size_t nrow)
 {
-  if (nrow != build_param_.dataset_size) {
-    throw std::runtime_error(
-      "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) +
-      " , but nrow = " + std::to_string(nrow));
-  }
   ggnn_->set_base_data(dataset);
 }
 
diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
index 5cd33ef94d..4d7b993aa1 100644
--- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -31,6 +31,8 @@
 #include <utility>
 #include <vector>
 
+#include <omp.h>
+
 #include "../common/ann_types.hpp"
 #include <hnswlib.h>
 
@@ -164,13 +166,13 @@ class HnswLib : public ANN<T> {
   struct BuildParam {
     int M;
     int ef_construction;
-    int num_threads{1};
+    int num_threads = omp_get_num_procs();
   };
 
   using typename ANN<T>::AnnSearchParam;
   struct SearchParam : public AnnSearchParam {
     int ef;
-    int num_threads{1};
+    int num_threads = omp_get_num_procs();
   };
 
   HnswLib(Metric metric, int dim, const BuildParam& param);
diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu
index aa25d1532f..a9ff6c2922 100644
--- a/cpp/bench/ann/src/raft/raft_benchmark.cu
+++ b/cpp/bench/ann/src/raft/raft_benchmark.cu
@@ -58,10 +58,7 @@ void parse_build_param(const nlohmann::json& conf,
 {
   param.n_lists = conf.at("nlist");
   if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); }
-  if (conf.contains("ratio")) {
-    param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio");
-    std::cout << "kmeans_trainset_fraction " << param.kmeans_trainset_fraction;
-  }
+  if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); }
 }
 
 template <typename T, typename IdxT>
@@ -82,6 +79,17 @@ void parse_build_param(const nlohmann::json& conf,
   if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); }
   if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); }
   if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); }
+  if (conf.contains("codebook_kind")) {
+    std::string kind = conf.at("codebook_kind");
+    if (kind == "cluster") {
+      param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_CLUSTER;
+    } else if (kind == "subspace") {
+      param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE;
+    } else {
+      throw std::runtime_error("codebook_kind: '" + kind +
+                               "', should be either 'cluster' or 'subspace'");
+    }
+  }
 }
 
 template <typename T, typename IdxT>
@@ -139,6 +147,13 @@ void parse_build_param(const nlohmann::json& conf,
   if (conf.contains("intermediate_graph_degree")) {
     param.intermediate_graph_degree = conf.at("intermediate_graph_degree");
   }
+  if (conf.contains("graph_build_algo")) {
+    if (conf.at("graph_build_algo") == "IVF_PQ") {
+      param.build_algo = raft::neighbors::cagra::graph_build_algo::IVF_PQ;
+    } else if (conf.at("graph_build_algo") == "NN_DESCENT") {
+      param.build_algo = raft::neighbors::cagra::graph_build_algo::NN_DESCENT;
+    }
+  }
 }
 
 template <typename T, typename IdxT>
diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
index 1554c1f016..8f1e43a706 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
@@ -63,9 +63,14 @@ class RaftIvfPQ : public ANN<T> {
     rmm::mr::set_current_device_resource(&mr_);
     index_params_.metric = parse_metric_type(metric);
     RAFT_CUDA_TRY(cudaGetDevice(&device_));
+    RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming));
   }
 
-  ~RaftIvfPQ() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); }
+  ~RaftIvfPQ() noexcept
+  {
+    RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_));
+    rmm::mr::set_current_device_resource(mr_.get_upstream());
+  }
 
   void build(const T* dataset, size_t nrow, cudaStream_t stream) final;
 
@@ -96,6 +101,7 @@ class RaftIvfPQ : public ANN<T> {
   // `mr_` must go first to make sure it dies last
   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr_;
   raft::device_resources handle_;
+  cudaEvent_t sync_{nullptr};
   BuildParam index_params_;
   raft::neighbors::ivf_pq::search_params search_params_;
   std::optional<raft::neighbors::ivf_pq::index<IdxT>> index_;
@@ -103,6 +109,12 @@ class RaftIvfPQ : public ANN<T> {
   int dimension_;
   float refine_ratio_ = 1.0;
   raft::device_matrix_view<const T, IdxT> dataset_;
+
+  void stream_wait(cudaStream_t stream) const
+  {
+    RAFT_CUDA_TRY(cudaEventRecord(sync_, resource::get_cuda_stream(handle_)));
+    RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_));
+  }
 };
 
 template <typename T, typename IdxT>
@@ -121,12 +133,12 @@ void RaftIvfPQ<T, IdxT>::load(const std::string& file)
 }
 
 template <typename T, typename IdxT>
-void RaftIvfPQ<T, IdxT>::build(const T* dataset, size_t nrow, cudaStream_t)
+void RaftIvfPQ<T, IdxT>::build(const T* dataset, size_t nrow, cudaStream_t stream)
 {
   auto dataset_v = raft::make_device_matrix_view<const T, IdxT>(dataset, IdxT(nrow), dim_);
 
   index_.emplace(raft::runtime::neighbors::ivf_pq::build(handle_, index_params_, dataset_v));
-  return;
+  stream_wait(stream);
 }
 
 template <typename T, typename IdxT>
@@ -176,16 +188,14 @@ void RaftIvfPQ<T, IdxT>::search(const T* queries,
                                        neighbors_v,
                                        distances_v,
                                        index_->metric());
+      stream_wait(stream);  // RAFT stream -> bench stream
     } else {
       auto queries_host    = raft::make_host_matrix<T, IdxT>(batch_size, index_->dim());
       auto candidates_host = raft::make_host_matrix<IdxT, IdxT>(batch_size, k0);
       auto neighbors_host  = raft::make_host_matrix<IdxT, IdxT>(batch_size, k);
       auto distances_host  = raft::make_host_matrix<float, IdxT>(batch_size, k);
 
-      raft::copy(queries_host.data_handle(),
-                 queries,
-                 queries_host.size(),
-                 resource::get_cuda_stream(handle_));
+      raft::copy(queries_host.data_handle(), queries, queries_host.size(), stream);
       raft::copy(candidates_host.data_handle(),
                  candidates.data_handle(),
                  candidates_host.size(),
@@ -194,6 +204,10 @@ void RaftIvfPQ<T, IdxT>::search(const T* queries,
       auto dataset_v = raft::make_host_matrix_view<const T, IdxT>(
         dataset_.data_handle(), dataset_.extent(0), dataset_.extent(1));
 
+      // wait for the queries to copy to host in `stream` and for IVF-PQ::search to finish
+      stream_wait(stream);  // RAFT stream -> `stream`; a 2nd record on sync_ would drop the 1st
+      RAFT_CUDA_TRY(cudaEventRecord(sync_, stream));
+      RAFT_CUDA_TRY(cudaEventSynchronize(sync_));
       raft::runtime::neighbors::refine(handle_,
                                        dataset_v,
                                        queries_host.view(),
@@ -202,14 +216,8 @@ void RaftIvfPQ<T, IdxT>::search(const T* queries,
                                        distances_host.view(),
                                        index_->metric());
 
-      raft::copy(neighbors,
-                 (size_t*)neighbors_host.data_handle(),
-                 neighbors_host.size(),
-                 resource::get_cuda_stream(handle_));
-      raft::copy(distances,
-                 distances_host.data_handle(),
-                 distances_host.size(),
-                 resource::get_cuda_stream(handle_));
+      raft::copy(neighbors, (size_t*)neighbors_host.data_handle(), neighbors_host.size(), stream);
+      raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream);
     }
   } else {
     auto queries_v =
@@ -219,8 +227,7 @@ void RaftIvfPQ<T, IdxT>::search(const T* queries,
 
     raft::runtime::neighbors::ivf_pq::search(
       handle_, search_params_, *index_, queries_v, neighbors_v, distances_v);
+    stream_wait(stream);  // RAFT stream -> bench stream
   }
-  resource::sync_stream(handle_);
-  return;
 }
 }  // namespace raft::bench::ann
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index e8d4739384..ca4b0f099d 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -77,6 +77,7 @@ if(BUILD_PRIMS_BENCH)
     NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu
     bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
   )
+  ConfigureBench(NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/main.cpp)
 
   ConfigureBench(
     NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu
@@ -155,4 +156,5 @@ if(BUILD_PRIMS_BENCH)
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
+
 endif()
diff --git a/cpp/bench/prims/core/bitset.cu b/cpp/bench/prims/core/bitset.cu
new file mode 100644
index 0000000000..5f44aa9af5
--- /dev/null
+++ b/cpp/bench/prims/core/bitset.cu
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/core/bitset.cuh>
+#include <raft/core/device_mdspan.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::core {
+
+struct bitset_inputs {
+  uint32_t bitset_len;
+  uint32_t mask_len;
+  uint32_t query_len;
+};  // struct bitset_inputs
+
+template <typename bitset_t, typename index_t>
+struct bitset_bench : public fixture {
+  bitset_bench(const bitset_inputs& p)
+    : params(p),
+      mask{raft::make_device_vector<index_t, index_t>(res, p.mask_len)},
+      queries{raft::make_device_vector<index_t, index_t>(res, p.query_len)},
+      outputs{raft::make_device_vector<bool, index_t>(res, p.query_len)}
+  {
+    raft::random::RngState state{42};
+    raft::random::uniformInt(res, state, mask.view(), index_t{0}, index_t{p.bitset_len});
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      auto my_bitset = raft::core::bitset<bitset_t, index_t>(
+        this->res, raft::make_const_mdspan(mask.view()), params.bitset_len);
+      my_bitset.test(res, raft::make_const_mdspan(queries.view()), outputs.view());
+    });
+  }
+
+ private:
+  raft::resources res;
+  bitset_inputs params;
+  raft::device_vector<index_t, index_t> mask, queries;
+  raft::device_vector<bool, index_t> outputs;
+};  // struct bitset_bench
+
+const std::vector<bitset_inputs> bitset_input_vecs{
+  {256 * 1024 * 1024, 64 * 1024 * 1024, 256 * 1024 * 1024},    // Standard Bench
+  {256 * 1024 * 1024, 64 * 1024 * 1024, 1024 * 1024 * 1024},   // Extra queries
+  {128 * 1024 * 1024, 1024 * 1024 * 1024, 256 * 1024 * 1024},  // Extra mask to test atomics impact
+};
+
+using Uint8_32  = bitset_bench<uint8_t, uint32_t>;  // <bitset_t width>_<index_t width>
+using Uint16_64 = bitset_bench<uint16_t, uint64_t>;
+using Uint32_32 = bitset_bench<uint32_t, uint32_t>;
+using Uint32_64 = bitset_bench<uint32_t, uint64_t>;
+
+RAFT_BENCH_REGISTER(Uint8_32, "", bitset_input_vecs);
+RAFT_BENCH_REGISTER(Uint16_64, "", bitset_input_vecs);
+RAFT_BENCH_REGISTER(Uint32_32, "", bitset_input_vecs);
+RAFT_BENCH_REGISTER(Uint32_64, "", bitset_input_vecs);
+
+}  // namespace raft::bench::core
diff --git a/cpp/bench/prims/matrix/select_k.cu b/cpp/bench/prims/matrix/select_k.cu
index 1bff66cac4..d3994e59c5 100644
--- a/cpp/bench/prims/matrix/select_k.cu
+++ b/cpp/bench/prims/matrix/select_k.cu
@@ -19,6 +19,7 @@
 #include <common/benchmark.hpp>
 
 #include <raft/core/device_resources.hpp>
+#include <raft/core/nvtx.hpp>
 #include <raft/random/rng.cuh>
 #include <raft/sparse/detail/utils.h>
 #include <raft/util/cudart_utils.hpp>
@@ -38,6 +39,19 @@
 namespace raft::matrix {
 using namespace raft::bench;  // NOLINT
 
+template <typename KeyT>
+struct replace_with_mask {
+  KeyT replacement;
+  int64_t line_length;
+  int64_t spared_inputs;
+  constexpr auto inline operator()(int64_t offset, KeyT x, uint8_t mask) -> KeyT
+  {
+    auto i = offset % line_length;
+    // don't replace all the inputs, spare a few elements at the beginning of the input
+    return (mask && i >= spared_inputs) ? replacement : x;
+  }
+};
+
 template <typename KeyT, typename IdxT, select::Algo Algo>
 struct selection : public fixture {
   explicit selection(const select::params& p)
@@ -67,6 +81,21 @@ struct selection : public fixture {
       }
     }
     raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), min_value, max_value);
+    if (p.frac_infinities > 0.0) {
+      rmm::device_uvector<uint8_t> mask_buf(p.batch_size * p.len, stream);
+      auto mask = make_device_vector_view<uint8_t, size_t>(mask_buf.data(), mask_buf.size());
+      raft::random::bernoulli(handle, state, mask, p.frac_infinities);
+      KeyT bound = p.select_min ? raft::upper_bound<KeyT>() : raft::lower_bound<KeyT>();
+      auto mask_in =
+        make_device_vector_view<const uint8_t, size_t>(mask_buf.data(), mask_buf.size());
+      auto dists_in  = make_device_vector_view<const KeyT>(in_dists_.data(), in_dists_.size());
+      auto dists_out = make_device_vector_view<KeyT>(in_dists_.data(), in_dists_.size());
+      raft::linalg::map_offset(handle,
+                               dists_out,
+                               replace_with_mask<KeyT>{bound, int64_t(p.len), int64_t(p.k / 2)},
+                               dists_in,
+                               mask_in);
+    }
   }
 
   void run_benchmark(::benchmark::State& state) override  // NOLINT
@@ -75,8 +104,12 @@ struct selection : public fixture {
       std::ostringstream label_stream;
       label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k;
       if (params_.use_same_leading_bits) { label_stream << "#same-leading-bits"; }
+      if (params_.frac_infinities > 0) { label_stream << "#infs-" << params_.frac_infinities; }
       state.SetLabel(label_stream.str());
-      loop_on_state(state, [this]() {
+      common::nvtx::range case_scope("%s - %s", state.name().c_str(), label_stream.str().c_str());
+      int iter = 0;
+      loop_on_state(state, [&iter, this]() {
+        common::nvtx::range lap_scope("lap-%d", iter++);
         select::select_k_impl<KeyT, IdxT>(handle,
                                           Algo,
                                           in_dists_.data(),
@@ -149,6 +182,35 @@ const std::vector<select::params> kInputs{
   {10, 1000000, 64, true, false, true},
   {10, 1000000, 128, true, false, true},
   {10, 1000000, 256, true, false, true},
+
+  {10, 1000000, 1, true, false, false, true, 0.1},
+  {10, 1000000, 16, true, false, false, true, 0.1},
+  {10, 1000000, 64, true, false, false, true, 0.1},
+  {10, 1000000, 128, true, false, false, true, 0.1},
+  {10, 1000000, 256, true, false, false, true, 0.1},
+
+  {10, 1000000, 1, true, false, false, true, 0.9},
+  {10, 1000000, 16, true, false, false, true, 0.9},
+  {10, 1000000, 64, true, false, false, true, 0.9},
+  {10, 1000000, 128, true, false, false, true, 0.9},
+  {10, 1000000, 256, true, false, false, true, 0.9},
+  {1000, 10000, 1, true, false, false, true, 0.9},
+  {1000, 10000, 16, true, false, false, true, 0.9},
+  {1000, 10000, 64, true, false, false, true, 0.9},
+  {1000, 10000, 128, true, false, false, true, 0.9},
+  {1000, 10000, 256, true, false, false, true, 0.9},
+
+  {10, 1000000, 1, true, false, false, true, 1.0},
+  {10, 1000000, 16, true, false, false, true, 1.0},
+  {10, 1000000, 64, true, false, false, true, 1.0},
+  {10, 1000000, 128, true, false, false, true, 1.0},
+  {10, 1000000, 256, true, false, false, true, 1.0},
+  {1000, 10000, 1, true, false, false, true, 1.0},
+  {1000, 10000, 16, true, false, false, true, 1.0},
+  {1000, 10000, 64, true, false, false, true, 1.0},
+  {1000, 10000, 128, true, false, false, true, 1.0},
+  {1000, 10000, 256, true, false, false, true, 1.0},
+  {1000, 10000, 256, true, false, false, true, 0.999},
 };
 
 #define SELECTION_REGISTER(KeyT, IdxT, A)                        \
@@ -157,28 +219,28 @@ const std::vector<select::params> kInputs{
   RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \
   }
 
-SELECTION_REGISTER(float, uint32_t, kPublicApi);              // NOLINT
-SELECTION_REGISTER(float, uint32_t, kRadix8bits);             // NOLINT
-SELECTION_REGISTER(float, uint32_t, kRadix11bits);            // NOLINT
-SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass);   // NOLINT
-SELECTION_REGISTER(float, uint32_t, kWarpAuto);               // NOLINT
-SELECTION_REGISTER(float, uint32_t, kWarpImmediate);          // NOLINT
-SELECTION_REGISTER(float, uint32_t, kWarpFiltered);           // NOLINT
-SELECTION_REGISTER(float, uint32_t, kWarpDistributed);        // NOLINT
-SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm);     // NOLINT
+SELECTION_REGISTER(float, uint32_t, kPublicApi);             // NOLINT
+SELECTION_REGISTER(float, uint32_t, kRadix8bits);            // NOLINT
+SELECTION_REGISTER(float, uint32_t, kRadix11bits);           // NOLINT
+SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass);  // NOLINT
+SELECTION_REGISTER(float, uint32_t, kWarpAuto);              // NOLINT
+SELECTION_REGISTER(float, uint32_t, kWarpImmediate);         // NOLINT
+SELECTION_REGISTER(float, uint32_t, kWarpFiltered);          // NOLINT
+SELECTION_REGISTER(float, uint32_t, kWarpDistributed);       // NOLINT
+SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm);    // NOLINT
 
 SELECTION_REGISTER(double, uint32_t, kRadix8bits);            // NOLINT
 SELECTION_REGISTER(double, uint32_t, kRadix11bits);           // NOLINT
 SELECTION_REGISTER(double, uint32_t, kRadix11bitsExtraPass);  // NOLINT
 SELECTION_REGISTER(double, uint32_t, kWarpAuto);              // NOLINT
 
-SELECTION_REGISTER(double, int64_t, kRadix8bits);             // NOLINT
-SELECTION_REGISTER(double, int64_t, kRadix11bits);            // NOLINT
-SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass);   // NOLINT
-SELECTION_REGISTER(double, int64_t, kWarpImmediate);          // NOLINT
-SELECTION_REGISTER(double, int64_t, kWarpFiltered);           // NOLINT
-SELECTION_REGISTER(double, int64_t, kWarpDistributed);        // NOLINT
-SELECTION_REGISTER(double, int64_t, kWarpDistributedShm);     // NOLINT
+SELECTION_REGISTER(double, int64_t, kRadix8bits);            // NOLINT
+SELECTION_REGISTER(double, int64_t, kRadix11bits);           // NOLINT
+SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass);  // NOLINT
+SELECTION_REGISTER(double, int64_t, kWarpImmediate);         // NOLINT
+SELECTION_REGISTER(double, int64_t, kWarpFiltered);          // NOLINT
+SELECTION_REGISTER(double, int64_t, kWarpDistributed);       // NOLINT
+SELECTION_REGISTER(double, int64_t, kWarpDistributedShm);    // NOLINT
 
 // For learning a heuristic of which selection algorithm to use, we
 // have a couple of additional constraints when generating the dataset:
diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
index 866a0ebdfa..ade3a6e348 100644
--- a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
@@ -438,7 +438,7 @@ __global__ void __launch_bounds__((WarpSize * BlockDimY))
   adjust_centers_kernel(MathT* centers,  // [n_clusters, dim]
                         IdxT n_clusters,
                         IdxT dim,
-                        const T* dataset,               // [n_rows, dim]
+                        const T* dataset,  // [n_rows, dim]
                         IdxT n_rows,
                         const LabelT* labels,           // [n_rows]
                         const CounterT* cluster_sizes,  // [n_clusters]
diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh
new file mode 100644
index 0000000000..6747c5fab0
--- /dev/null
+++ b/cpp/include/raft/core/bitset.cuh
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_container_policy.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/util/device_atomics.cuh>
+#include <thrust/for_each.h>
+
+namespace raft::core {
+/**
+ * @defgroup bitset Bitset
+ * @{
+ */
+/**
+ * @brief View of a RAFT Bitset.
+ *
+ * This lightweight structure stores a pointer to a bitset in device memory with its length.
+ * It provides a test() device function to check if a given index is set in the bitset.
+ *
+ * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t.
+ * @tparam index_t Indexing type used. Default is uint32_t.
+ */
+template <typename bitset_t = uint32_t, typename index_t = uint32_t>
+struct bitset_view {
+  /** @brief Number of bits held by one bitset_t element. */
+  index_t static constexpr const bitset_element_size = sizeof(bitset_t) * 8;
+
+  /** @brief Create a bitset view from a raw pointer and the number of bits it covers. */
+  _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr, index_t bitset_len)
+    : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len}
+  {
+  }
+  /**
+   * @brief Create a bitset view from a device vector view of the bitset.
+   *
+   * @param bitset_span Device vector view of the bitset
+   * @param bitset_len Number of bits in the bitset
+   */
+  _RAFT_HOST_DEVICE bitset_view(raft::device_vector_view<bitset_t, index_t> bitset_span,
+                                index_t bitset_len)
+    : bitset_ptr_{bitset_span.data_handle()}, bitset_len_{bitset_len}
+  {
+  }
+  /**
+   * @brief Device function to test if a given index is set in the bitset.
+   *
+   * No bounds check is performed: sample_index is not compared against the bitset length.
+   *
+   * @param sample_index Single index to test
+   * @return bool True if the bit at sample_index is set
+   */
+  inline _RAFT_DEVICE auto test(const index_t sample_index) const -> bool
+  {
+    const bitset_t bit_element = bitset_ptr_[sample_index / bitset_element_size];
+    const index_t bit_index    = sample_index % bitset_element_size;
+    const bool is_bit_set      = (bit_element & (bitset_t{1} << bit_index)) != 0;
+    return is_bit_set;
+  }
+
+  /** @brief Get the device pointer to the bitset. */
+  inline _RAFT_HOST_DEVICE auto data_handle() -> bitset_t* { return bitset_ptr_; }
+  inline _RAFT_HOST_DEVICE auto data_handle() const -> const bitset_t* { return bitset_ptr_; }
+  /** @brief Get the number of bits of the bitset representation. */
+  inline _RAFT_HOST_DEVICE auto size() const -> index_t { return bitset_len_; }
+
+  /** @brief Get the number of elements used by the bitset representation. */
+  inline _RAFT_HOST_DEVICE auto n_elements() const -> index_t
+  {
+    return raft::ceildiv(bitset_len_, bitset_element_size);
+  }
+
+  /** @brief Get an mdspan view of the bitset elements. */
+  inline auto to_mdspan() -> raft::device_vector_view<bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<bitset_t, index_t>(bitset_ptr_, n_elements());
+  }
+  /** @brief Get a read-only mdspan view of the bitset elements. */
+  inline auto to_mdspan() const -> raft::device_vector_view<const bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
+  }
+
+ private:
+  bitset_t* bitset_ptr_;  // first element of the bitset storage (not owned by the view)
+  index_t bitset_len_;    // number of bits in the bitset
+};
+
+/**
+ * @brief RAFT Bitset.
+ *
+ * This structure encapsulates a bitset in device memory. It provides a view() method to get a
+ * device-usable lightweight view of the bitset.
+ * Each index is represented by a single bit in the bitset. Storage is rounded up to whole
+ * elements: ceil(bitset_len / (8 * sizeof(bitset_t))) * sizeof(bitset_t) bytes are used.
+ * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t.
+ * @tparam index_t Indexing type used. Default is uint32_t.
+ */
+template <typename bitset_t = uint32_t, typename index_t = uint32_t>
+struct bitset {
+  /** @brief Number of bits held by one bitset_t element. */
+  index_t static constexpr const bitset_element_size = sizeof(bitset_t) * 8;
+
+  /**
+   * @brief Construct a new bitset object and assign !default_value to a list of indices.
+   *
+   * @param res RAFT resources
+   * @param mask_index List of indices whose bits are set to the opposite of default_value
+   * @param bitset_len Length of the bitset
+   * @param default_value Default value to set the bits to. Default is true.
+   */
+  bitset(const raft::resources& res,
+         raft::device_vector_view<const index_t, index_t> mask_index,
+         index_t bitset_len,
+         bool default_value = true)
+    : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)),
+              raft::resource::get_cuda_stream(res)},
+      bitset_len_{bitset_len},
+      default_value_{default_value}
+  {
+    // Fill the storage with the default value; RAFT_CUDA_TRY reports a failed memset.
+    RAFT_CUDA_TRY(cudaMemsetAsync(bitset_.data(),
+                                  default_value ? 0xff : 0x00,
+                                  n_elements() * sizeof(bitset_t),
+                                  resource::get_cuda_stream(res)));
+    set(res, mask_index, !default_value);
+  }
+
+  /**
+   * @brief Construct a new bitset object
+   *
+   * @param res RAFT resources
+   * @param bitset_len Length of the bitset
+   * @param default_value Default value to set the bits to. Default is true.
+   */
+  bitset(const raft::resources& res, index_t bitset_len, bool default_value = true)
+    : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)),
+              resource::get_cuda_stream(res)},
+      bitset_len_{bitset_len},
+      default_value_{default_value}
+  {
+    RAFT_CUDA_TRY(cudaMemsetAsync(bitset_.data(),
+                                  default_value ? 0xff : 0x00,
+                                  n_elements() * sizeof(bitset_t),
+                                  resource::get_cuda_stream(res)));
+  }
+  // Copying is disabled; the bitset can only be moved
+  bitset(const bitset&)            = delete;
+  bitset(bitset&&)                 = default;
+  bitset& operator=(const bitset&) = delete;
+  bitset& operator=(bitset&&)      = default;
+
+  /**
+   * @brief Create a device-usable view of the bitset.
+   *
+   * @return bitset_view<bitset_t, index_t>
+   */
+  inline auto view() -> raft::core::bitset_view<bitset_t, index_t>
+  {
+    return bitset_view<bitset_t, index_t>(to_mdspan(), bitset_len_);
+  }
+  [[nodiscard]] inline auto view() const -> raft::core::bitset_view<const bitset_t, index_t>
+  {
+    return bitset_view<const bitset_t, index_t>(to_mdspan(), bitset_len_);
+  }
+
+  /** @brief Get the device pointer to the bitset. */
+  inline auto data_handle() -> bitset_t* { return bitset_.data(); }
+  inline auto data_handle() const -> const bitset_t* { return bitset_.data(); }
+  /** @brief Get the number of bits of the bitset representation. */
+  inline auto size() const -> index_t { return bitset_len_; }
+
+  /** @brief Get the number of elements used by the bitset representation. */
+  inline auto n_elements() const -> index_t
+  {
+    return raft::ceildiv(bitset_len_, bitset_element_size);
+  }
+
+  /** @brief Get an mdspan view of the current bitset */
+  inline auto to_mdspan() -> raft::device_vector_view<bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<bitset_t, index_t>(bitset_.data(), n_elements());
+  }
+  [[nodiscard]] inline auto to_mdspan() const -> raft::device_vector_view<const bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<const bitset_t, index_t>(bitset_.data(), n_elements());
+  }
+
+  /**
+   * @brief Resize the bitset. If the requested size is larger, the newly added elements are set
+   * to the default value.
+   * @param res RAFT resources
+   * @param new_bitset_len New number of bits in the bitset
+   */
+  void resize(const raft::resources& res, index_t new_bitset_len)
+  {
+    auto old_size = raft::ceildiv(bitset_len_, bitset_element_size);
+    auto new_size = raft::ceildiv(new_bitset_len, bitset_element_size);
+    bitset_.resize(new_size);
+    bitset_len_ = new_bitset_len;
+    if (old_size < new_size) {
+      // If the new size is larger, set the new bits to the default value
+      RAFT_CUDA_TRY(cudaMemsetAsync(bitset_.data() + old_size,
+                                    default_value_ ? 0xff : 0x00,
+                                    (new_size - old_size) * sizeof(bitset_t),
+                                    resource::get_cuda_stream(res)));
+    }
+  }
+
+  /**
+   * @brief Test a list of indices in a bitset.
+   *
+   * @tparam output_t Output type of the test. Default is bool.
+   * @param res RAFT resources
+   * @param queries List of indices to test
+   * @param output List of outputs
+   */
+  template <typename output_t = bool>
+  void test(const raft::resources& res,
+            raft::device_vector_view<const index_t, index_t> queries,
+            raft::device_vector_view<output_t, index_t> output) const
+  {
+    RAFT_EXPECTS(output.extent(0) == queries.extent(0), "Output and queries must be same size");
+    auto bitset_view = view();
+    raft::linalg::map(
+      res,
+      output,
+      [bitset_view] __device__(index_t query) { return output_t(bitset_view.test(query)); },
+      queries);
+  }
+  /**
+   * @brief Set a list of indices in a bitset to set_value.
+   *
+   * @param res RAFT resources
+   * @param mask_index Indices of the bits to assign set_value to
+   * @param set_value Value to set the bits to (true or false)
+   */
+  void set(const raft::resources& res,
+           raft::device_vector_view<const index_t, index_t> mask_index,
+           bool set_value = false)
+  {
+    auto* bitset_ptr = this->data_handle();
+    thrust::for_each_n(resource::get_thrust_policy(res),
+                       mask_index.data_handle(),
+                       mask_index.extent(0),
+                       [bitset_ptr, set_value] __device__(const index_t sample_index) {
+                         const index_t bit_element = sample_index / bitset_element_size;
+                         const index_t bit_index   = sample_index % bitset_element_size;
+                         const bitset_t bitmask    = bitset_t{1} << bit_index;
+                         if (set_value) {
+                           atomicOr(bitset_ptr + bit_element, bitmask);
+                         } else {
+                           const bitset_t bitmask2 = ~bitmask;
+                           atomicAnd(bitset_ptr + bit_element, bitmask2);
+                         }
+                       });
+  }
+  /**
+   * @brief Flip all the bits in a bitset.
+   *
+   * @param res RAFT resources
+   */
+  void flip(const raft::resources& res)
+  {
+    auto bitset_span = this->to_mdspan();
+    raft::linalg::map(
+      res,
+      bitset_span,
+      [] __device__(bitset_t element) { return bitset_t(~element); },
+      raft::make_const_mdspan(bitset_span));
+  }
+  /**
+   * @brief Reset all the bits in the bitset to the default value.
+   *
+   * @param res RAFT resources
+   */
+  void reset(const raft::resources& res)
+  {
+    RAFT_CUDA_TRY(cudaMemsetAsync(bitset_.data(),
+                                  default_value_ ? 0xff : 0x00,
+                                  n_elements() * sizeof(bitset_t),
+                                  resource::get_cuda_stream(res)));
+  }
+
+ private:
+  raft::device_uvector<bitset_t> bitset_;  // device storage, one bit per index
+  index_t bitset_len_;                     // number of bits in the bitset
+  bool default_value_;                     // fill value for construction, resize() and reset()
+};
+
+/** @} */
+}  // end namespace raft::core
diff --git a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
index 328080da1f..8e41aa96f3 100644
--- a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
+++ b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
@@ -75,7 +75,7 @@ namespace numpy_serializer {
 
 #if RAFT_SYSTEM_LITTLE_ENDIAN == 1
 #define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_LITTLE_ENDIAN_CHAR
-#else   // RAFT_SYSTEM_LITTLE_ENDIAN == 1
+#else  // RAFT_SYSTEM_LITTLE_ENDIAN == 1
 #define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_BIG_ENDIAN_CHAR
 #endif  // RAFT_SYSTEM_LITTLE_ENDIAN == 1
 
diff --git a/cpp/include/raft/core/detail/nvtx.hpp b/cpp/include/raft/core/detail/nvtx.hpp
index e734c99029..f077a49b77 100644
--- a/cpp/include/raft/core/detail/nvtx.hpp
+++ b/cpp/include/raft/core/detail/nvtx.hpp
@@ -193,7 +193,7 @@ inline void pop_range()
 
 }  // namespace raft::common::nvtx::detail
 
-#else   // NVTX_ENABLED
+#else  // NVTX_ENABLED
 
 namespace raft::common::nvtx::detail {
 
diff --git a/cpp/include/raft/core/kvp.hpp b/cpp/include/raft/core/kvp.hpp
index 2e0d1117a1..192d160d45 100644
--- a/cpp/include/raft/core/kvp.hpp
+++ b/cpp/include/raft/core/kvp.hpp
@@ -32,8 +32,8 @@ struct KeyValuePair {
   typedef _Key Key;      ///< Key data type
   typedef _Value Value;  ///< Value data type
 
-  Key key;               ///< Item key
-  Value value;           ///< Item value
+  Key key;      ///< Item key
+  Value value;  ///< Item value
 
   /// Constructor
   RAFT_INLINE_FUNCTION KeyValuePair() {}
diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp
index 2dc4eb1f9d..8e331293bf 100644
--- a/cpp/include/raft/core/resource/resource_types.hpp
+++ b/cpp/include/raft/core/resource/resource_types.hpp
@@ -42,7 +42,7 @@ enum resource_type {
   THRUST_POLICY,           // thrust execution policy
   WORKSPACE_RESOURCE,      // rmm device memory resource
 
-  LAST_KEY                 // reserved for the last key
+  LAST_KEY  // reserved for the last key
 };
 
 /**
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h b/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h
index 10827a8778..f659ed256d 100644
--- a/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h
@@ -397,7 +397,7 @@ class EpilogueWithBroadcastCustom : public EpilogueBase<Shape_,
     TensorTileIterator
       tensor_iterator,  ///< Threadblock tile iterator for additional tensor operand
     MatrixCoord const&
-      problem_size =    ///< Problem size needed to guard against out-of-bounds accesses
+      problem_size =  ///< Problem size needed to guard against out-of-bounds accesses
     MatrixCoord(Shape::kM, Shape::kN),
     MatrixCoord const&
       threadblock_offset =  ///< Threadblock's initial offset within the problem size space
@@ -418,7 +418,7 @@ class EpilogueWithBroadcastCustom : public EpilogueBase<Shape_,
       broadcast_fragment,  ///< Fragment containing the accumulated partial reduction over columns
     ElementVector const* broadcast_ptr,  ///< Broadcast vector
     MatrixCoord const&
-      problem_size,       ///< Problem size needed to guard against out-of-bounds accesses
+      problem_size,  ///< Problem size needed to guard against out-of-bounds accesses
     MatrixCoord const&
       threadblock_offset  ///< Threadblock's initial offset within the problem size space
   )
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/predicated_tile_iterator_normvec_smem.h b/cpp/include/raft/distance/detail/fused_distance_nn/predicated_tile_iterator_normvec_smem.h
index c35a64f105..14c09f6aef 100644
--- a/cpp/include/raft/distance/detail/fused_distance_nn/predicated_tile_iterator_normvec_smem.h
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/predicated_tile_iterator_normvec_smem.h
@@ -88,8 +88,8 @@ namespace threadblock {
 ///
 /// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
 ///
-template <typename ThreadMap_,        ///< Thread map (conept: OutputTileThreadMap)
-          typename Element_,          ///< Element data type
+template <typename ThreadMap_,  ///< Thread map (concept: OutputTileThreadMap)
+          typename Element_,    ///< Element data type
           typename Layout_,
           bool ScatterD     = false,  ///< Scatter D operand or not
           bool UseCUDAStore = false>
diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh
index 68922943f4..f0f12acdb1 100644
--- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh
+++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include <cstddef>                                          // size_t
-#include <limits>                                           // std::numeric_limits
-#include <raft/core/kvp.hpp>                                // raft::KeyValuePair
-#include <raft/core/operators.hpp>                          // raft::identity_op
-#include <raft/distance/detail/distance_ops/l2_exp.cuh>     // ops::l2_exp_distance_op
+#include <cstddef>                                       // size_t
+#include <limits>                                        // std::numeric_limits
+#include <raft/core/kvp.hpp>                             // raft::KeyValuePair
+#include <raft/core/operators.hpp>                       // raft::identity_op
+#include <raft/distance/detail/distance_ops/l2_exp.cuh>  // ops::l2_exp_distance_op
 #include <raft/distance/detail/fused_distance_nn/cutlass_base.cuh>
 #include <raft/distance/detail/pairwise_distance_base.cuh>  // PairwiseDistances
 #include <raft/linalg/contractions.cuh>                     // Policy
diff --git a/cpp/include/raft/distance/detail/masked_distance_base.cuh b/cpp/include/raft/distance/detail/masked_distance_base.cuh
index 5a33c9ce4a..55da634145 100644
--- a/cpp/include/raft/distance/detail/masked_distance_base.cuh
+++ b/cpp/include/raft/distance/detail/masked_distance_base.cuh
@@ -217,7 +217,7 @@ struct MaskedDistances : public BaseClass {
         }  // tile_idx_n
       }    // idx_g
       rowEpilog_op(tile_idx_m);
-    }      // tile_idx_m
+    }  // tile_idx_m
   }
 
  private:
diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
index 58b5daa8ca..c6b09be31e 100644
--- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
@@ -18,7 +18,7 @@
 #include <raft/util/cuda_dev_essentials.cuh>  // ceildiv
 #include <raft/util/cuda_rt_essentials.hpp>   // RAFT_CUDA_TRY
 
-#include <cstddef>                            // size_t
+#include <cstddef>  // size_t
 
 namespace raft {
 namespace distance {
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
index dd58ab4328..e1dc6f9b37 100644
--- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
@@ -45,7 +45,7 @@ void pairwise_matrix_dispatch(OpT distance_op,
                               cudaStream_t stream,
                               bool is_row_major) RAFT_EXPLICIT;
 
-};      // namespace raft::distance::detail
+};  // namespace raft::distance::detail
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
diff --git a/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h b/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h
index cd748b9e6b..951f8a0132 100644
--- a/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h
+++ b/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h
@@ -57,8 +57,8 @@ namespace threadblock {
 ///
 /// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
 ///
-template <typename ThreadMap_,        ///< Thread map (conept: OutputTileThreadMap)
-          typename Element_,          ///< Element data type
+template <typename ThreadMap_,  ///< Thread map (concept: OutputTileThreadMap)
+          typename Element_,    ///< Element data type
           typename Layout_,
           bool ScatterD     = false,  ///< Scatter D operand or not
           bool UseCUDAStore = false>
diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh
index 3f7f2b0a23..7171ba605f 100644
--- a/cpp/include/raft/distance/distance-ext.cuh
+++ b/cpp/include/raft/distance/distance-ext.cuh
@@ -140,8 +140,8 @@ void pairwise_distance(raft::resources const& handle,
                        raft::distance::DistanceType metric,
                        Type metric_arg = 2.0f) RAFT_EXPLICIT;
 
-};      // namespace distance
-};      // namespace raft
+};  // namespace distance
+};  // namespace raft
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh
index 30f4a2d167..b2cd736c57 100644
--- a/cpp/include/raft/linalg/add.cuh
+++ b/cpp/include/raft/linalg/add.cuh
@@ -217,7 +217,7 @@ void add_scalar(raft::resources const& handle,
 
 /** @} */  // end of group add
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh
index f6889e959b..03beb1d1d1 100644
--- a/cpp/include/raft/linalg/binary_op.cuh
+++ b/cpp/include/raft/linalg/binary_op.cuh
@@ -82,7 +82,7 @@ void binary_op(raft::resources const& handle, InType in1, InType in2, OutType ou
 
 /** @} */  // end of group binary_op
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh
index 5609656234..afa58d73fc 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/coalesced_reduction.cuh
@@ -160,7 +160,7 @@ void coalesced_reduction(raft::resources const& handle,
 
 /** @} */  // end of group coalesced_reduction
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh
index 3b1e8c41c4..cb6488bedf 100644
--- a/cpp/include/raft/linalg/contractions.cuh
+++ b/cpp/include/raft/linalg/contractions.cuh
@@ -100,7 +100,7 @@ struct KernelPolicy {
     SmemSize = 2 * SmemPage * sizeof(DataT),
   };  // enum
 
-};    // struct KernelPolicy
+};  // struct KernelPolicy
 
 template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
 struct ColKernelPolicy {
diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
index 5a7356a4c2..d15e343c9a 100644
--- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
@@ -550,7 +550,7 @@ cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,
 template <>
 inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
                                          int n,
-                                         float* const A[],       // NOLINT
+                                         float* const A[],  // NOLINT
                                          int lda,
                                          int* P,
                                          int* info,
@@ -564,7 +564,7 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
 template <>
 inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
                                          int n,
-                                         double* const A[],      // NOLINT
+                                         double* const A[],  // NOLINT
                                          int lda,
                                          int* P,
                                          int* info,
diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh
index d617b065da..17ec5c3136 100644
--- a/cpp/include/raft/linalg/divide.cuh
+++ b/cpp/include/raft/linalg/divide.cuh
@@ -96,7 +96,7 @@ void divide_scalar(raft::resources const& handle,
 
 /** @} */  // end of group add
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 954bf19334..57f3b61388 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -220,7 +220,7 @@ void eig_jacobi(raft::resources const& handle,
 
 /** @} */  // end of eig
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/gemv.cuh b/cpp/include/raft/linalg/gemv.cuh
index 640964d018..610ea07f96 100644
--- a/cpp/include/raft/linalg/gemv.cuh
+++ b/cpp/include/raft/linalg/gemv.cuh
@@ -305,6 +305,6 @@ void gemv(raft::resources const& handle,
 }
 /** @} */  // end of gemv
 
-};         // namespace linalg
-};         // namespace raft
+};  // namespace linalg
+};  // namespace raft
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/lstsq.cuh b/cpp/include/raft/linalg/lstsq.cuh
index 20588cbe17..21575d7806 100644
--- a/cpp/include/raft/linalg/lstsq.cuh
+++ b/cpp/include/raft/linalg/lstsq.cuh
@@ -245,7 +245,7 @@ void lstsq_qr(raft::resources const& handle,
 
 /** @} */  // end of lstsq
 
-};         // namespace linalg
-};         // namespace raft
+};  // namespace linalg
+};  // namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index e620d227eb..a65f6ed390 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -240,7 +240,7 @@ void matrix_vector_op(raft::resources const& handle,
 
 /** @} */  // end of group matrix_vector_op
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh
index d45f11524d..b59a0fcef7 100644
--- a/cpp/include/raft/linalg/mean_squared_error.cuh
+++ b/cpp/include/raft/linalg/mean_squared_error.cuh
@@ -79,7 +79,7 @@ void mean_squared_error(raft::resources const& handle,
 
 /** @} */  // end of group mean_squared_error
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh
index 3ade108235..9973a3cc6c 100644
--- a/cpp/include/raft/linalg/multiply.cuh
+++ b/cpp/include/raft/linalg/multiply.cuh
@@ -98,7 +98,7 @@ void multiply_scalar(
 
 /** @} */  // end of group multiply
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh
index 26ac1035ca..5c7dcbd5cf 100644
--- a/cpp/include/raft/linalg/power.cuh
+++ b/cpp/include/raft/linalg/power.cuh
@@ -154,7 +154,7 @@ void power_scalar(
 
 /** @} */  // end of group add
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh
index a3d0ef71d0..3181dd0224 100644
--- a/cpp/include/raft/linalg/reduce.cuh
+++ b/cpp/include/raft/linalg/reduce.cuh
@@ -162,7 +162,7 @@ void reduce(raft::resources const& handle,
 
 /** @} */  // end of group reduction
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
index 6eaf1e2ba7..5ed0fb7407 100644
--- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
@@ -113,7 +113,7 @@ void reduce_cols_by_key(
 
 /** @} */  // end of group reduce_cols_by_key
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index fa624b2191..7d93c3946f 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -192,7 +192,7 @@ void reduce_rows_by_key(
 
 /** @} */  // end of group reduce_rows_by_key
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh
index 2dece5b957..163f360481 100644
--- a/cpp/include/raft/linalg/rsvd.cuh
+++ b/cpp/include/raft/linalg/rsvd.cuh
@@ -876,7 +876,7 @@ void randomized_svd(const raft::resources& handle,
 
 /** @} */  // end of group rsvd
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh
index 99754c4eb2..81b7ab7dec 100644
--- a/cpp/include/raft/linalg/sqrt.cuh
+++ b/cpp/include/raft/linalg/sqrt.cuh
@@ -84,7 +84,7 @@ void sqrt(raft::resources const& handle, InType in, OutType out)
 
 /** @} */  // end of group add
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh
index f971d0e40b..c7ff000e00 100644
--- a/cpp/include/raft/linalg/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/strided_reduction.cuh
@@ -171,7 +171,7 @@ void strided_reduction(raft::resources const& handle,
 
 /** @} */  // end of group strided_reduction
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh
index 688e60a806..f4243f9dc1 100644
--- a/cpp/include/raft/linalg/subtract.cuh
+++ b/cpp/include/raft/linalg/subtract.cuh
@@ -223,7 +223,7 @@ void subtract_scalar(
 
 /** @} */  // end of group subtract
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 08f9462ba9..f7071de75b 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -416,7 +416,7 @@ void svd_reconstruction(raft::resources const& handle,
 
 /** @} */  // end of group svd
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh
index f46133abd9..67b04c6791 100644
--- a/cpp/include/raft/linalg/ternary_op.cuh
+++ b/cpp/include/raft/linalg/ternary_op.cuh
@@ -83,7 +83,7 @@ void ternary_op(
 
 /** @} */  // end of group ternary_op
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/linalg/transpose.cuh b/cpp/include/raft/linalg/transpose.cuh
index afe1962223..1b46082fbe 100644
--- a/cpp/include/raft/linalg/transpose.cuh
+++ b/cpp/include/raft/linalg/transpose.cuh
@@ -103,7 +103,7 @@ auto transpose(raft::resources const& handle,
 
 /** @} */  // end of group transpose
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh
index 47a432f415..5ebe27923a 100644
--- a/cpp/include/raft/linalg/unary_op.cuh
+++ b/cpp/include/raft/linalg/unary_op.cuh
@@ -125,7 +125,7 @@ void write_only_unary_op(const raft::resources& handle, OutType out, Lambda op)
 
 /** @} */  // end of group unary_op
 
-};         // end namespace linalg
-};         // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh
index 887741ad71..c94b2506d3 100644
--- a/cpp/include/raft/matrix/col_wise_sort.cuh
+++ b/cpp/include/raft/matrix/col_wise_sort.cuh
@@ -134,6 +134,6 @@ void sort_cols_per_row(Args... args)
 
 /** @} */  // end of group col_wise_sort
 
-};         // end namespace raft::matrix
+};  // end namespace raft::matrix
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
index f934d7e3b4..870f0c3240 100644
--- a/cpp/include/raft/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include <cstdint>                                   // uint32_t
-#include <cuda_fp16.h>                               // __half
+#include <cstdint>      // uint32_t
+#include <cuda_fp16.h>  // __half
 #include <raft/core/device_resources.hpp>
 #include <raft/util/raft_explicit.hpp>               // RAFT_EXPLICIT
 #include <rmm/cuda_stream_view.hpp>                  // rmm:cuda_stream_view
diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh
index dc86a04733..2927604e7d 100644
--- a/cpp/include/raft/matrix/detail/select_warpsort.cuh
+++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -959,7 +959,7 @@ void calc_launch_parameter(
       if (batch_size >= size_t(another_min_grid_size)  // still have enough work
           && another_block_size < block_size           // protect against an infinite loop
           && another_min_grid_size * another_block_size >
-               min_grid_size * block_size              // improve occupancy
+               min_grid_size * block_size  // improve occupancy
       ) {
         block_size    = another_block_size;
         min_grid_size = another_min_grid_size;
diff --git a/cpp/include/raft/neighbors/ann_types.hpp b/cpp/include/raft/neighbors/ann_types.hpp
index 469d3c09d4..c17be4a8ff 100644
--- a/cpp/include/raft/neighbors/ann_types.hpp
+++ b/cpp/include/raft/neighbors/ann_types.hpp
@@ -49,4 +49,4 @@ struct search_params {};
 
 /** @} */  // end group ann_types
 
-};         // namespace raft::neighbors::ann
+};  // namespace raft::neighbors::ann
diff --git a/cpp/include/raft/neighbors/brute_force-ext.cuh b/cpp/include/raft/neighbors/brute_force-ext.cuh
index 862db75866..b8c00616da 100644
--- a/cpp/include/raft/neighbors/brute_force-ext.cuh
+++ b/cpp/include/raft/neighbors/brute_force-ext.cuh
@@ -22,7 +22,8 @@
 #include <raft/core/operators.hpp>           // raft::identity_op
 #include <raft/core/resources.hpp>           // raft::resources
 #include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
-#include <raft/util/raft_explicit.hpp>       // RAFT_EXPLICIT
+#include <raft/neighbors/brute_force_types.hpp>
+#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
@@ -38,6 +39,19 @@ inline void knn_merge_parts(
   size_t n_samples,
   std::optional<raft::device_vector_view<idx_t, idx_t>> translations = std::nullopt) RAFT_EXPLICIT;
 
+template <typename T, typename Accessor>
+index<T> build(raft::resources const& res,
+               mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset,
+               raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded,
+               T metric_arg                        = 0.0) RAFT_EXPLICIT;
+
+template <typename T, typename IdxT>
+void search(raft::resources const& res,
+            const index<T>& idx,
+            raft::device_matrix_view<const T, int64_t, row_major> queries,
+            raft::device_matrix_view<IdxT, int64_t, row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, row_major> distances) RAFT_EXPLICIT;
+
 template <typename idx_t,
           typename value_t,
           typename matrix_idx,
@@ -93,6 +107,29 @@ instantiate_raft_neighbors_brute_force_knn(
 
 #undef instantiate_raft_neighbors_brute_force_knn
 
+namespace raft::neighbors::brute_force {
+
+extern template void search<float, int>(
+  raft::resources const& res,
+  const raft::neighbors::brute_force::index<float>& idx,
+  raft::device_matrix_view<const float, int64_t, row_major> queries,
+  raft::device_matrix_view<int, int64_t, row_major> neighbors,
+  raft::device_matrix_view<float, int64_t, row_major> distances);
+
+extern template void search<float, int64_t>(
+  raft::resources const& res,
+  const raft::neighbors::brute_force::index<float>& idx,
+  raft::device_matrix_view<const float, int64_t, row_major> queries,
+  raft::device_matrix_view<int64_t, int64_t, row_major> neighbors,
+  raft::device_matrix_view<float, int64_t, row_major> distances);
+
+extern template raft::neighbors::brute_force::index<float> build<float>(
+  raft::resources const& res,
+  raft::device_matrix_view<const float, int64_t, row_major> dataset,
+  raft::distance::DistanceType metric,
+  float metric_arg);
+}  // namespace raft::neighbors::brute_force
+
 #define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
   value_t, idx_t, idx_layout, query_layout)                             \
   extern template void raft::neighbors::brute_force::fused_l2_knn(      \
diff --git a/cpp/include/raft/neighbors/brute_force-inl.cuh b/cpp/include/raft/neighbors/brute_force-inl.cuh
index bc9e09e5b0..88439a738b 100644
--- a/cpp/include/raft/neighbors/brute_force-inl.cuh
+++ b/cpp/include/raft/neighbors/brute_force-inl.cuh
@@ -19,6 +19,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/neighbors/brute_force_types.hpp>
 #include <raft/neighbors/detail/knn_brute_force.cuh>
 #include <raft/spatial/knn/detail/fused_l2_knn.cuh>
 
@@ -280,6 +281,101 @@ void fused_l2_knn(raft::resources const& handle,
                                          metric);
 }
 
-/** @} */  // end group brute_force_knn
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * @tparam T data element type
+ * @tparam Accessor accessor type of the dataset mdspan
+ * @param[in] res raft resources
+ * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
+ * @param[in] metric distance metric to use. Euclidean (L2) is used by default
+ * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. This
+ *           is ignored if the metric_type is not Minkowski.
+ *
+ * @return the constructed brute force index
+ */
+template <typename T, typename Accessor>
+index<T> build(raft::resources const& res,
+               mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset,
+               raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded,
+               T metric_arg                        = 0.0)
+{
+  // the expanded L2 / cosine metrics benefit from pre-calculating the norms of the index dataset,
+  // which lets us avoid calculating these at query time; other metrics store no norms
+  std::optional<device_vector<T, int64_t>> norms;
+  if (metric == raft::distance::DistanceType::L2Expanded ||
+      metric == raft::distance::DistanceType::L2SqrtExpanded ||
+      metric == raft::distance::DistanceType::CosineExpanded) {
+    norms = make_device_vector<T, int64_t>(res, dataset.extent(0));
+    // cosine needs the l2norm (hence the sqrt_op), whereas l2 distances need the squared norm
+    if (metric == raft::distance::DistanceType::CosineExpanded) {
+      raft::linalg::norm(res,
+                         dataset,
+                         norms->view(),
+                         raft::linalg::NormType::L2Norm,
+                         raft::linalg::Apply::ALONG_ROWS,
+                         raft::sqrt_op{});
+    } else {
+      raft::linalg::norm(res,
+                         dataset,
+                         norms->view(),
+                         raft::linalg::NormType::L2Norm,
+                         raft::linalg::Apply::ALONG_ROWS);
+    }
+  }
+
+  return index<T>(res, dataset, std::move(norms), metric, metric_arg);
+}
 
+/**
+ * @brief Brute Force search using the constructed index.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] res raft resources
+ * @param[in] idx brute force index to search
+ * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()]
+ * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
+ *                       [n_queries, k]
+ * @param[out] distances a device matrix view to the distances to the selected neighbors
+ *                       [n_queries, k]
+ */
+template <typename T, typename IdxT>
+void search(raft::resources const& res,
+            const index<T>& idx,
+            raft::device_matrix_view<const T, int64_t, row_major> queries,
+            raft::device_matrix_view<IdxT, int64_t, row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, row_major> distances)
+{
+  RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1), "Value of k must match for outputs");
+  RAFT_EXPECTS(idx.dataset().extent(1) == queries.extent(1),
+               "Number of columns in queries must match brute force index");
+
+  auto k = neighbors.extent(1);
+  auto d = idx.dataset().extent(1);
+
+  std::vector<T*> dataset    = {const_cast<T*>(idx.dataset().data_handle())};
+  std::vector<int64_t> sizes = {idx.dataset().extent(0)};
+  std::vector<T*> norms;
+  if (idx.has_norms()) { norms.push_back(const_cast<T*>(idx.norms().data_handle())); }
+
+  detail::brute_force_knn_impl<int64_t, IdxT, T>(res,
+                                                 dataset,
+                                                 sizes,
+                                                 d,
+                                                 const_cast<T*>(queries.data_handle()),
+                                                 queries.extent(0),
+                                                 neighbors.data_handle(),
+                                                 distances.data_handle(),
+                                                 k,
+                                                 true,
+                                                 true,
+                                                 nullptr,
+                                                 idx.metric(),
+                                                 idx.metric_arg(),
+                                                 raft::identity_op(),
+                                                 norms.size() ? &norms : nullptr);
+}
+/** @} */  // end group brute_force_knn
 }  // namespace raft::neighbors::brute_force
diff --git a/cpp/include/raft/neighbors/brute_force_types.hpp b/cpp/include/raft/neighbors/brute_force_types.hpp
new file mode 100644
index 0000000000..cc934b7a98
--- /dev/null
+++ b/cpp/include/raft/neighbors/brute_force_types.hpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "ann_types.hpp"
+#include <raft/core/resource/cuda_stream.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/error.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/mdspan_types.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
+
+#include <raft/core/logger.hpp>
+
+namespace raft::neighbors::brute_force {
+/**
+ * @addtogroup brute_force
+ * @{
+ */
+
+/**
+ * @brief Brute Force index.
+ *
+ * The index stores the dataset and norms for the dataset in device memory.
+ *
+ * @tparam T data element type
+ */
+template <typename T>
+struct index : ann::index {
+ public:
+  /** Distance metric used for retrieval */
+  [[nodiscard]] constexpr inline raft::distance::DistanceType metric() const noexcept
+  {
+    return metric_;
+  }
+
+  /** Total length of the index (number of vectors). */
+  [[nodiscard]] constexpr inline int64_t size() const noexcept { return dataset_view_.extent(0); }
+
+  /** Dimensionality of the data. */
+  [[nodiscard]] constexpr inline uint32_t dim() const noexcept { return dataset_view_.extent(1); }
+
+  /** Dataset [size, dim] */
+  [[nodiscard]] inline auto dataset() const noexcept
+    -> device_matrix_view<const T, int64_t, row_major>
+  {
+    return dataset_view_;
+  }
+
+  /** Dataset norms. Throws std::bad_optional_access if no norms are stored; see has_norms(). */
+  [[nodiscard]] inline auto norms() const -> device_vector_view<const T, int64_t, row_major>
+  {
+    return make_const_mdspan(norms_.value().view());
+  }
+
+  /** Whether or not this index has dataset norms */
+  [[nodiscard]] inline bool has_norms() const noexcept { return norms_.has_value(); }
+
+  [[nodiscard]] inline T metric_arg() const noexcept { return metric_arg_; }
+
+  // Don't allow copying the index for performance reasons (try avoiding copying data)
+  index(const index&)                    = delete;
+  index(index&&)                         = default;
+  auto operator=(const index&) -> index& = delete;
+  auto operator=(index&&) -> index&      = default;
+  ~index()                               = default;
+
+  /** Construct a brute force index from dataset
+   *
+   * Constructs a brute force index from a dataset. This lets us precompute norms for
+   * the dataset, providing a speed benefit over doing this at query time.
+   *
+   * If the dataset is already in GPU memory, then this class stores a non-owning reference to
+   * the dataset. If the dataset is in host memory, it will be copied to the device and the
+   * index will own the device memory.
+   */
+  template <typename data_accessor>
+  index(raft::resources const& res,
+        mdspan<const T, matrix_extent<int64_t>, row_major, data_accessor> dataset,
+        std::optional<raft::device_vector<T, int64_t>>&& norms,
+        raft::distance::DistanceType metric,
+        T metric_arg = 0.0)
+    : ann::index(),
+      metric_(metric),
+      dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
+      norms_(std::move(norms)),
+      metric_arg_(metric_arg)
+  {
+    update_dataset(res, dataset);
+    resource::sync_stream(res);  // ensure any copy issued by update_dataset has completed
+  }
+
+ private:
+  /**
+   * Replace the dataset with a non-owning view of a device-resident dataset.
+   */
+  void update_dataset(raft::resources const& res,
+                      raft::device_matrix_view<const T, int64_t, row_major> dataset)
+  {
+    dataset_view_ = dataset;
+  }
+
+  /**
+   * Replace the dataset with a new dataset.
+   *
+   * We create a copy of the dataset on the device. The index manages the lifetime of this copy.
+   */
+  void update_dataset(raft::resources const& res,
+                      raft::host_matrix_view<const T, int64_t, row_major> dataset)
+  {
+    dataset_ = make_device_matrix<T, int64_t>(res, dataset.extent(0), dataset.extent(1));
+    raft::copy(dataset_.data_handle(),
+               dataset.data_handle(),
+               dataset.size(),
+               resource::get_cuda_stream(res));
+    dataset_view_ = make_const_mdspan(dataset_.view());
+  }
+
+  raft::distance::DistanceType metric_;
+  raft::device_matrix<T, int64_t, row_major> dataset_;
+  std::optional<raft::device_vector<T, int64_t>> norms_;
+  raft::device_matrix_view<const T, int64_t, row_major> dataset_view_;
+  T metric_arg_;
+};
+
+/** @} */
+
+}  // namespace raft::neighbors::brute_force
diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh
index 6bb7beca55..f96dd34e05 100644
--- a/cpp/include/raft/neighbors/cagra.cuh
+++ b/cpp/include/raft/neighbors/cagra.cuh
@@ -35,12 +35,11 @@ namespace raft::neighbors::cagra {
  */
 
 /**
- * @brief Build a kNN graph.
+ * @brief Build a kNN graph using IVF-PQ.
  *
  * The kNN graph is the first building block for CAGRA index.
- * This function uses the IVF-PQ method to build a kNN graph.
  *
- * The output is a dense matrix that stores the neighbor indices for each pont in the dataset.
+ * The output is a dense matrix that stores the neighbor indices for each point in the dataset.
  * Each point has the same number of neighbors.
  *
  * See [cagra::build](#cagra::build) for an alternative method.
@@ -52,16 +51,16 @@ namespace raft::neighbors::cagra {
  * @code{.cpp}
  *   using namespace raft::neighbors;
  *   // use default index parameters
- *   cagra::index_params build_params;
- *   cagra::search_params search_params
+ *   ivf_pq::index_params build_params;
+ *   ivf_pq::search_params search_params;
  *   auto knn_graph      = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
  *   // create knn graph
  *   cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params);
- *   auto optimized_gaph      = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 64);
+ *   auto optimized_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 64);
  *   cagra::optimize(res, dataset, knn_graph.view(), optimized_graph.view());
  *   // Construct an index from dataset and optimized knn_graph
  *   auto index = cagra::index<T, IdxT>(res, build_params.metric(), dataset,
- * optimized_graph.view());
+ *                                      optimized_graph.view());
  * @endcode
  *
  * @tparam DataT data element type
@@ -70,7 +69,7 @@ namespace raft::neighbors::cagra {
  * @param[in] res raft resources
  * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
  * @param[out] knn_graph a host matrix view to store the output knn graph [n_rows, graph_degree]
- * @param[in] refine_rate refinement rate for ivf-pq search
+ * @param[in] refine_rate (optional) refinement rate for ivf-pq search
  * @param[in] build_params (optional) ivf_pq index building parameters for knn graph
  * @param[in] search_params (optional) ivf_pq search parameters
  */
@@ -95,6 +94,58 @@ void build_knn_graph(raft::resources const& res,
     res, dataset_internal, knn_graph_internal, refine_rate, build_params, search_params);
 }
 
+/**
+ * @brief Build a kNN graph using NN-descent.
+ *
+ * The kNN graph is the first building block for CAGRA index.
+ *
+ * The output is a dense matrix that stores the neighbor indices for each point in the dataset.
+ * Each point has the same number of neighbors.
+ *
+ * See [cagra::build](#cagra::build) for an alternative method.
+ *
+ * The following distance metrics are supported:
+ * - L2Expanded
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   using namespace raft::neighbors::experimental;
+ *   // use default index parameters
+ *   nn_descent::index_params build_params;
+ *   build_params.graph_degree = 128;
+ *   auto knn_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), 128);
+ *   // create knn graph
+ *   cagra::build_knn_graph(res, dataset, knn_graph.view(), build_params);
+ *   auto optimized_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), 64);
+ *   cagra::optimize(res, knn_graph.view(), optimized_graph.view());
+ *   // Construct an index from dataset and optimized knn_graph
+ *   auto index = cagra::index<T, IdxT>(res, build_params.metric(), dataset,
+ *                                      optimized_graph.view());
+ * @endcode
+ *
+ * @tparam DataT data element type
+ * @tparam IdxT type of the dataset vector indices
+ * @tparam accessor host or device accessor_type for the dataset
+ * @param[in] res raft::resources is an object managing resources
+ * @param[in] dataset input raft::host/device_matrix_view that can be located
+ *                in host or device memory
+ * @param[out] knn_graph a host matrix view to store the output knn graph [n_rows, graph_degree]
+ * @param[in] build_params an instance of experimental::nn_descent::index_params that are parameters
+ *                     to run the nn-descent algorithm
+ */
+template <typename DataT,
+          typename IdxT = uint32_t,
+          typename accessor =
+            host_device_accessor<std::experimental::default_accessor<DataT>, memory_type::device>>
+void build_knn_graph(raft::resources const& res,
+                     mdspan<const DataT, matrix_extent<int64_t>, row_major, accessor> dataset,
+                     raft::host_matrix_view<IdxT, int64_t, row_major> knn_graph,
+                     experimental::nn_descent::index_params build_params)
+{
+  detail::build_knn_graph<DataT, IdxT>(res, dataset, knn_graph, build_params);
+}
+
 /**
  * @brief Sort a KNN graph index.
  * Preprocessing step for `cagra::optimize`: If a KNN graph is not built using
@@ -106,7 +157,7 @@ void build_knn_graph(raft::resources const& res,
  * @code{.cpp}
  *   using namespace raft::neighbors;
  *   cagra::index_params build_params;
- *   auto knn_graph      = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
+ *   auto knn_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
  *   // build KNN graph not using `cagra::build_knn_graph`
  *   // build(knn_graph, dataset, ...);
  *   // sort graph index
@@ -115,7 +166,7 @@ void build_knn_graph(raft::resources const& res,
  *   cagra::optimize(res, dataset, knn_graph.view(), optimized_graph.view());
  *   // Construct an index from dataset and optimized knn_graph
  *   auto index = cagra::index<T, IdxT>(res, build_params.metric(), dataset,
- * optimized_graph.view());
+ *                                      optimized_graph.view());
  * @endcode
  *
  * @tparam DataT type of the data in the source dataset
@@ -256,13 +307,26 @@ index<T, IdxT> build(raft::resources const& res,
     graph_degree = intermediate_degree;
   }
 
-  auto knn_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree);
+  std::optional<raft::host_matrix<IdxT, int64_t>> knn_graph(
+    raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree));
 
-  build_knn_graph(res, dataset, knn_graph.view());
+  if (params.build_algo == graph_build_algo::IVF_PQ) {
+    build_knn_graph(res, dataset, knn_graph->view());
+
+  } else {
+    // Use nn-descent to build CAGRA knn graph
+    auto nn_descent_params                      = experimental::nn_descent::index_params();
+    nn_descent_params.graph_degree              = intermediate_degree;
+    nn_descent_params.intermediate_graph_degree = 1.5 * intermediate_degree;
+    build_knn_graph<T, IdxT>(res, dataset, knn_graph->view(), nn_descent_params);
+  }
 
   auto cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), graph_degree);
 
-  optimize<IdxT>(res, knn_graph.view(), cagra_graph.view());
+  optimize<IdxT>(res, knn_graph->view(), cagra_graph.view());
+
+  // free intermediate graph before trying to create the index
+  knn_graph.reset();
 
   // Construct an index from dataset and optimized knn graph.
   return index<T, IdxT>(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view()));
@@ -312,9 +376,70 @@ void search(raft::resources const& res,
   auto distances_internal = raft::make_device_matrix_view<float, int64_t, row_major>(
     distances.data_handle(), distances.extent(0), distances.extent(1));
 
-  cagra::detail::search_main<T, internal_IdxT, IdxT>(
-    res, params, idx, queries_internal, neighbors_internal, distances_internal);
+  cagra::detail::search_main<T,
+                             internal_IdxT,
+                             decltype(raft::neighbors::filtering::none_cagra_sample_filter()),
+                             IdxT>(res,
+                                   params,
+                                   idx,
+                                   queries_internal,
+                                   neighbors_internal,
+                                   distances_internal,
+                                   raft::neighbors::filtering::none_cagra_sample_filter());
+}
+
+/**
+ * @brief Search ANN using the constructed index with the given sample filter.
+ *
+ * See the [cagra::build](#cagra::build) documentation for a usage example.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ * @tparam CagraSampleFilterT Device filter function, with the signature
+ *         `(uint32_t query_ix, uint32_t sample_ix) -> bool`
+ *
+ * @param[in] res raft resources
+ * @param[in] params configure the search
+ * @param[in] idx cagra index
+ * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()]
+ * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
+ *                       [n_queries, k]
+ * @param[out] distances a device matrix view to the distances to the selected neighbors
+ *                       [n_queries, k]
+ * @param[in] sample_filter a device filter function that greenlights samples for a given query
+ */
+template <typename T, typename IdxT, typename CagraSampleFilterT>
+void search_with_filtering(raft::resources const& res,
+                           const search_params& params,
+                           const index<T, IdxT>& idx,
+                           raft::device_matrix_view<const T, int64_t, row_major> queries,
+                           raft::device_matrix_view<IdxT, int64_t, row_major> neighbors,
+                           raft::device_matrix_view<float, int64_t, row_major> distances,
+                           CagraSampleFilterT sample_filter = CagraSampleFilterT())
+{
+  RAFT_EXPECTS(
+    queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
+    "Number of rows in output neighbors and distances matrices must equal the number of queries.");
+
+  RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1),
+               "Number of columns in output neighbors and distances matrices must equal k");
+  RAFT_EXPECTS(queries.extent(1) == idx.dim(),
+               "Number of query dimensions should equal number of dimensions in the index.");
+
+  using internal_IdxT   = typename std::make_unsigned<IdxT>::type;
+  auto queries_internal = raft::make_device_matrix_view<const T, int64_t, row_major>(
+    queries.data_handle(), queries.extent(0), queries.extent(1));
+  auto neighbors_internal = raft::make_device_matrix_view<internal_IdxT, int64_t, row_major>(
+    reinterpret_cast<internal_IdxT*>(neighbors.data_handle()),
+    neighbors.extent(0),
+    neighbors.extent(1));
+  auto distances_internal = raft::make_device_matrix_view<float, int64_t, row_major>(
+    distances.data_handle(), distances.extent(0), distances.extent(1));
+
+  cagra::detail::search_main<T, internal_IdxT, CagraSampleFilterT, IdxT>(
+    res, params, idx, queries_internal, neighbors_internal, distances_internal, sample_filter);
 }
+
 /** @} */  // end group cagra
 
 }  // namespace raft::neighbors::cagra
diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp
index 02e3f5338e..5061d6082d 100644
--- a/cpp/include/raft/neighbors/cagra_types.hpp
+++ b/cpp/include/raft/neighbors/cagra_types.hpp
@@ -40,11 +40,24 @@ namespace raft::neighbors::cagra {
  * @{
  */
 
+/**
+ * @brief ANN algorithm used by CAGRA to build knn graph
+ * Selected through `index_params::build_algo`.
+ */
+enum class graph_build_algo {
+  /** Use IVF-PQ to build all-neighbors knn graph */
+  IVF_PQ,
+  /** Experimental, use NN-Descent to build all-neighbors knn graph */
+  NN_DESCENT
+};
+
 struct index_params : ann::index_params {
   /** Degree of input graph for pruning. */
   size_t intermediate_graph_degree = 128;
   /** Degree of output graph. */
   size_t graph_degree = 64;
+  /** ANN algorithm to build knn graph. */
+  graph_build_algo build_algo = graph_build_algo::IVF_PQ;
 };
 
 enum class search_algo {
@@ -165,9 +178,10 @@ struct index : ann::index {
   ~index()                               = default;
 
   /** Construct an empty index. */
-  index(raft::resources const& res)
+  index(raft::resources const& res,
+        raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded)
     : ann::index(),
-      metric_(raft::distance::DistanceType::L2Expanded),
+      metric_(metric),
       dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
       graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0))
   {
@@ -296,7 +310,11 @@ struct index : ann::index {
                     raft::host_matrix_view<const IdxT, int64_t, row_major> knn_graph)
   {
     RAFT_LOG_DEBUG("Copying CAGRA knn graph from host to device");
-    graph_ = make_device_matrix<IdxT, int64_t>(res, knn_graph.extent(0), knn_graph.extent(1));
+    if ((graph_.extent(0) != knn_graph.extent(0)) || (graph_.extent(1) != knn_graph.extent(1))) {
+      // clear existing memory before allocating to prevent OOM errors on large graphs
+      if (graph_.size()) { graph_ = make_device_matrix<IdxT, int64_t>(res, 0, 0); }
+      graph_ = make_device_matrix<IdxT, int64_t>(res, knn_graph.extent(0), knn_graph.extent(1));
+    }
     raft::copy(graph_.data_handle(),
                knn_graph.data_handle(),
                knn_graph.size(),
@@ -311,7 +329,13 @@ struct index : ann::index {
                    mdspan<const T, matrix_extent<int64_t>, row_major, data_accessor> dataset)
   {
     size_t padded_dim = round_up_safe<size_t>(dataset.extent(1) * sizeof(T), 16) / sizeof(T);
-    dataset_          = make_device_matrix<T, int64_t>(res, dataset.extent(0), padded_dim);
+
+    if ((dataset_.extent(0) != dataset.extent(0)) ||
+        (static_cast<size_t>(dataset_.extent(1)) != padded_dim)) {
+      // clear existing memory before allocating to prevent OOM errors on large datasets
+      if (dataset_.size()) { dataset_ = make_device_matrix<T, int64_t>(res, 0, 0); }
+      dataset_ = make_device_matrix<T, int64_t>(res, dataset.extent(0), padded_dim);
+    }
     if (dataset_.extent(1) == dataset.extent(1)) {
       raft::copy(dataset_.data_handle(),
                  dataset.data_handle(),
@@ -351,6 +375,7 @@ struct index : ann::index {
 
 // TODO: Remove deprecated experimental namespace in 23.12 release
 namespace raft::neighbors::experimental::cagra {
+using raft::neighbors::cagra::graph_build_algo;
 using raft::neighbors::cagra::hash_mode;
 using raft::neighbors::cagra::index;
 using raft::neighbors::cagra::index_params;
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
index d19d7e7904..40024a3deb 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
@@ -28,12 +28,14 @@
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/logger.hpp>
+#include <raft/core/resource/detail/device_memory_resource.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
 #include <raft/neighbors/detail/refine.cuh>
 #include <raft/neighbors/ivf_pq.cuh>
 #include <raft/neighbors/ivf_pq_types.hpp>
+#include <raft/neighbors/nn_descent.cuh>
 #include <raft/neighbors/refine.cuh>
 
 namespace raft::neighbors::cagra::detail {
@@ -46,6 +48,7 @@ void build_knn_graph(raft::resources const& res,
                      std::optional<ivf_pq::index_params> build_params   = std::nullopt,
                      std::optional<ivf_pq::search_params> search_params = std::nullopt)
 {
+  resource::detail::warn_non_pool_workspace(res, "raft::neighbors::cagra::build");
   RAFT_EXPECTS(!build_params || build_params->metric == distance::DistanceType::L2Expanded,
                "Currently only L2Expanded metric is supported");
 
@@ -238,4 +241,27 @@ void build_knn_graph(raft::resources const& res,
   if (!first) RAFT_LOG_DEBUG("# Finished building kNN graph");
 }
 
+template <typename DataT, typename IdxT, typename accessor>  // overload: build via NN-descent
+void build_knn_graph(raft::resources const& res,
+                     mdspan<const DataT, matrix_extent<int64_t>, row_major, accessor> dataset,
+                     raft::host_matrix_view<IdxT, int64_t, row_major> knn_graph,
+                     experimental::nn_descent::index_params build_params)
+{
+  auto nn_descent_idx = experimental::nn_descent::index<IdxT>(res, knn_graph);
+  experimental::nn_descent::build<DataT, IdxT>(res, build_params, dataset, nn_descent_idx);
+  // Alias the built graph's storage as the unsigned counterpart of IdxT (no data is copied).
+  using internal_IdxT = typename std::make_unsigned<IdxT>::type;
+  using g_accessor    = typename decltype(nn_descent_idx.graph())::accessor_type;
+  using g_accessor_internal =
+    host_device_accessor<std::experimental::default_accessor<internal_IdxT>, g_accessor::mem_type>;
+  // Non-owning view with the graph's extents and memory type, re-typed to internal_IdxT.
+  auto knn_graph_internal =
+    mdspan<internal_IdxT, matrix_extent<int64_t>, row_major, g_accessor_internal>(
+      reinterpret_cast<internal_IdxT*>(nn_descent_idx.graph().data_handle()),
+      nn_descent_idx.graph().extent(0),
+      nn_descent_idx.graph().extent(1));
+  // NOTE(review): presumably sorts each row's neighbors by distance to the row - TODO confirm.
+  graph::sort_knn_graph(res, dataset, knn_graph_internal);
+}
+
 }  // namespace raft::neighbors::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
index 8190817b5b..81e714dc4e 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
@@ -18,10 +18,13 @@
 
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/neighbors/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/core/resource/detail/device_memory_resource.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/neighbors/cagra_types.hpp>
 #include <rmm/cuda_stream_view.hpp>
@@ -32,6 +35,48 @@
 
 namespace raft::neighbors::cagra::detail {
 
+template <class CagraSampleFilterT>  // filter must be callable as filter(query_id, sample_id)
+struct CagraSampleFilterWithQueryIdOffset {
+  const uint32_t offset;  // added to every query_id before it is forwarded to `filter`
+  CagraSampleFilterT filter;  // wrapped filter, held by value
+  // Stores copies of `offset` and `filter`.
+  CagraSampleFilterWithQueryIdOffset(const uint32_t offset, const CagraSampleFilterT filter)
+    : offset(offset), filter(filter)
+  {
+  }
+  // Calls the wrapped filter with the query id shifted by `offset` and returns its result.
+  _RAFT_DEVICE auto operator()(const uint32_t query_id, const uint32_t sample_id)
+  {
+    return filter(query_id + offset, sample_id);
+  }
+};
+
+template <class CagraSampleFilterT>  // primary template: select the query-id-offset wrapper
+struct CagraSampleFilterT_Selector {
+  using type = CagraSampleFilterWithQueryIdOffset<CagraSampleFilterT>;
+};
+template <>  // specialization: none_cagra_sample_filter maps to itself, without the wrapper
+struct CagraSampleFilterT_Selector<raft::neighbors::filtering::none_cagra_sample_filter> {
+  using type = raft::neighbors::filtering::none_cagra_sample_filter;
+};
+
+// Helper: returns the selector's type for `filter`, constructed from (`offset`, `filter`).
+template <class CagraSampleFilterT>
+inline typename CagraSampleFilterT_Selector<CagraSampleFilterT>::type set_offset(
+  CagraSampleFilterT filter, const uint32_t offset)
+{
+  typename CagraSampleFilterT_Selector<CagraSampleFilterT>::type new_filter(offset, filter);
+  return new_filter;
+}
+template <>  // none_cagra_sample_filter: returned unchanged, the offset argument is ignored
+inline
+  typename CagraSampleFilterT_Selector<raft::neighbors::filtering::none_cagra_sample_filter>::type
+  set_offset<raft::neighbors::filtering::none_cagra_sample_filter>(
+    raft::neighbors::filtering::none_cagra_sample_filter filter, const uint32_t)
+{
+  return filter;
+}
+
 /**
  * @brief Search ANN using the constructed index.
  *
@@ -52,27 +97,37 @@ namespace raft::neighbors::cagra::detail {
  * k]
  */
 
-template <typename T, typename internal_IdxT, typename IdxT = uint32_t, typename DistanceT = float>
+template <typename T,
+          typename internal_IdxT,
+          typename CagraSampleFilterT,
+          typename IdxT      = uint32_t,
+          typename DistanceT = float>
 void search_main(raft::resources const& res,
                  search_params params,
                  const index<T, IdxT>& index,
                  raft::device_matrix_view<const T, int64_t, row_major> queries,
                  raft::device_matrix_view<internal_IdxT, int64_t, row_major> neighbors,
-                 raft::device_matrix_view<DistanceT, int64_t, row_major> distances)
+                 raft::device_matrix_view<DistanceT, int64_t, row_major> distances,
+                 CagraSampleFilterT sample_filter = CagraSampleFilterT())
 {
+  resource::detail::warn_non_pool_workspace(res, "raft::neighbors::cagra::search");
   RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n",
                  static_cast<size_t>(index.dataset().extent(0)),
                  static_cast<size_t>(index.dataset().extent(1)));
   RAFT_LOG_DEBUG("# query size = %lu, dim = %lu\n",
                  static_cast<size_t>(queries.extent(0)),
                  static_cast<size_t>(queries.extent(1)));
-  RAFT_EXPECTS(queries.extent(1) == index.dim(), "Querise and index dim must match");
+  RAFT_EXPECTS(queries.extent(1) == index.dim(), "Queries and index dim must match");
   const uint32_t topk = neighbors.extent(1);
 
   if (params.max_queries == 0) { params.max_queries = queries.extent(0); }
 
-  std::unique_ptr<search_plan_impl<T, internal_IdxT, DistanceT>> plan =
-    factory<T, internal_IdxT, DistanceT>::create(
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "cagra::search(max_queries = %u, k = %u, dim = %zu)", params.max_queries, topk, index.dim());
+
+  using CagraSampleFilterT_s = typename CagraSampleFilterT_Selector<CagraSampleFilterT>::type;
+  std::unique_ptr<search_plan_impl<T, internal_IdxT, DistanceT, CagraSampleFilterT_s>> plan =
+    factory<T, internal_IdxT, DistanceT, CagraSampleFilterT_s>::create(
       res, params, index.dim(), index.graph_degree(), topk);
 
   plan->check(neighbors.extent(1));
@@ -113,7 +168,8 @@ void search_main(raft::resources const& res,
             n_queries,
             _seed_ptr,
             _num_executed_iterations,
-            topk);
+            topk,
+            set_offset(sample_filter, qid));
   }
 
   static_assert(std::is_same_v<DistanceT, float>,
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
index 2c9cbd2563..8261f637e1 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <raft/core/mdarray.hpp>
+#include <raft/core/nvtx.hpp>
 #include <raft/core/serialize.hpp>
 #include <raft/neighbors/cagra_types.hpp>
 
@@ -54,6 +55,8 @@ void serialize(raft::resources const& res,
                const index<T, IdxT>& index_,
                bool include_dataset)
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope("cagra::serialize");
+
   RAFT_LOG_DEBUG(
     "Saving CAGRA index, size %zu, dim %u", static_cast<size_t>(index_.size()), index_.dim());
 
@@ -113,6 +116,8 @@ void serialize(raft::resources const& res,
 template <typename T, typename IdxT>
 auto deserialize(raft::resources const& res, std::istream& is) -> index<T, IdxT>
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope("cagra::deserialize");
+
   char dtype_string[4];
   is.read(dtype_string, 4);
 
@@ -125,15 +130,22 @@ auto deserialize(raft::resources const& res, std::istream& is) -> index<T, IdxT>
   auto graph_degree = deserialize_scalar<std::uint32_t>(res, is);
   auto metric       = deserialize_scalar<raft::distance::DistanceType>(res, is);
 
-  auto dataset = raft::make_host_matrix<T, int64_t>(n_rows, dim);
-  auto graph   = raft::make_host_matrix<IdxT, int64_t>(n_rows, graph_degree);
+  auto graph = raft::make_host_matrix<IdxT, int64_t>(n_rows, graph_degree);
   deserialize_mdspan(res, is, graph.view());
 
   bool has_dataset = deserialize_scalar<bool>(res, is);
-  if (has_dataset) { deserialize_mdspan(res, is, dataset.view()); }
-
-  return index<T, IdxT>(
-    res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view()));
+  if (has_dataset) {
+    auto dataset = raft::make_host_matrix<T, int64_t>(n_rows, dim);
+    deserialize_mdspan(res, is, dataset.view());
+    return index<T, IdxT>(
+      res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view()));
+  } else {
+    // create a new index with no dataset - the user must supply one themselves later via
+    // update_dataset (this avoids allocating GPU memory in the meantime)
+    index<T, IdxT> idx(res, metric);
+    idx.update_graph(res, raft::make_const_mdspan(graph.view()));
+    return idx;
+  }
 }
 
 template <typename T, typename IdxT>
diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
index 2758148942..55b7b47508 100644
--- a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
@@ -53,7 +53,7 @@ _RAFT_DEVICE void compute_distance_to_random_nodes(
   INDEX_T* const result_indices_ptr,       // [num_pickup]
   DISTANCE_T* const result_distances_ptr,  // [num_pickup]
   const float* const query_buffer,
-  const DATA_T* const dataset_ptr,         // [dataset_size, dataset_dim]
+  const DATA_T* const dataset_ptr,  // [dataset_size, dataset_dim]
   const std::size_t dataset_dim,
   const std::size_t dataset_size,
   const std::size_t dataset_ld,
@@ -133,7 +133,6 @@ _RAFT_DEVICE void compute_distance_to_random_nodes(
 }
 
 template <unsigned TEAM_SIZE,
-          unsigned BLOCK_SIZE,
           unsigned MAX_DATASET_DIM,
           unsigned MAX_N_FRAGS,
           class LOAD_T,
@@ -155,17 +154,20 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(INDEX_T* const result_child_in
                                                   INDEX_T* const visited_hashmap_ptr,
                                                   const std::uint32_t hash_bitlen,
                                                   const INDEX_T* const parent_indices,
+                                                  const INDEX_T* const internal_topk_list,
                                                   const std::uint32_t search_width)
 {
-  const INDEX_T invalid_index = utils::get_max_value<INDEX_T>();
+  constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+  const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
 
   // Read child indices of parents from knn graph and check if the distance
   // computaiton is necessary.
-  for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += BLOCK_SIZE) {
-    const INDEX_T parent_id = parent_indices[i / knn_k];
-    INDEX_T child_id        = invalid_index;
-    if (parent_id != invalid_index) {
-      child_id = knn_graph[(i % knn_k) + ((uint64_t)knn_k * parent_id)];
+  for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += blockDim.x) {
+    const INDEX_T smem_parent_id = parent_indices[i / knn_k];
+    INDEX_T child_id             = invalid_index;
+    if (smem_parent_id != invalid_index) {
+      const auto parent_id = internal_topk_list[smem_parent_id] & ~index_msb_1_mask;
+      child_id             = knn_graph[(i % knn_k) + ((uint64_t)knn_k * parent_id)];
     }
     if (child_id != invalid_index) {
       if (hashmap::insert(visited_hashmap_ptr, hash_bitlen, child_id) == 0) {
@@ -205,7 +207,8 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(INDEX_T* const result_child_in
   // Compute the distance to child nodes
   std::uint32_t max_i = knn_k * search_width;
   if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); }
-  for (std::uint32_t i = threadIdx.x / TEAM_SIZE; i < max_i; i += BLOCK_SIZE / TEAM_SIZE) {
+  for (std::uint32_t tid = threadIdx.x; tid < max_i * TEAM_SIZE; tid += blockDim.x) {
+    const auto i       = tid / TEAM_SIZE;
     const bool valid_i = (i < (knn_k * search_width));
     INDEX_T child_id   = invalid_index;
     if (valid_i) { child_id = result_child_indices_ptr[i]; }
diff --git a/cpp/include/raft/neighbors/detail/cagra/factory.cuh b/cpp/include/raft/neighbors/detail/cagra/factory.cuh
index 625040194b..78111a9310 100644
--- a/cpp/include/raft/neighbors/detail/cagra/factory.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/factory.cuh
@@ -20,20 +20,25 @@
 #include "search_multi_kernel.cuh"
 #include "search_plan.cuh"
 #include "search_single_cta.cuh"
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail {
 
-template <typename T, typename IdxT = uint32_t, typename DistanceT = float>
+template <typename T,
+          typename IdxT               = uint32_t,
+          typename DistanceT          = float,
+          typename CagraSampleFilterT = raft::neighbors::filtering::none_cagra_sample_filter>
 class factory {
  public:
   /**
    * Create a search structure for dataset with dim features.
    */
-  static std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>> create(raft::resources const& res,
-                                                                      search_params const& params,
-                                                                      int64_t dim,
-                                                                      int64_t graph_degree,
-                                                                      uint32_t topk)
+  static std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>> create(
+    raft::resources const& res,
+    search_params const& params,
+    int64_t dim,
+    int64_t graph_degree,
+    uint32_t topk)
   {
     search_plan_impl_base plan(params, dim, graph_degree, topk);
     switch (plan.max_dim) {
@@ -63,26 +68,29 @@ class factory {
         break;
       default: RAFT_LOG_DEBUG("Incorrect max_dim (%lu)\n", plan.max_dim);
     }
-    return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>();
+    return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>>();
   }
 
  private:
   template <unsigned MAX_DATASET_DIM, unsigned TEAM_SIZE>
-  static std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>> dispatch_kernel(
+  static std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>> dispatch_kernel(
     raft::resources const& res, search_plan_impl_base& plan)
   {
     if (plan.algo == search_algo::SINGLE_CTA) {
-      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>(
-        new single_cta_search::search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT>(
-          res, plan, plan.dim, plan.graph_degree, plan.topk));
+      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>>(
+        new single_cta_search::
+          search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT, CagraSampleFilterT>(
+            res, plan, plan.dim, plan.graph_degree, plan.topk));
     } else if (plan.algo == search_algo::MULTI_CTA) {
-      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>(
-        new multi_cta_search::search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT>(
-          res, plan, plan.dim, plan.graph_degree, plan.topk));
+      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>>(
+        new multi_cta_search::
+          search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT, CagraSampleFilterT>(
+            res, plan, plan.dim, plan.graph_degree, plan.topk));
     } else {
-      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>(
-        new multi_kernel_search::search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT>(
-          res, plan, plan.dim, plan.graph_degree, plan.topk));
+      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>>(
+        new multi_kernel_search::
+          search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT, CagraSampleFilterT>(
+            res, plan, plan.dim, plan.graph_degree, plan.topk));
     }
   }
 };
diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
index 0558d7ea39..8845e37973 100644
--- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -244,7 +244,7 @@ void sort_knn_graph(raft::resources const& res,
   const uint32_t input_graph_degree = knn_graph.extent(1);
   IdxT* const input_graph_ptr       = knn_graph.data_handle();
 
-  auto d_input_graph = raft::make_device_matrix<IdxT, IdxT>(res, graph_size, input_graph_degree);
+  auto d_input_graph = raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);
 
   //
   // Sorting kNN graph
@@ -334,18 +334,13 @@ void optimize(raft::resources const& res,
   auto output_graph_ptr              = new_graph.data_handle();
   const IdxT graph_size              = new_graph.extent(0);
 
-  auto pruned_graph = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);
-
   {
     //
     // Prune kNN graph
     //
-    auto d_input_graph =
-      raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);
-
-    auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, input_graph_degree);
     auto d_detour_count =
       raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, input_graph_degree);
+
     RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
                                   0xff,
                                   graph_size * input_graph_degree * sizeof(uint8_t),
@@ -376,24 +371,13 @@ void optimize(raft::resources const& res,
     const double time_prune_start = cur_time();
     RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
 
-    raft::copy(d_input_graph.data_handle(),
-               input_graph_ptr,
-               graph_size * input_graph_degree,
-               resource::get_cuda_stream(res));
-    void (*kernel_prune)(const IdxT* const,
-                         const uint32_t,
-                         const uint32_t,
-                         const uint32_t,
-                         const uint32_t,
-                         const uint32_t,
-                         uint8_t* const,
-                         uint32_t* const,
-                         uint64_t* const);
+    // Copy input_graph_ptr over to device if necessary
+    device_matrix_view_from_host d_input_graph(
+      res,
+      raft::make_host_matrix_view<IdxT, int64_t>(input_graph_ptr, graph_size, input_graph_degree));
 
     constexpr int MAX_DEGREE = 1024;
-    if (input_graph_degree <= MAX_DEGREE) {
-      kernel_prune = kern_prune<MAX_DEGREE, IdxT>;
-    } else {
+    if (input_graph_degree > MAX_DEGREE) {
       RAFT_FAIL(
         "The degree of input knn graph is too large (%u). "
         "It must be equal to or smaller than %d.",
@@ -410,16 +394,17 @@ void optimize(raft::resources const& res,
       dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, resource::get_cuda_stream(res)));
 
     for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
-      kernel_prune<<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
-        d_input_graph.data_handle(),
-        graph_size,
-        input_graph_degree,
-        output_graph_degree,
-        batch_size,
-        i_batch,
-        d_detour_count.data_handle(),
-        d_num_no_detour_edges.data_handle(),
-        dev_stats.data_handle());
+      kern_prune<MAX_DEGREE, IdxT>
+        <<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
+          d_input_graph.data_handle(),
+          graph_size,
+          input_graph_degree,
+          output_graph_degree,
+          batch_size,
+          i_batch,
+          d_detour_count.data_handle(),
+          d_num_no_detour_edges.data_handle(),
+          dev_stats.data_handle());
       resource::sync_stream(res);
       RAFT_LOG_DEBUG(
         "# Pruning kNN Graph on GPUs (%.1lf %%)\r",
@@ -428,10 +413,7 @@ void optimize(raft::resources const& res,
     resource::sync_stream(res);
     RAFT_LOG_DEBUG("\n");
 
-    raft::copy(detour_count.data_handle(),
-               d_detour_count.data_handle(),
-               graph_size * input_graph_degree,
-               resource::get_cuda_stream(res));
+    host_matrix_view_from_device<uint8_t, int64_t> detour_count(res, d_detour_count.view());
 
     raft::copy(
       host_stats.data_handle(), dev_stats.data_handle(), 2, resource::get_cuda_stream(res));
@@ -447,7 +429,7 @@ void optimize(raft::resources const& res,
         if (max_detour < num_detour) { max_detour = num_detour; /* stats */ }
         for (uint64_t k = 0; k < input_graph_degree; k++) {
           if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; }
-          pruned_graph.data_handle()[pk + (output_graph_degree * i)] =
+          output_graph_ptr[pk + (output_graph_degree * i)] =
             input_graph_ptr[k + (input_graph_degree * i)];
           pk += 1;
           if (pk >= output_graph_degree) break;
@@ -478,8 +460,7 @@ void optimize(raft::resources const& res,
     //
     const double time_make_start = cur_time();
 
-    auto d_rev_graph =
-      raft::make_device_matrix<IdxT, int64_t>(res, graph_size, output_graph_degree);
+    device_matrix_view_from_host<IdxT, int64_t> d_rev_graph(res, rev_graph.view());
     RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(),
                                   0xff,
                                   graph_size * output_graph_degree * sizeof(IdxT),
@@ -497,7 +478,7 @@ void optimize(raft::resources const& res,
     for (uint64_t k = 0; k < output_graph_degree; k++) {
 #pragma omp parallel for
       for (uint64_t i = 0; i < graph_size; i++) {
-        dest_nodes.data_handle()[i] = pruned_graph.data_handle()[k + (output_graph_degree * i)];
+        dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)];
       }
       resource::sync_stream(res);
 
@@ -520,10 +501,12 @@ void optimize(raft::resources const& res,
     resource::sync_stream(res);
     RAFT_LOG_DEBUG("\n");
 
-    raft::copy(rev_graph.data_handle(),
-               d_rev_graph.data_handle(),
-               graph_size * output_graph_degree,
-               resource::get_cuda_stream(res));
+    if (d_rev_graph.allocated_memory()) {
+      raft::copy(rev_graph.data_handle(),
+                 d_rev_graph.data_handle(),
+                 graph_size * output_graph_degree,
+                 resource::get_cuda_stream(res));
+    }
     raft::copy(rev_graph_count.data_handle(),
                d_rev_graph_count.data_handle(),
                graph_size,
@@ -542,10 +525,6 @@ void optimize(raft::resources const& res,
     const uint64_t num_protected_edges = output_graph_degree / 2;
     RAFT_LOG_DEBUG("# num_protected_edges: %lu", num_protected_edges);
 
-    memcpy(output_graph_ptr,
-           pruned_graph.data_handle(),
-           sizeof(IdxT) * graph_size * output_graph_degree);
-
     constexpr int _omp_chunk = 1024;
 #pragma omp parallel for schedule(dynamic, _omp_chunk)
     for (uint64_t j = 0; j < graph_size; j++) {
@@ -578,7 +557,7 @@ void optimize(raft::resources const& res,
 #pragma omp parallel for reduction(+ : num_replaced_edges)
     for (uint64_t i = 0; i < graph_size; i++) {
       for (uint64_t k = 0; k < output_graph_degree; k++) {
-        const uint64_t j = pruned_graph.data_handle()[k + (output_graph_degree * i)];
+        const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)];
         const uint64_t pos =
           pos_in_array<IdxT>(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree);
         if (pos == output_graph_degree) { num_replaced_edges += 1; }
diff --git a/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp b/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp
index 346bbeaa9e..ed4763e475 100644
--- a/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp
+++ b/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp
@@ -28,8 +28,8 @@ namespace hashmap {
 
 _RAFT_HOST_DEVICE inline uint32_t get_size(const uint32_t bitlen) { return 1U << bitlen; }
 
-template <unsigned FIRST_TID = 0, class IdxT = void>
-_RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen)
+template <class IdxT>
+_RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen, unsigned FIRST_TID = 0)
 {
   if (threadIdx.x < FIRST_TID) return;
   for (unsigned i = threadIdx.x - FIRST_TID; i < get_size(bitlen); i += blockDim.x - FIRST_TID) {
@@ -37,15 +37,6 @@ _RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen)
   }
 }
 
-template <unsigned FIRST_TID, unsigned LAST_TID, class IdxT>
-_RAFT_DEVICE inline void init(IdxT* const table, const uint32_t bitlen)
-{
-  if ((FIRST_TID > 0 && threadIdx.x < FIRST_TID) || threadIdx.x >= LAST_TID) return;
-  for (unsigned i = threadIdx.x - FIRST_TID; i < get_size(bitlen); i += LAST_TID - FIRST_TID) {
-    table[i] = utils::get_max_value<IdxT>();
-  }
-}
-
 template <class IdxT>
 _RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, const IdxT key)
 {
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
index 314ab6e6a6..c6478bef84 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
@@ -48,42 +48,43 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
-
-struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_queries;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::itopk_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::algo;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::team_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::search_width;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::min_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::thread_block_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_mode;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_random_samplings;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::rand_xor_mask;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::graph_degree;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::topk;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hash_bitlen;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_reset_interval;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dataset_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::result_buffer_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::smem_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_executed_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dev_seed;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_seeds;
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
+
+struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T> {
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_queries;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::itopk_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::algo;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::team_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::search_width;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::min_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::thread_block_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_mode;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_random_samplings;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::rand_xor_mask;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::graph_degree;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::topk;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hash_bitlen;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dataset_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::result_buffer_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::smem_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_executed_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dev_seed;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_seeds;
 
   uint32_t num_cta_per_query;
   rmm::device_uvector<INDEX_T> intermediate_indices;
@@ -96,7 +97,8 @@ struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
          int64_t dim,
          int64_t graph_degree,
          uint32_t topk)
-    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>(res, params, dim, graph_degree, topk),
+    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>(
+        res, params, dim, graph_degree, topk),
       intermediate_indices(0, resource::get_cuda_stream(res)),
       intermediate_distances(0, resource::get_cuda_stream(res)),
       topk_workspace(0, resource::get_cuda_stream(res))
@@ -107,9 +109,10 @@ struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
 
   void set_params(raft::resources const& res, const search_params& params)
   {
-    this->itopk_size   = 32;
-    search_width       = 1;
-    num_cta_per_query  = max(params.search_width, params.itopk_size / 32);
+    constexpr unsigned muti_cta_itopk_size = 32;
+    this->itopk_size                       = muti_cta_itopk_size;
+    search_width                           = 1;
+    num_cta_per_query  = max(params.search_width, params.itopk_size / muti_cta_itopk_size);
     result_buffer_size = itopk_size + search_width * graph_degree;
     typedef raft::Pow2<32> AlignBytes;
     unsigned result_buffer_size_32 = AlignBytes::roundUp(result_buffer_size);
@@ -190,13 +193,14 @@ struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
   void operator()(raft::resources const& res,
                   raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
                   raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,
-                  INDEX_T* const topk_indices_ptr,          // [num_queries, topk]
-                  DISTANCE_T* const topk_distances_ptr,     // [num_queries, topk]
-                  const DATA_T* const queries_ptr,          // [num_queries, dataset_dim]
+                  INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
+                  DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
+                  const DATA_T* const queries_ptr,       // [num_queries, dataset_dim]
                   const uint32_t num_queries,
                   const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
                   uint32_t* const num_executed_iterations,  // [num_queries,]
-                  uint32_t topk)
+                  uint32_t topk,
+                  SAMPLE_FILTER_T sample_filter)
   {
     cudaStream_t stream = resource::get_cuda_stream(res);
 
@@ -223,6 +227,7 @@ struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
       search_width,
       min_iterations,
       max_iterations,
+      sample_filter,
       stream);
     RAFT_CUDA_TRY(cudaPeekAtLastError());
 
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
index de83acbb64..ee525587d7 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
@@ -15,7 +15,8 @@
  */
 #pragma once
 
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+#include <raft/neighbors/sample_filter_types.hpp>  // none_cagra_sample_filter
+#include <raft/util/raft_explicit.hpp>             // RAFT_EXPLICIT
 
 namespace raft::neighbors::cagra::detail {
 namespace multi_cta_search {
@@ -26,7 +27,8 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           class DATA_T,
           class INDEX_T,
-          class DISTANCE_T>
+          class DISTANCE_T,
+          class SAMPLE_FILTER_T>
 void select_and_run(raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
                     raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,
                     INDEX_T* const topk_indices_ptr,
@@ -49,47 +51,63 @@ void select_and_run(raft::device_matrix_view<const DATA_T, int64_t, layout_strid
                     size_t search_width,
                     size_t min_iterations,
                     size_t max_iterations,
+                    SAMPLE_FILTER_T sample_filter,
                     cudaStream_t stream) RAFT_EXPLICIT;
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)   \
-  extern template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                     \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                          \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t result_buffer_size,                                                                \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    uint32_t num_cta_per_query,                                                                 \
-    uint32_t num_random_samplings,                                                              \
-    uint64_t rand_xor_mask,                                                                     \
-    uint32_t num_seeds,                                                                         \
-    size_t itopk_size,                                                                          \
-    size_t search_width,                                                                        \
-    size_t min_iterations,                                                                      \
-    size_t max_iterations,                                                                      \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  extern template void                                                                      \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 1024, float, uint32_t, float);
-instantiate_kernel_selection(8, 128, float, uint32_t, float);
-instantiate_kernel_selection(16, 256, float, uint32_t, float);
-instantiate_kernel_selection(32, 512, float, uint32_t, float);
-instantiate_kernel_selection(32, 1024, int8_t, uint32_t, float);
-instantiate_kernel_selection(8, 128, int8_t, uint32_t, float);
-instantiate_kernel_selection(16, 256, int8_t, uint32_t, float);
-instantiate_kernel_selection(32, 512, int8_t, uint32_t, float);
-instantiate_kernel_selection(32, 1024, uint8_t, uint32_t, float);
-instantiate_kernel_selection(8, 128, uint8_t, uint32_t, float);
-instantiate_kernel_selection(16, 256, uint8_t, uint32_t, float);
-instantiate_kernel_selection(32, 512, uint8_t, uint32_t, float);
+instantiate_kernel_selection(
+  32, 1024, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  8, 128, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  16, 256, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 512, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 1024, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  8, 128, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  16, 256, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 512, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 1024, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  8, 128, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  16, 256, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 512, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 }  // namespace multi_cta_search
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
index 0015b4a791..358a183971 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
@@ -26,6 +26,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 #include <vector>
 
@@ -48,7 +49,7 @@ namespace multi_cta_search {
 template <class INDEX_T>
 __device__ void pickup_next_parents(INDEX_T* const next_parent_indices,  // [search_width]
                                     const uint32_t search_width,
-                                    INDEX_T* const itopk_indices,        // [num_itopk]
+                                    INDEX_T* const itopk_indices,  // [num_itopk]
                                     const size_t num_itopk,
                                     uint32_t* const terminate_flag)
 {
@@ -75,7 +76,7 @@ __device__ void pickup_next_parents(INDEX_T* const next_parent_indices,  // [sea
     if (new_parent) {
       const auto i = __popc(ballot_mask & ((1 << lane_id) - 1)) + num_new_parents;
       if (i < search_width) {
-        next_parent_indices[i] = index;
+        next_parent_indices[i] = j;
         itopk_indices[j] |= index_msb_1_mask;  // set most significant bit as used node
       }
     }
@@ -86,8 +87,8 @@ __device__ void pickup_next_parents(INDEX_T* const next_parent_indices,  // [sea
 }
 
 template <unsigned MAX_ELEMENTS, class INDEX_T>
-__device__ inline void topk_by_bitonic_sort(float* distances,         // [num_elements]
-                                            INDEX_T* indices,         // [num_elements]
+__device__ inline void topk_by_bitonic_sort(float* distances,  // [num_elements]
+                                            INDEX_T* indices,  // [num_elements]
                                             const uint32_t num_elements,
                                             const uint32_t num_itopk  // num_itopk <= num_elements
 )
@@ -124,15 +125,14 @@ __device__ inline void topk_by_bitonic_sort(float* distances,         // [num_el
 // multiple CTAs per single query
 //
 template <unsigned TEAM_SIZE,
-          unsigned BLOCK_SIZE,
-          unsigned BLOCK_COUNT,
           unsigned MAX_ELEMENTS,
           unsigned MAX_DATASET_DIM,
           class DATA_T,
           class DISTANCE_T,
           class INDEX_T,
-          class LOAD_T>
-__launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(
+          class LOAD_T,
+          class SAMPLE_FILTER_T>
+__launch_bounds__(1024, 1) __global__ void search_kernel(
   INDEX_T* const result_indices_ptr,       // [num_queries, num_cta_per_query, itopk_size]
   DISTANCE_T* const result_distances_ptr,  // [num_queries, num_cta_per_query, itopk_size]
   const DATA_T* const dataset_ptr,         // [dataset_size, dataset_dim]
@@ -144,7 +144,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(
   const uint32_t graph_degree,
   const unsigned num_distilation,
   const uint64_t rand_xor_mask,
-  const INDEX_T* seed_ptr,             // [num_queries, num_seeds]
+  const INDEX_T* seed_ptr,  // [num_queries, num_seeds]
   const uint32_t num_seeds,
   INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
   const uint32_t hash_bitlen,
@@ -152,10 +152,9 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(
   const uint32_t search_width,
   const uint32_t min_iteration,
   const uint32_t max_iteration,
-  uint32_t* const num_executed_iterations /* stats */
-)
+  uint32_t* const num_executed_iterations, /* stats */
+  SAMPLE_FILTER_T sample_filter)
 {
-  assert(blockDim.x == BLOCK_SIZE);
   assert(dataset_dim <= MAX_DATASET_DIM);
 
   const auto num_queries       = gridDim.y;
@@ -207,7 +206,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(
     }
 #endif
   const DATA_T* const query_ptr = queries_ptr + (dataset_dim * query_id);
-  for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += BLOCK_SIZE) {
+  for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += blockDim.x) {
     unsigned j = device::swizzling(i);
     if (i < dataset_dim) {
       query_buffer[j] = spatial::knn::detail::utils::mapping<float>{}(query_ptr[i]);
@@ -274,27 +273,70 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(
     _CLK_START();
     // constexpr unsigned max_n_frags = 16;
     constexpr unsigned max_n_frags = 0;
-    device::
-      compute_distance_to_child_nodes<TEAM_SIZE, BLOCK_SIZE, MAX_DATASET_DIM, max_n_frags, LOAD_T>(
-        result_indices_buffer + itopk_size,
-        result_distances_buffer + itopk_size,
-        query_buffer,
-        dataset_ptr,
-        dataset_dim,
-        dataset_ld,
-        knn_graph,
-        graph_degree,
-        local_visited_hashmap_ptr,
-        hash_bitlen,
-        parent_indices_buffer,
-        search_width);
+    device::compute_distance_to_child_nodes<TEAM_SIZE, MAX_DATASET_DIM, max_n_frags, LOAD_T>(
+      result_indices_buffer + itopk_size,
+      result_distances_buffer + itopk_size,
+      query_buffer,
+      dataset_ptr,
+      dataset_dim,
+      dataset_ld,
+      knn_graph,
+      graph_degree,
+      local_visited_hashmap_ptr,
+      hash_bitlen,
+      parent_indices_buffer,
+      result_indices_buffer,
+      search_width);
     _CLK_REC(clk_compute_distance);
     __syncthreads();
 
+    // Filtering
+    if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                                raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+      constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+      const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
+
+      for (unsigned p = threadIdx.x; p < search_width; p += blockDim.x) {
+        if (parent_indices_buffer[p] != invalid_index) {
+          const auto parent_id =
+            result_indices_buffer[parent_indices_buffer[p]] & ~index_msb_1_mask;
+          if (!sample_filter(query_id, parent_id)) {
+            // If the parent must not be in the resulting top-k list, remove from the parent list
+            result_distances_buffer[parent_indices_buffer[p]] = utils::get_max_value<DISTANCE_T>();
+            result_indices_buffer[parent_indices_buffer[p]]   = invalid_index;
+          }
+        }
+      }
+      __syncthreads();
+    }
+
     iter++;
   }
 
-  for (uint32_t i = threadIdx.x; i < itopk_size; i += BLOCK_SIZE) {
+  // Post process for filtering
+  if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                              raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+    constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+    const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
+
+    for (unsigned i = threadIdx.x; i < itopk_size + search_width * graph_degree; i += blockDim.x) {
+      const auto node_id = result_indices_buffer[i] & ~index_msb_1_mask;
+      if (node_id != (invalid_index & ~index_msb_1_mask) && !sample_filter(query_id, node_id)) {
+        // If the node must not be in the resulting top-k list, remove it from the result buffer
+        result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
+        result_indices_buffer[i]   = invalid_index;
+      }
+    }
+
+    __syncthreads();
+    topk_by_bitonic_sort<MAX_ELEMENTS, INDEX_T>(result_distances_buffer,
+                                                result_indices_buffer,
+                                                itopk_size + (search_width * graph_degree),
+                                                itopk_size);
+    __syncthreads();
+  }
+
+  for (uint32_t i = threadIdx.x; i < itopk_size; i += blockDim.x) {
     uint32_t j = i + (itopk_size * (cta_id + (num_cta_per_query * query_id)));
     if (result_distances_ptr != nullptr) { result_distances_ptr[j] = result_distances_buffer[i]; }
     constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
@@ -361,88 +403,52 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
 struct search_kernel_config {
   // Search kernel function type. Note that the actual values for the template value
   // parameters do not matter, because they are not part of the function signature. The
   // second to fourth value parameters will be selected by the choose_* functions below.
   using kernel_t = decltype(&search_kernel<TEAM_SIZE,
-                                           64,
-                                           16,
                                            128,
                                            MAX_DATASET_DIM,
                                            DATA_T,
                                            DISTANCE_T,
                                            INDEX_T,
-                                           device::LOAD_128BIT_T>);
+                                           device::LOAD_128BIT_T,
+                                           SAMPLE_FILTER_T>);
 
   static auto choose_buffer_size(unsigned result_buffer_size, unsigned block_size) -> kernel_t
   {
     if (result_buffer_size <= 64) {
-      return choose_max_elements<64>(block_size);
-    } else if (result_buffer_size <= 128) {
-      return choose_max_elements<128>(block_size);
-    } else if (result_buffer_size <= 256) {
-      return choose_max_elements<256>(block_size);
-    }
-    THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
-  }
-
-  template <unsigned MAX_ELEMENTS>
-  // Todo: rename this to choose block_size
-  static auto choose_max_elements(unsigned block_size) -> kernel_t
-  {
-    if (block_size == 64) {
       return search_kernel<TEAM_SIZE,
                            64,
-                           16,
-                           MAX_ELEMENTS,
                            MAX_DATASET_DIM,
                            DATA_T,
                            DISTANCE_T,
                            INDEX_T,
-                           device::LOAD_128BIT_T>;
-    } else if (block_size == 128) {
+                           device::LOAD_128BIT_T,
+                           SAMPLE_FILTER_T>;
+    } else if (result_buffer_size <= 128) {
       return search_kernel<TEAM_SIZE,
                            128,
-                           8,
-                           MAX_ELEMENTS,
                            MAX_DATASET_DIM,
                            DATA_T,
                            DISTANCE_T,
                            INDEX_T,
-                           device::LOAD_128BIT_T>;
-    } else if (block_size == 256) {
+                           device::LOAD_128BIT_T,
+                           SAMPLE_FILTER_T>;
+    } else if (result_buffer_size <= 256) {
       return search_kernel<TEAM_SIZE,
                            256,
-                           4,
-                           MAX_ELEMENTS,
-                           MAX_DATASET_DIM,
-                           DATA_T,
-                           DISTANCE_T,
-                           INDEX_T,
-                           device::LOAD_128BIT_T>;
-    } else if (block_size == 512) {
-      return search_kernel<TEAM_SIZE,
-                           512,
-                           2,
-                           MAX_ELEMENTS,
-                           MAX_DATASET_DIM,
-                           DATA_T,
-                           DISTANCE_T,
-                           INDEX_T,
-                           device::LOAD_128BIT_T>;
-    } else {
-      return search_kernel<TEAM_SIZE,
-                           1024,
-                           1,
-                           MAX_ELEMENTS,
                            MAX_DATASET_DIM,
                            DATA_T,
                            DISTANCE_T,
                            INDEX_T,
-                           device::LOAD_128BIT_T>;
+                           device::LOAD_128BIT_T,
+                           SAMPLE_FILTER_T>;
     }
+    THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
   }
 };
 
@@ -450,13 +456,14 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
 void select_and_run(  // raft::resources const& res,
   raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
   raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,
-  INDEX_T* const topk_indices_ptr,          // [num_queries, topk]
-  DISTANCE_T* const topk_distances_ptr,     // [num_queries, topk]
-  const DATA_T* const queries_ptr,          // [num_queries, dataset_dim]
+  INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
+  DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
+  const DATA_T* const queries_ptr,       // [num_queries, dataset_dim]
   const uint32_t num_queries,
   const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
   uint32_t* const num_executed_iterations,  // [num_queries,]
@@ -475,10 +482,12 @@ void select_and_run(  // raft::resources const& res,
   size_t search_width,
   size_t min_iterations,
   size_t max_iterations,
+  SAMPLE_FILTER_T sample_filter,
   cudaStream_t stream)
 {
-  auto kernel = search_kernel_config<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>::
-    choose_buffer_size(result_buffer_size, block_size);
+  auto kernel =
+    search_kernel_config<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::
+      choose_buffer_size(result_buffer_size, block_size);
 
   RAFT_CUDA_TRY(
     cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
@@ -489,7 +498,7 @@ void select_and_run(  // raft::resources const& res,
 
   dim3 block_dims(block_size, 1, 1);
   dim3 grid_dims(num_cta_per_query, num_queries, 1);
-  RAFT_LOG_DEBUG("Launching kernel with %u threads, (%u, %u) blocks %lu smem",
+  RAFT_LOG_DEBUG("Launching kernel with %u threads, (%u, %u) blocks %u smem",
                  block_size,
                  num_cta_per_query,
                  num_queries,
@@ -513,7 +522,8 @@ void select_and_run(  // raft::resources const& res,
                                                        search_width,
                                                        min_iterations,
                                                        max_iterations,
-                                                       num_executed_iterations);
+                                                       num_executed_iterations,
+                                                       sample_filter);
 }
 
 }  // namespace multi_cta_search
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
index e664764721..5dcfcb3929 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -25,6 +25,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/neighbors/sample_filter_types.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <vector>
@@ -98,7 +99,7 @@ __global__ void random_pickup_kernel(
   const std::size_t num_pickup,
   const unsigned num_distilation,
   const uint64_t rand_xor_mask,
-  const INDEX_T* seed_ptr,                 // [num_queries, num_seeds]
+  const INDEX_T* seed_ptr,  // [num_queries, num_seeds]
   const uint32_t num_seeds,
   INDEX_T* const result_indices_ptr,       // [num_queries, ldr]
   DISTANCE_T* const result_distances_ptr,  // [num_queries, ldr]
@@ -170,7 +171,7 @@ void random_pickup(const DATA_T* const dataset_ptr,  // [dataset_size, dataset_d
                    const std::size_t num_pickup,
                    const unsigned num_distilation,
                    const uint64_t rand_xor_mask,
-                   const INDEX_T* seed_ptr,                 // [num_queries, num_seeds]
+                   const INDEX_T* seed_ptr,  // [num_queries, num_seeds]
                    const uint32_t num_seeds,
                    INDEX_T* const result_indices_ptr,       // [num_queries, ldr]
                    DISTANCE_T* const result_distances_ptr,  // [num_queries, ldr]
@@ -242,7 +243,7 @@ __global__ void pickup_next_parents_kernel(
       if (new_parent) {
         const auto i = __popc(ballot_mask & ((1 << threadIdx.x) - 1)) + num_new_parents;
         if (i < parent_list_size) {
-          parent_list_ptr[i + (ldd * query_id)] = index;
+          parent_list_ptr[i + (ldd * query_id)] = j;
           parent_candidates_ptr[j + (lds * query_id)] |=
             index_msb_1_mask;  // set most significant bit as used node
         }
@@ -253,7 +254,7 @@ __global__ void pickup_next_parents_kernel(
     if ((num_new_parents > 0) && (threadIdx.x == 0)) { *terminate_flag = 0; }
   } else if (small_hash_bitlen) {
     // reset small-hash
-    hashmap::init<32>(visited_hashmap_ptr + (ldb * query_id), hash_bitlen);
+    hashmap::init(visited_hashmap_ptr + (ldb * query_id), hash_bitlen, 32);
   }
 
   if (small_hash_bitlen) {
@@ -306,31 +307,44 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           class DATA_T,
           class INDEX_T,
-          class DISTANCE_T>
+          class DISTANCE_T,
+          class SAMPLE_FILTER_T>
 __global__ void compute_distance_to_child_nodes_kernel(
   const INDEX_T* const parent_node_list,  // [num_queries, search_width]
+  INDEX_T* const parent_candidates_ptr,   // [num_queries, search_width]
+  DISTANCE_T* const parent_distance_ptr,  // [num_queries, search_width]
+  const std::size_t lds,
   const std::uint32_t search_width,
-  const DATA_T* const dataset_ptr,        // [dataset_size, data_dim]
+  const DATA_T* const dataset_ptr,  // [dataset_size, data_dim]
   const std::uint32_t data_dim,
   const std::uint32_t dataset_size,
   const std::uint32_t dataset_ld,
   const INDEX_T* const neighbor_graph_ptr,  // [dataset_size, graph_degree]
   const std::uint32_t graph_degree,
-  const DATA_T* query_ptr,                  // [num_queries, data_dim]
-  INDEX_T* const visited_hashmap_ptr,       // [num_queries, 1 << hash_bitlen]
+  const DATA_T* query_ptr,             // [num_queries, data_dim]
+  INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
   const std::uint32_t hash_bitlen,
-  INDEX_T* const result_indices_ptr,        // [num_queries, ldd]
-  DISTANCE_T* const result_distances_ptr,   // [num_queries, ldd]
-  const std::uint32_t ldd                   // (*) ldd >= search_width * graph_degree
-)
+  INDEX_T* const result_indices_ptr,       // [num_queries, ldd]
+  DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
+  const std::uint32_t ldd,                 // (*) ldd >= search_width * graph_degree
+  SAMPLE_FILTER_T sample_filter)
 {
   const uint32_t ldb        = hashmap::get_size(hash_bitlen);
   const auto tid            = threadIdx.x + blockDim.x * blockIdx.x;
   const auto global_team_id = tid / TEAM_SIZE;
+  const auto query_id       = blockIdx.y;
+
   if (global_team_id >= search_width * graph_degree) { return; }
 
-  const std::size_t parent_index =
+  const std::size_t parent_list_index =
     parent_node_list[global_team_id / graph_degree + (search_width * blockIdx.y)];
+
+  if (parent_list_index == utils::get_max_value<INDEX_T>()) { return; }
+
+  constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+  const auto parent_index =
+    parent_candidates_ptr[parent_list_index + (lds * query_id)] & ~index_msb_1_mask;
+
   if (parent_index == utils::get_max_value<INDEX_T>()) {
     result_distances_ptr[ldd * blockIdx.y + global_team_id] = utils::get_max_value<DISTANCE_T>();
     return;
@@ -361,29 +375,43 @@ __global__ void compute_distance_to_child_nodes_kernel(
       result_distances_ptr[ldd * blockIdx.y + global_team_id] = utils::get_max_value<DISTANCE_T>();
     }
   }
+
+  if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                              raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+    if (!sample_filter(query_id, parent_index)) {
+      parent_candidates_ptr[parent_list_index + (lds * query_id)] = utils::get_max_value<INDEX_T>();
+      parent_distance_ptr[parent_list_index + (lds * query_id)] =
+        utils::get_max_value<DISTANCE_T>();
+    }
+  }
 }
 
 template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           class DATA_T,
           class INDEX_T,
-          class DISTANCE_T>
+          class DISTANCE_T,
+          class SAMPLE_FILTER_T>
 void compute_distance_to_child_nodes(
   const INDEX_T* const parent_node_list,  // [num_queries, search_width]
+  INDEX_T* const parent_candidates_ptr,   // [num_queries, search_width]
+  DISTANCE_T* const parent_distance_ptr,  // [num_queries, search_width]
+  const std::size_t lds,
   const uint32_t search_width,
-  const DATA_T* const dataset_ptr,        // [dataset_size, data_dim]
+  const DATA_T* const dataset_ptr,  // [dataset_size, data_dim]
   const std::uint32_t data_dim,
   const std::uint32_t dataset_size,
   const std::uint32_t dataset_ld,
   const INDEX_T* const neighbor_graph_ptr,  // [dataset_size, graph_degree]
   const std::uint32_t graph_degree,
-  const DATA_T* query_ptr,                  // [num_queries, data_dim]
+  const DATA_T* query_ptr,  // [num_queries, data_dim]
   const std::uint32_t num_queries,
-  INDEX_T* const visited_hashmap_ptr,       // [num_queries, 1 << hash_bitlen]
+  INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
   const std::uint32_t hash_bitlen,
-  INDEX_T* const result_indices_ptr,        // [num_queries, ldd]
-  DISTANCE_T* const result_distances_ptr,   // [num_queries, ldd]
-  const std::uint32_t ldd,                  // (*) ldd >= search_width * graph_degree
+  INDEX_T* const result_indices_ptr,       // [num_queries, ldd]
+  DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
+  const std::uint32_t ldd,                 // (*) ldd >= search_width * graph_degree
+  SAMPLE_FILTER_T sample_filter,
   cudaStream_t cuda_stream = 0)
 {
   const auto block_size = 128;
@@ -392,6 +420,9 @@ void compute_distance_to_child_nodes(
     num_queries);
   compute_distance_to_child_nodes_kernel<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>
     <<<grid_size, block_size, 0, cuda_stream>>>(parent_node_list,
+                                                parent_candidates_ptr,
+                                                parent_distance_ptr,
+                                                lds,
                                                 search_width,
                                                 dataset_ptr,
                                                 data_dim,
@@ -404,7 +435,8 @@ void compute_distance_to_child_nodes(
                                                 hash_bitlen,
                                                 result_indices_ptr,
                                                 result_distances_ptr,
-                                                ldd);
+                                                ldd,
+                                                sample_filter);
 }
 
 template <class INDEX_T>
@@ -436,8 +468,52 @@ void remove_parent_bit(const std::uint32_t num_queries,
     num_queries, num_topk, topk_indices_ptr, ld);
 }
 
+// Run after `remove_parent_bit`: overwrites results rejected by `sample_filter` with max values.
+template <class INDEX_T, class DISTANCE_T, class SAMPLE_FILTER_T>
+__global__ void apply_filter_kernel(INDEX_T* const result_indices_ptr,       // [num_queries, lds]
+                                    DISTANCE_T* const result_distances_ptr,  // [num_queries, lds]
+                                    const std::size_t lds,                   // row stride
+                                    const std::uint32_t result_buffer_size,  // entries per row
+                                    const std::uint32_t num_queries,
+                                    const INDEX_T query_id_offset,           // added to row id
+                                    SAMPLE_FILTER_T sample_filter)
+{
+  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;  // one thread per (row, entry) pair
+  if (tid >= result_buffer_size * num_queries) { return; }  // NOTE(review): 32-bit product
+  const auto i     = tid % result_buffer_size;  // entry position inside the row
+  const auto j     = tid / result_buffer_size;  // row (query) number
+  const auto index = i + j * lds;               // flat offset of entry i in row j
+
+  if (!sample_filter(query_id_offset + j, result_indices_ptr[index])) {  // rejected by filter
+    result_indices_ptr[index]   = utils::get_max_value<INDEX_T>();
+    result_distances_ptr[index] = utils::get_max_value<DISTANCE_T>();
+  }
+}
+
+template <class INDEX_T, class DISTANCE_T, class SAMPLE_FILTER_T>
+void apply_filter(INDEX_T* const result_indices_ptr,       // [num_queries, lds], updated in place
+                  DISTANCE_T* const result_distances_ptr,  // [num_queries, lds], updated in place
+                  const std::size_t lds,                   // row stride of both buffers
+                  const std::uint32_t result_buffer_size,  // entries filtered per query
+                  const std::uint32_t num_queries,
+                  const INDEX_T query_id_offset,           // added to the row number for the filter
+                  SAMPLE_FILTER_T sample_filter,
+                  cudaStream_t cuda_stream)                // stream the kernel is launched on
+{
+  const std::uint32_t block_size = 256;  // threads per block; one thread per result entry
+  const std::uint32_t grid_size  = ceildiv(num_queries * result_buffer_size, block_size);
+
+  apply_filter_kernel<<<grid_size, block_size, 0, cuda_stream>>>(result_indices_ptr,
+                                                                 result_distances_ptr,
+                                                                 lds,
+                                                                 result_buffer_size,
+                                                                 num_queries,
+                                                                 query_id_offset,
+                                                                 sample_filter);
+}
+
 template <class T>
-__global__ void batched_memcpy_kernel(T* const dst,        // [batch_size, ld_dst]
+__global__ void batched_memcpy_kernel(T* const dst,  // [batch_size, ld_dst]
                                       const uint64_t ld_dst,
                                       const T* const src,  // [batch_size, ld_src]
                                       const uint64_t ld_src,
@@ -452,7 +528,7 @@ __global__ void batched_memcpy_kernel(T* const dst,        // [batch_size, ld_ds
 }
 
 template <class T>
-void batched_memcpy(T* const dst,        // [batch_size, ld_dst]
+void batched_memcpy(T* const dst,  // [batch_size, ld_dst]
                     const uint64_t ld_dst,
                     const T* const src,  // [batch_size, ld_src]
                     const uint64_t ld_src,
@@ -508,41 +584,42 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
-struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_queries;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::itopk_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::algo;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::team_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::search_width;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::min_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::thread_block_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_mode;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_random_samplings;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::rand_xor_mask;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::graph_degree;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::topk;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hash_bitlen;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_reset_interval;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dataset_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::result_buffer_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::smem_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_executed_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dev_seed;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_seeds;
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
+struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T> {
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_queries;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::itopk_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::algo;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::team_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::search_width;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::min_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::thread_block_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_mode;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_random_samplings;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::rand_xor_mask;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::graph_degree;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::topk;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hash_bitlen;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dataset_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::result_buffer_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::smem_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_executed_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dev_seed;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_seeds;
 
   size_t result_buffer_allocation_size;
   rmm::device_uvector<INDEX_T> result_indices;  // results_indices_buffer
@@ -557,7 +634,8 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
          int64_t dim,
          int64_t graph_degree,
          uint32_t topk)
-    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>(res, params, dim, graph_degree, topk),
+    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>(
+        res, params, dim, graph_degree, topk),
       result_indices(0, resource::get_cuda_stream(res)),
       result_distances(0, resource::get_cuda_stream(res)),
       parent_node_list(0, resource::get_cuda_stream(res)),
@@ -596,13 +674,14 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
   void operator()(raft::resources const& res,
                   raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
                   raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,
-                  INDEX_T* const topk_indices_ptr,          // [num_queries, topk]
-                  DISTANCE_T* const topk_distances_ptr,     // [num_queries, topk]
-                  const DATA_T* const queries_ptr,          // [num_queries, dataset_dim]
+                  INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
+                  DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
+                  const DATA_T* const queries_ptr,       // [num_queries, dataset_dim]
                   const uint32_t num_queries,
                   const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
                   uint32_t* const num_executed_iterations,  // [num_queries,]
-                  uint32_t topk)
+                  uint32_t topk,
+                  SAMPLE_FILTER_T sample_filter)
   {
     // Init hashmap
     cudaStream_t stream      = resource::get_cuda_stream(res);
@@ -684,6 +763,9 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
       // Compute distance to child nodes that are adjacent to the parent node
       compute_distance_to_child_nodes<TEAM_SIZE, MAX_DATASET_DIM>(
         parent_node_list.data(),
+        result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size,
+        result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size,
+        result_buffer_allocation_size,
         search_width,
         dataset.data_handle(),
         dataset.extent(1),
@@ -698,22 +780,53 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
         result_indices.data() + itopk_size,
         result_distances.data() + itopk_size,
         result_buffer_allocation_size,
+        sample_filter,
         stream);
 
       iter++;
     }  // while ( 1 )
+    auto result_indices_ptr   = result_indices.data() + (iter & 0x1) * result_buffer_size;
+    auto result_distances_ptr = result_distances.data() + (iter & 0x1) * result_buffer_size;
 
     // Remove parent bit in search results
-    remove_parent_bit(num_queries,
-                      itopk_size,
-                      result_indices.data() + (iter & 0x1) * result_buffer_size,
-                      result_buffer_allocation_size,
-                      stream);
+    remove_parent_bit(
+      num_queries, itopk_size, result_indices_ptr, result_buffer_allocation_size, stream);
+
+    if (!std::is_same<SAMPLE_FILTER_T,
+                      raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+      apply_filter<INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>(
+        result_indices.data() + (iter & 0x1) * itopk_size,
+        result_distances.data() + (iter & 0x1) * itopk_size,
+        result_buffer_allocation_size,
+        result_buffer_size,
+        num_queries,
+        0,
+        sample_filter,
+        stream);
+
+      result_indices_ptr   = result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size;
+      result_distances_ptr = result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size;
+      _cuann_find_topk(itopk_size,
+                       num_queries,
+                       result_buffer_size,
+                       result_distances.data() + (iter & 0x1) * itopk_size,
+                       result_buffer_allocation_size,
+                       result_indices.data() + (iter & 0x1) * itopk_size,
+                       result_buffer_allocation_size,
+                       result_distances_ptr,
+                       result_buffer_allocation_size,
+                       result_indices_ptr,
+                       result_buffer_allocation_size,
+                       topk_workspace.data(),
+                       true,
+                       topk_hint.data(),
+                       stream);
+    }
 
     // Copy results from working buffer to final buffer
     batched_memcpy(topk_indices_ptr,
                    topk,
-                   result_indices.data() + (iter & 0x1) * result_buffer_size,
+                   result_indices_ptr,
                    result_buffer_allocation_size,
                    topk,
                    num_queries,
@@ -721,7 +834,7 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
     if (topk_distances_ptr) {
       batched_memcpy(topk_distances_ptr,
                      topk,
-                     result_distances.data() + (iter & 0x1) * result_buffer_size,
+                     result_distances_ptr,
                      result_buffer_allocation_size,
                      topk,
                      num_queries,
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
index e6966987c8..a0f346ab51 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
@@ -38,12 +38,13 @@ struct search_plan_impl_base : public search_params {
   {
     set_max_dim_team(dim);
     if (algo == search_algo::AUTO) {
-      if (itopk_size <= 512) {
+      const size_t num_sm = raft::getMultiProcessorCount();
+      if (itopk_size <= 512 && search_params::max_queries >= num_sm * 2lu) {
         algo = search_algo::SINGLE_CTA;
         RAFT_LOG_DEBUG("Auto strategy: selecting single-cta");
       } else {
-        algo = search_algo::MULTI_KERNEL;
-        RAFT_LOG_DEBUG("Auto strategy: selecting multi-kernel");
+        algo = search_algo::MULTI_CTA;
+        RAFT_LOG_DEBUG("Auto strategy: selecting multi-cta");
       }
     }
   }
@@ -65,7 +66,7 @@ struct search_plan_impl_base : public search_params {
   }
 };
 
-template <class DATA_T, class INDEX_T, class DISTANCE_T>
+template <class DATA_T, class INDEX_T, class DISTANCE_T, class SAMPLE_FILTER_T>
 struct search_plan_impl : public search_plan_impl_base {
   int64_t hash_bitlen;
 
@@ -111,9 +112,10 @@ struct search_plan_impl : public search_plan_impl_base {
                           DISTANCE_T* const result_distances_ptr,  // [num_queries, topk]
                           const DATA_T* const queries_ptr,         // [num_queries, dataset_dim]
                           const std::uint32_t num_queries,
-                          const INDEX_T* dev_seed_ptr,             // [num_queries, num_seeds]
+                          const INDEX_T* dev_seed_ptr,                   // [num_queries, num_seeds]
                           std::uint32_t* const num_executed_iterations,  // [num_queries]
-                          uint32_t topk){};
+                          uint32_t topk,
+                          SAMPLE_FILTER_T sample_filter){};
 
   void adjust_search_params()
   {
@@ -129,13 +131,13 @@ struct search_plan_impl : public search_plan_impl_base {
     if (max_iterations < min_iterations) { _max_iterations = min_iterations; }
     if (max_iterations < _max_iterations) {
       RAFT_LOG_DEBUG(
-        "# max_iterations is increased from %u to %u.", max_iterations, _max_iterations);
+        "# max_iterations is increased from %lu to %u.", max_iterations, _max_iterations);
       max_iterations = _max_iterations;
     }
     if (itopk_size % 32) {
       uint32_t itopk32 = itopk_size;
       itopk32 += 32 - (itopk_size % 32);
-      RAFT_LOG_DEBUG("# internal_topk is increased from %u to %u, as it must be multiple of 32.",
+      RAFT_LOG_DEBUG("# internal_topk is increased from %lu to %u, as it must be multiple of 32.",
                      itopk_size,
                      itopk32);
       itopk_size = itopk32;
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
index 96de83369d..b36bc6f77b 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
@@ -49,41 +49,42 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
-struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_queries;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::itopk_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::algo;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::team_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::search_width;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::min_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::thread_block_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_mode;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_random_samplings;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::rand_xor_mask;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::graph_degree;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::topk;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hash_bitlen;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_reset_interval;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dataset_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::result_buffer_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::smem_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_executed_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dev_seed;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_seeds;
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
+struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T> {
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_queries;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::itopk_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::algo;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::team_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::search_width;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::min_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::thread_block_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_mode;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_random_samplings;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::rand_xor_mask;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::graph_degree;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::topk;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hash_bitlen;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dataset_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::result_buffer_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::smem_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_executed_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dev_seed;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_seeds;
 
   uint32_t num_itopk_candidates;
 
@@ -92,7 +93,8 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
          int64_t dim,
          int64_t graph_degree,
          uint32_t topk)
-    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>(res, params, dim, graph_degree, topk)
+    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>(
+        res, params, dim, graph_degree, topk)
   {
     set_params(res);
   }
@@ -111,7 +113,7 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
     RAFT_EXPECTS(itopk_size <= max_itopk, "itopk_size cannot be larger than %u", max_itopk);
 
     RAFT_LOG_DEBUG("# num_itopk_candidates: %u", num_itopk_candidates);
-    RAFT_LOG_DEBUG("# num_itopk: %u", itopk_size);
+    RAFT_LOG_DEBUG("# num_itopk: %lu", itopk_size);
     //
     // Determine the thread block size
     //
@@ -129,11 +131,9 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
       // Tentatively calculate the required share memory size when radix
       // sort based topk is used, assuming the block size is the maximum.
       if (itopk_size <= 256) {
-        smem_size +=
-          topk_by_radix_sort<256, max_block_size, INDEX_T>::smem_size * sizeof(std::uint32_t);
+        smem_size += topk_by_radix_sort<256, INDEX_T>::smem_size * sizeof(std::uint32_t);
       } else {
-        smem_size +=
-          topk_by_radix_sort<512, max_block_size, INDEX_T>::smem_size * sizeof(std::uint32_t);
+        smem_size += topk_by_radix_sort<512, INDEX_T>::smem_size * sizeof(std::uint32_t);
       }
     }
 
@@ -186,34 +186,10 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
       smem_size = base_smem_size;
       if (itopk_size <= 256) {
         constexpr unsigned MAX_ITOPK = 256;
-        if (block_size == 256) {
-          constexpr unsigned BLOCK_SIZE = 256;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        } else if (block_size == 512) {
-          constexpr unsigned BLOCK_SIZE = 512;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        } else {
-          constexpr unsigned BLOCK_SIZE = 1024;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        }
+        smem_size += topk_by_radix_sort<MAX_ITOPK, INDEX_T>::smem_size * sizeof(std::uint32_t);
       } else {
         constexpr unsigned MAX_ITOPK = 512;
-        if (block_size == 256) {
-          constexpr unsigned BLOCK_SIZE = 256;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        } else if (block_size == 512) {
-          constexpr unsigned BLOCK_SIZE = 512;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        } else {
-          constexpr unsigned BLOCK_SIZE = 1024;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        }
+        smem_size += topk_by_radix_sort<MAX_ITOPK, INDEX_T>::smem_size * sizeof(std::uint32_t);
       }
     }
     RAFT_LOG_DEBUG("# smem_size: %u", smem_size);
@@ -228,13 +204,14 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
   void operator()(raft::resources const& res,
                   raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
                   raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,
-                  INDEX_T* const result_indices_ptr,             // [num_queries, topk]
-                  DISTANCE_T* const result_distances_ptr,        // [num_queries, topk]
-                  const DATA_T* const queries_ptr,               // [num_queries, dataset_dim]
+                  INDEX_T* const result_indices_ptr,       // [num_queries, topk]
+                  DISTANCE_T* const result_distances_ptr,  // [num_queries, topk]
+                  const DATA_T* const queries_ptr,         // [num_queries, dataset_dim]
                   const std::uint32_t num_queries,
                   const INDEX_T* dev_seed_ptr,                   // [num_queries, num_seeds]
                   std::uint32_t* const num_executed_iterations,  // [num_queries]
-                  uint32_t topk)
+                  uint32_t topk,
+                  SAMPLE_FILTER_T sample_filter)
   {
     cudaStream_t stream = resource::get_cuda_stream(res);
     select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(
@@ -261,6 +238,7 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
       search_width,
       min_iterations,
       max_iterations,
+      sample_filter,
       stream);
   }
 };
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
index f7c43fe11c..35d239563a 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
@@ -15,7 +15,9 @@
  */
 #pragma once
 
+#include <raft/neighbors/sample_filter_types.hpp>
 #include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+
 namespace raft::neighbors::cagra::detail {
 namespace single_cta_search {
 
@@ -25,13 +27,14 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
 void select_and_run(  // raft::resources const& res,
   raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
   raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,
-  INDEX_T* const topk_indices_ptr,          // [num_queries, topk]
-  DISTANCE_T* const topk_distances_ptr,     // [num_queries, topk]
-  const DATA_T* const queries_ptr,          // [num_queries, dataset_dim]
+  INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
+  DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
+  const DATA_T* const queries_ptr,       // [num_queries, dataset_dim]
   const uint32_t num_queries,
   const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
   uint32_t* const num_executed_iterations,  // [num_queries,]
@@ -50,50 +53,65 @@ void select_and_run(  // raft::resources const& res,
   size_t search_width,
   size_t min_iterations,
   size_t max_iterations,
+  SAMPLE_FILTER_T sample_filter,
   cudaStream_t stream) RAFT_EXPLICIT;
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
-#define instantiate_single_cta_select_and_run(                                                  \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                                      \
-  extern template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                     \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                          \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t num_itopk_candidates,                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    size_t small_hash_bitlen,                                                                   \
-    size_t small_hash_reset_interval,                                                           \
-    uint32_t num_random_samplings,                                                              \
-    uint64_t rand_xor_mask,                                                                     \
-    uint32_t num_seeds,                                                                         \
-    size_t itopk_size,                                                                          \
-    size_t search_width,                                                                        \
-    size_t min_iterations,                                                                      \
-    size_t max_iterations,                                                                      \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  extern template void                                                                      \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 1024, float, uint32_t, float);
-instantiate_single_cta_select_and_run(8, 128, float, uint32_t, float);
-instantiate_single_cta_select_and_run(16, 256, float, uint32_t, float);
-instantiate_single_cta_select_and_run(32, 512, float, uint32_t, float);
-instantiate_single_cta_select_and_run(32, 1024, int8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(8, 128, int8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(16, 256, int8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(32, 512, int8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(32, 1024, uint8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(8, 128, uint8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(16, 256, uint8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(32, 512, uint8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 1024, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  8, 128, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  16, 256, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 512, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 1024, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  8, 128, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  16, 256, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 512, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 1024, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  8, 128, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  16, 256, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 512, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index 31d9c9fffa..3a5501f545 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -25,6 +25,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/neighbors/sample_filter_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <rmm/device_uvector.hpp>
 #include <vector>
@@ -78,7 +79,7 @@ __device__ void pickup_next_parents(std::uint32_t* const terminate_flag,
     if (new_parent) {
       const auto i = __popc(ballot_mask & ((1 << threadIdx.x) - 1)) + num_new_parents;
       if (i < search_width) {
-        next_parent_indices[i] = index;
+        next_parent_indices[i] = jj;
         // set most significant bit as used node
         internal_topk_indices[jj] |= index_msb_1_mask;
       }
@@ -89,11 +90,12 @@ __device__ void pickup_next_parents(std::uint32_t* const terminate_flag,
   if (threadIdx.x == 0 && (num_new_parents == 0)) { *terminate_flag = 1; }
 }
 
-template <unsigned MAX_CANDIDATES, unsigned MULTI_WARPS = 0, class IdxT = void>
+template <unsigned MAX_CANDIDATES, class IdxT = void>
 __device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances,  // [num_candidates]
                                                 IdxT* candidate_indices,     // [num_candidates]
                                                 const std::uint32_t num_candidates,
-                                                const std::uint32_t num_itopk)
+                                                const std::uint32_t num_itopk,
+                                                unsigned MULTI_WARPS = 0)
 {
   const unsigned lane_id = threadIdx.x % 32;
   const unsigned warp_id = threadIdx.x / 32;
@@ -191,15 +193,16 @@ __device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances,  //
   }
 }
 
-template <unsigned MAX_ITOPK, unsigned MULTI_WARPS = 0, class IdxT = void>
-__device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances,      // [num_itopk]
-                                                IdxT* itopk_indices,         // [num_itopk]
+template <unsigned MAX_ITOPK, class IdxT = void>
+__device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances,  // [num_itopk]
+                                                IdxT* itopk_indices,     // [num_itopk]
                                                 const std::uint32_t num_itopk,
                                                 float* candidate_distances,  // [num_candidates]
                                                 IdxT* candidate_indices,     // [num_candidates]
                                                 const std::uint32_t num_candidates,
                                                 std::uint32_t* work_buf,
-                                                const bool first)
+                                                const bool first,
+                                                unsigned MULTI_WARPS = 0)
 {
   const unsigned lane_id = threadIdx.x % 32;
   const unsigned warp_id = threadIdx.x / 32;
@@ -398,43 +401,45 @@ __device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances,      //
 
 template <unsigned MAX_ITOPK,
           unsigned MAX_CANDIDATES,
-          unsigned MULTI_WARPS_1,
-          unsigned MULTI_WARPS_2,
           class IdxT>
-__device__ void topk_by_bitonic_sort(float* itopk_distances,      // [num_itopk]
-                                     IdxT* itopk_indices,         // [num_itopk]
+__device__ void topk_by_bitonic_sort(float* itopk_distances,  // [num_itopk]
+                                     IdxT* itopk_indices,     // [num_itopk]
                                      const std::uint32_t num_itopk,
                                      float* candidate_distances,  // [num_candidates]
                                      IdxT* candidate_indices,     // [num_candidates]
                                      const std::uint32_t num_candidates,
                                      std::uint32_t* work_buf,
-                                     const bool first)
+                                     const bool first,
+                                     const unsigned MULTI_WARPS_1,
+                                     const unsigned MULTI_WARPS_2)
 {
   // The results in candidate_distances/indices are sorted by bitonic sort.
-  topk_by_bitonic_sort_1st<MAX_CANDIDATES, MULTI_WARPS_1, IdxT>(
-    candidate_distances, candidate_indices, num_candidates, num_itopk);
+  topk_by_bitonic_sort_1st<MAX_CANDIDATES, IdxT>(
+    candidate_distances, candidate_indices, num_candidates, num_itopk, MULTI_WARPS_1);
 
   // The results sorted above are merged with the internal intermediate top-k
   // results so far using bitonic merge.
-  topk_by_bitonic_sort_2nd<MAX_ITOPK, MULTI_WARPS_2, IdxT>(itopk_distances,
-                                                           itopk_indices,
-                                                           num_itopk,
-                                                           candidate_distances,
-                                                           candidate_indices,
-                                                           num_candidates,
-                                                           work_buf,
-                                                           first);
+  topk_by_bitonic_sort_2nd<MAX_ITOPK, IdxT>(itopk_distances,
+                                            itopk_indices,
+                                            num_itopk,
+                                            candidate_distances,
+                                            candidate_indices,
+                                            num_candidates,
+                                            work_buf,
+                                            first,
+                                            MULTI_WARPS_2);
 }
 
-template <unsigned FIRST_TID, unsigned LAST_TID, class INDEX_T>
+template <class INDEX_T>
 __device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr,
                                        const size_t hashmap_bitlen,
                                        const INDEX_T* itopk_indices,
-                                       uint32_t itopk_size)
+                                       const uint32_t itopk_size,
+                                       const uint32_t first_tid = 0)
 {
   constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
-  if (threadIdx.x < FIRST_TID || threadIdx.x >= LAST_TID) return;
-  for (unsigned i = threadIdx.x - FIRST_TID; i < itopk_size; i += LAST_TID - FIRST_TID) {
+  if (threadIdx.x < first_tid) return;
+  for (unsigned i = threadIdx.x - first_tid; i < itopk_size; i += blockDim.x - first_tid) {
     auto key = itopk_indices[i] & ~index_msb_1_mask;  // clear most significant bit
     hashmap::insert(hashmap_ptr, hashmap_bitlen, key);
   }
@@ -450,20 +455,19 @@ __device__ inline void set_value_device(T* const ptr, const T fill, const std::u
 
 // One query one thread block
 template <unsigned TEAM_SIZE,
-          unsigned BLOCK_SIZE,
-          unsigned BLOCK_COUNT,
           unsigned MAX_ITOPK,
           unsigned MAX_CANDIDATES,
           unsigned TOPK_BY_BITONIC_SORT,
           unsigned MAX_DATASET_DIM,
           class DATA_T,
           class DISTANCE_T,
-          class INDEX_T>
-__launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
+          class INDEX_T,
+          class SAMPLE_FILTER_T>
+__launch_bounds__(1024, 1) __global__
   void search_kernel(INDEX_T* const result_indices_ptr,       // [num_queries, top_k]
                      DISTANCE_T* const result_distances_ptr,  // [num_queries, top_k]
                      const std::uint32_t top_k,
-                     const DATA_T* const dataset_ptr,         // [dataset_size, dataset_dim]
+                     const DATA_T* const dataset_ptr,  // [dataset_size, dataset_dim]
                      const std::size_t dataset_dim,
                      const std::size_t dataset_size,
                      const std::size_t dataset_ld,     // stride of dataset
@@ -472,7 +476,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
                      const std::uint32_t graph_degree,
                      const unsigned num_distilation,
                      const uint64_t rand_xor_mask,
-                     const INDEX_T* seed_ptr,             // [num_queries, num_seeds]
+                     const INDEX_T* seed_ptr,  // [num_queries, num_seeds]
                      const uint32_t num_seeds,
                      INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
                      const std::uint32_t internal_topk,
@@ -482,7 +486,8 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
                      std::uint32_t* const num_executed_iterations,  // [num_queries]
                      const std::uint32_t hash_bitlen,
                      const std::uint32_t small_hash_bitlen,
-                     const std::uint32_t small_hash_reset_interval)
+                     const std::uint32_t small_hash_reset_interval,
+                     SAMPLE_FILTER_T sample_filter)
 {
   using LOAD_T        = device::LOAD_128BIT_T;
   const auto query_id = blockIdx.y;
@@ -527,8 +532,11 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
   auto terminate_flag     = reinterpret_cast<std::uint32_t*>(topk_ws + 3);
   auto smem_working_ptr   = reinterpret_cast<std::uint32_t*>(terminate_flag + 1);
 
+  // A flag for filtering.
+  auto filter_flag = terminate_flag;
+
   const DATA_T* const query_ptr = queries_ptr + query_id * dataset_dim;
-  for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += BLOCK_SIZE) {
+  for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += blockDim.x) {
     unsigned j = device::swizzling(i);
     if (i < dataset_dim) {
       query_buffer[j] = spatial::knn::detail::utils::mapping<float>{}(query_ptr[i]);
@@ -548,7 +556,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
   } else {
     local_visited_hashmap_ptr = visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * query_id);
   }
-  hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+  hashmap::init(local_visited_hashmap_ptr, hash_bitlen, 0);
   __syncthreads();
   _CLK_REC(clk_init);
 
@@ -576,7 +584,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
   std::uint32_t iter = 0;
   while (1) {
     // sort
-    if (TOPK_BY_BITONIC_SORT) {
+    if constexpr (TOPK_BY_BITONIC_SORT) {
       // [Notice]
       // It is good to use multiple warps in topk_by_bitonic_sort() when
       // batch size is small (short-latency), but it might not be always good
@@ -584,8 +592,8 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
       // topk_by_bitonic_sort() consists of two operations:
       // if MAX_CANDIDATES is greater than 128, the first operation uses two warps;
       // if MAX_ITOPK is greater than 256, the second operation used two warps.
-      constexpr unsigned multi_warps_1 = ((BLOCK_SIZE >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0;
-      constexpr unsigned multi_warps_2 = ((BLOCK_SIZE >= 64) && (MAX_ITOPK > 256)) ? 1 : 0;
+      const unsigned multi_warps_1 = ((blockDim.x >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0;
+      const unsigned multi_warps_2 = ((blockDim.x >= 64) && (MAX_ITOPK > 256)) ? 1 : 0;
 
       // reset small-hash table.
       if ((iter + 1) % small_hash_reset_interval == 0) {
@@ -594,41 +602,56 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
         // the small hash and whether they are performed in overlap with
         // topk_by_bitonic_sort().
         _CLK_START();
-        if (BLOCK_SIZE == 32) {
-          hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
-        } else if (BLOCK_SIZE == 64) {
+        unsigned hash_start_tid;
+        if (blockDim.x == 32) {
+          hash_start_tid = 0;
+        } else if (blockDim.x == 64) {
           if (multi_warps_1 || multi_warps_2) {
-            hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+            hash_start_tid = 0;
           } else {
-            hashmap::init<32, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+            hash_start_tid = 32;
           }
         } else {
           if (multi_warps_1 || multi_warps_2) {
-            hashmap::init<64, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+            hash_start_tid = 64;
           } else {
-            hashmap::init<32, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+            hash_start_tid = 32;
           }
         }
+        hashmap::init(local_visited_hashmap_ptr, hash_bitlen, hash_start_tid);
         _CLK_REC(clk_reset_hash);
       }
 
       // topk with bitonic sort
       _CLK_START();
-      topk_by_bitonic_sort<MAX_ITOPK, MAX_CANDIDATES, multi_warps_1, multi_warps_2>(
-        result_distances_buffer,
-        result_indices_buffer,
-        internal_topk,
-        result_distances_buffer + internal_topk,
-        result_indices_buffer + internal_topk,
-        search_width * graph_degree,
-        topk_ws,
-        (iter == 0));
+      if (std::is_same<SAMPLE_FILTER_T,
+                       raft::neighbors::filtering::none_cagra_sample_filter>::value ||
+          *filter_flag == 0) {
+        topk_by_bitonic_sort<MAX_ITOPK, MAX_CANDIDATES>(result_distances_buffer,
+                                                        result_indices_buffer,
+                                                        internal_topk,
+                                                        result_distances_buffer + internal_topk,
+                                                        result_indices_buffer + internal_topk,
+                                                        search_width * graph_degree,
+                                                        topk_ws,
+                                                        (iter == 0),
+                                                        multi_warps_1,
+                                                        multi_warps_2);
+        __syncthreads();
+      } else {
+        topk_by_bitonic_sort_1st<MAX_ITOPK + MAX_CANDIDATES>(
+          result_distances_buffer,
+          result_indices_buffer,
+          internal_topk + search_width * graph_degree,
+          internal_topk,
+          false);
+        if (threadIdx.x == 0) { *terminate_flag = 0; }
+      }
       _CLK_REC(clk_topk);
-
     } else {
       _CLK_START();
       // topk with radix block sort
-      topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>{}(
+      topk_by_radix_sort<MAX_ITOPK, INDEX_T>{}(
         internal_topk,
         gridDim.x,
         result_buffer_size,
@@ -645,7 +668,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
       // reset small-hash table
       if ((iter + 1) % small_hash_reset_interval == 0) {
         _CLK_START();
-        hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+        hashmap::init(local_visited_hashmap_ptr, hash_bitlen);
         _CLK_REC(clk_reset_hash);
       }
     }
@@ -667,10 +690,10 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
 
     // restore small-hash table by putting internal-topk indices in it
     if ((iter + 1) % small_hash_reset_interval == 0) {
-      constexpr unsigned first_tid = ((BLOCK_SIZE <= 32) ? 0 : 32);
+      const unsigned first_tid = ((blockDim.x <= 32) ? 0 : 32);
       _CLK_START();
-      hashmap_restore<first_tid, BLOCK_SIZE>(
-        local_visited_hashmap_ptr, hash_bitlen, result_indices_buffer, internal_topk);
+      hashmap_restore(
+        local_visited_hashmap_ptr, hash_bitlen, result_indices_buffer, internal_topk, first_tid);
       _CLK_REC(clk_restore_hash);
     }
     __syncthreads();
@@ -680,26 +703,75 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
     // compute the norms between child nodes and query node
     _CLK_START();
     constexpr unsigned max_n_frags = 16;
-    device::
-      compute_distance_to_child_nodes<TEAM_SIZE, BLOCK_SIZE, MAX_DATASET_DIM, max_n_frags, LOAD_T>(
-        result_indices_buffer + internal_topk,
-        result_distances_buffer + internal_topk,
-        query_buffer,
-        dataset_ptr,
-        dataset_dim,
-        dataset_ld,
-        knn_graph,
-        graph_degree,
-        local_visited_hashmap_ptr,
-        hash_bitlen,
-        parent_list_buffer,
-        search_width);
+    device::compute_distance_to_child_nodes<TEAM_SIZE, MAX_DATASET_DIM, max_n_frags, LOAD_T>(
+      result_indices_buffer + internal_topk,
+      result_distances_buffer + internal_topk,
+      query_buffer,
+      dataset_ptr,
+      dataset_dim,
+      dataset_ld,
+      knn_graph,
+      graph_degree,
+      local_visited_hashmap_ptr,
+      hash_bitlen,
+      parent_list_buffer,
+      result_indices_buffer,
+      search_width);
     __syncthreads();
     _CLK_REC(clk_compute_distance);
 
+    // Filtering
+    if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                                raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+      if (threadIdx.x == 0) { *filter_flag = 0; }
+      __syncthreads();
+
+      constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+      const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
+
+      for (unsigned p = threadIdx.x; p < search_width; p += blockDim.x) {
+        if (parent_list_buffer[p] != invalid_index) {
+          const auto parent_id = result_indices_buffer[parent_list_buffer[p]] & ~index_msb_1_mask;
+          if (!sample_filter(query_id, parent_id)) {
+            // If the parent must not be in the resulting top-k list, remove from the parent list
+            result_distances_buffer[parent_list_buffer[p]] = utils::get_max_value<DISTANCE_T>();
+            result_indices_buffer[parent_list_buffer[p]]   = invalid_index;
+            *filter_flag                                   = 1;
+          }
+        }
+      }
+      __syncthreads();
+    }
+
     iter++;
   }
-  for (std::uint32_t i = threadIdx.x; i < top_k; i += BLOCK_SIZE) {
+
+  // Post process for filtering
+  if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                              raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+    constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+    const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
+
+    for (unsigned i = threadIdx.x; i < internal_topk + search_width * graph_degree;
+         i += blockDim.x) {
+      const auto node_id = result_indices_buffer[i] & ~index_msb_1_mask;
+      if (node_id != (invalid_index & ~index_msb_1_mask) && !sample_filter(query_id, node_id)) {
+        result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
+        result_indices_buffer[i]   = invalid_index;
+      }
+    }
+
+    __syncthreads();
+    topk_by_bitonic_sort_1st<MAX_ITOPK + MAX_CANDIDATES>(
+      result_distances_buffer,
+      result_indices_buffer,
+      internal_topk + search_width * graph_degree,
+      top_k,
+      false);
+    __syncthreads();
+  }
+
+  for (std::uint32_t i = threadIdx.x; i < top_k; i += blockDim.x) {
     unsigned j  = i + (top_k * query_id);
     unsigned ii = i;
     if (TOPK_BY_BITONIC_SORT) { ii = device::swizzling(i); }
@@ -737,36 +809,53 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
 #endif
 }
 
-template <unsigned TEAM_SIZE, unsigned MX_DIM, typename T, typename IdxT, typename DistT>
+template <unsigned TEAM_SIZE,
+          unsigned MX_DIM,
+          typename T,
+          typename IdxT,
+          typename DistT,
+          typename SAMPLE_FILTER_T>
 struct search_kernel_config {
-  using kernel_t = decltype(&search_kernel<TEAM_SIZE, 64, 16, 64, 64, 0, MX_DIM, T, DistT, IdxT>);
+  using kernel_t =
+    decltype(&search_kernel<TEAM_SIZE, 64, 64, 0, MX_DIM, T, DistT, IdxT, SAMPLE_FILTER_T>);
 
-  template <unsigned MAX_ITOPK, unsigned CANDIDATES, unsigned USE_BITONIC_SORT>
-  static auto choose_block_size(unsigned block_size) -> kernel_t
+  template <unsigned MAX_CANDIDATES, unsigned USE_BITONIC_SORT>
+  static auto choose_search_kernel(unsigned itopk_size) -> kernel_t
   {
-    constexpr unsigned BS = USE_BITONIC_SORT;
-    if constexpr (BS) {
-      if (block_size == 64) {
-        return search_kernel<TEAM_SIZE, 64, 16, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else if (block_size == 128) {
-        return search_kernel<TEAM_SIZE, 128, 8, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else if (block_size == 256) {
-        return search_kernel<TEAM_SIZE, 256, 4, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else if (block_size == 512) {
-        return search_kernel<TEAM_SIZE, 512, 2, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else {
-        return search_kernel<TEAM_SIZE, 1024, 1, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      }
-
-    } else {
-      if (block_size == 256) {
-        return search_kernel<TEAM_SIZE, 256, 4, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else if (block_size == 512) {
-        return search_kernel<TEAM_SIZE, 512, 2, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else {
-        return search_kernel<TEAM_SIZE, 1024, 1, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      }
+    if (itopk_size <= 64) {
+      return search_kernel<TEAM_SIZE, 64, MAX_CANDIDATES, USE_BITONIC_SORT, MX_DIM, T, DistT, IdxT, SAMPLE_FILTER_T>;
+    } else if (itopk_size <= 128) {
+      return search_kernel<TEAM_SIZE,
+                           128,
+                           MAX_CANDIDATES,
+                           USE_BITONIC_SORT,
+                           MX_DIM,
+                           T,
+                           DistT,
+                           IdxT,
+                           SAMPLE_FILTER_T>;
+    } else if (itopk_size <= 256) {
+      return search_kernel<TEAM_SIZE,
+                           256,
+                           MAX_CANDIDATES,
+                           USE_BITONIC_SORT,
+                           MX_DIM,
+                           T,
+                           DistT,
+                           IdxT,
+                           SAMPLE_FILTER_T>;
+    } else if (itopk_size <= 512) {
+      return search_kernel<TEAM_SIZE,
+                           512,
+                           MAX_CANDIDATES,
+                           USE_BITONIC_SORT,
+                           MX_DIM,
+                           T,
+                           DistT,
+                           IdxT,
+                           SAMPLE_FILTER_T>;
     }
+    THROW("No kernel for parameters itopk_size %u, max_candidates %u", itopk_size, MAX_CANDIDATES);
   }
 
   static auto choose_itopk_and_mx_candidates(unsigned itopk_size,
@@ -775,45 +864,18 @@ struct search_kernel_config {
   {
     if (num_itopk_candidates <= 64) {
       // use bitonic sort based topk
-      constexpr unsigned max_candidates = 64;
-      if (itopk_size <= 64) {
-        return choose_block_size<64, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 128) {
-        return choose_block_size<128, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 256) {
-        return choose_block_size<256, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 512) {
-        return choose_block_size<512, max_candidates, 1>(block_size);
-      }
+      return choose_search_kernel<64, 1>(itopk_size);
     } else if (num_itopk_candidates <= 128) {
-      constexpr unsigned max_candidates = 128;
-      if (itopk_size <= 64) {
-        return choose_block_size<64, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 128) {
-        return choose_block_size<128, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 256) {
-        return choose_block_size<256, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 512) {
-        return choose_block_size<512, max_candidates, 1>(block_size);
-      }
+      return choose_search_kernel<128, 1>(itopk_size);
     } else if (num_itopk_candidates <= 256) {
-      constexpr unsigned max_candidates = 256;
-      if (itopk_size <= 64) {
-        return choose_block_size<64, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 128) {
-        return choose_block_size<128, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 256) {
-        return choose_block_size<256, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 512) {
-        return choose_block_size<512, max_candidates, 1>(block_size);
-      }
+      return choose_search_kernel<256, 1>(itopk_size);
     } else {
       // Radix-based topk is used
       constexpr unsigned max_candidates = 32;  // to avoid build failure
       if (itopk_size <= 256) {
-        return choose_block_size<256, max_candidates, 0>(block_size);
+        return search_kernel<TEAM_SIZE, 256, max_candidates, 0, MX_DIM, T, DistT, IdxT>;
       } else if (itopk_size <= 512) {
-        return choose_block_size<512, max_candidates, 0>(block_size);
+        return search_kernel<TEAM_SIZE, 512, max_candidates, 0, MX_DIM, T, DistT, IdxT>;
       }
     }
     THROW("No kernel for parametels itopk_size %u, num_itopk_candidates %u",
@@ -826,13 +888,14 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
 void select_and_run(  // raft::resources const& res,
   raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
   raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,
-  INDEX_T* const topk_indices_ptr,          // [num_queries, topk]
-  DISTANCE_T* const topk_distances_ptr,     // [num_queries, topk]
-  const DATA_T* const queries_ptr,          // [num_queries, dataset_dim]
+  INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
+  DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
+  const DATA_T* const queries_ptr,       // [num_queries, dataset_dim]
   const uint32_t num_queries,
   const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
   uint32_t* const num_executed_iterations,  // [num_queries,]
@@ -851,16 +914,18 @@ void select_and_run(  // raft::resources const& res,
   size_t search_width,
   size_t min_iterations,
   size_t max_iterations,
+  SAMPLE_FILTER_T sample_filter,
   cudaStream_t stream)
 {
-  auto kernel = search_kernel_config<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>::
-    choose_itopk_and_mx_candidates(itopk_size, num_itopk_candidates, block_size);
+  auto kernel =
+    search_kernel_config<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::
+      choose_itopk_and_mx_candidates(itopk_size, num_itopk_candidates, block_size);
   RAFT_CUDA_TRY(
     cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
   dim3 thread_dims(block_size, 1, 1);
   dim3 block_dims(1, num_queries, 1);
   RAFT_LOG_DEBUG(
-    "Launching kernel with %u threads, %u block %lu smem", block_size, num_queries, smem_size);
+    "Launching kernel with %u threads, %u block %u smem", block_size, num_queries, smem_size);
   kernel<<<block_dims, thread_dims, smem_size, stream>>>(topk_indices_ptr,
                                                          topk_distances_ptr,
                                                          topk,
@@ -883,7 +948,8 @@ void select_and_run(  // raft::resources const& res,
                                                          num_executed_iterations,
                                                          hash_bitlen,
                                                          small_hash_bitlen,
-                                                         small_hash_reset_interval);
+                                                         small_hash_reset_interval,
+                                                         sample_filter);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 }  // namespace single_cta_search
diff --git a/cpp/include/raft/neighbors/detail/cagra/topk_by_radix.cuh b/cpp/include/raft/neighbors/detail/cagra/topk_by_radix.cuh
index a1b7f930d3..6a6a3cddf4 100644
--- a/cpp/include/raft/neighbors/detail/cagra/topk_by_radix.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/topk_by_radix.cuh
@@ -26,14 +26,11 @@ struct topk_by_radix_sort_base {
   static constexpr std::uint32_t state_bit_lenght = 0;
   static constexpr std::uint32_t vecLen           = 2;  // TODO
 };
-template <unsigned MAX_INTERNAL_TOPK, unsigned BLOCK_SIZE, class IdxT, class = void>
+template <unsigned MAX_INTERNAL_TOPK, class IdxT, class = void>
 struct topk_by_radix_sort : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {};
 
-template <unsigned MAX_INTERNAL_TOPK, unsigned BLOCK_SIZE, class IdxT>
-struct topk_by_radix_sort<MAX_INTERNAL_TOPK,
-                          BLOCK_SIZE,
-                          IdxT,
-                          std::enable_if_t<((MAX_INTERNAL_TOPK <= 64))>>
+template <unsigned MAX_INTERNAL_TOPK, class IdxT>
+struct topk_by_radix_sort<MAX_INTERNAL_TOPK, IdxT, std::enable_if_t<((MAX_INTERNAL_TOPK <= 64))>>
   : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {
   __device__ void operator()(uint32_t topk,
                              uint32_t batch_size,
@@ -48,8 +45,7 @@ struct topk_by_radix_sort<MAX_INTERNAL_TOPK,
                              uint32_t* _smem)
   {
     std::uint8_t* const state = reinterpret_cast<std::uint8_t*>(work);
-    topk_cta_11_core<BLOCK_SIZE,
-                     topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::state_bit_lenght,
+    topk_cta_11_core<topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::state_bit_lenght,
                      topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::vecLen,
                      64,
                      32,
@@ -58,10 +54,9 @@ struct topk_by_radix_sort<MAX_INTERNAL_TOPK,
 };
 
 #define TOP_FUNC_PARTIAL_SPECIALIZATION(V)                                           \
-  template <unsigned MAX_INTERNAL_TOPK, unsigned BLOCK_SIZE, class IdxT>             \
+  template <unsigned MAX_INTERNAL_TOPK, class IdxT>                                  \
   struct topk_by_radix_sort<                                                         \
     MAX_INTERNAL_TOPK,                                                               \
-    BLOCK_SIZE,                                                                      \
     IdxT,                                                                            \
     std::enable_if_t<((MAX_INTERNAL_TOPK <= V) && (2 * MAX_INTERNAL_TOPK > V))>>     \
     : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {                                   \
@@ -77,10 +72,9 @@ struct topk_by_radix_sort<MAX_INTERNAL_TOPK,
                                bool sort,                                            \
                                uint32_t* _smem)                                      \
     {                                                                                \
-      assert(BLOCK_SIZE >= V / 4);                                                   \
+      assert(blockDim.x >= V / 4);                                                   \
       std::uint8_t* state = (std::uint8_t*)work;                                     \
-      topk_cta_11_core<BLOCK_SIZE,                                                   \
-                       topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::state_bit_lenght, \
+      topk_cta_11_core<topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::state_bit_lenght, \
                        topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::vecLen,           \
                        V,                                                            \
                        V / 4,                                                        \
diff --git a/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
index dd73558f86..fd4aeb9bb3 100644
--- a/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
@@ -22,8 +22,6 @@
 #include <stdio.h>
 
 namespace raft::neighbors::cagra::detail {
-using namespace cub;
-
 //
 __device__ inline uint32_t convert(uint32_t x)
 {
@@ -174,8 +172,46 @@ __device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, i
   return xi;
 }
 
+// Block-wide inclusive prefix sum over one value per thread (cub::BlockScan::InclusiveSum):
+// each thread passes `input` and receives its scanned value in `output`.
+// cub::BlockScan takes the block width as a template argument, so the runtime blockDim.x is
+// dispatched to a fixed set of instantiations (32 ... 1024 threads), each with its own
+// statically declared shared-memory scratch. For any other blockDim.x no scan is performed
+// and `output` is left untouched.
+template <typename T>
+__device__ inline void block_scan(const T input, T& output)
+{
+  switch (blockDim.x) {
+    case 32: {
+      __shared__ typename cub::BlockScan<T, 32>::TempStorage scratch;
+      cub::BlockScan<T, 32>(scratch).InclusiveSum(input, output);
+    } break;
+    case 64: {
+      __shared__ typename cub::BlockScan<T, 64>::TempStorage scratch;
+      cub::BlockScan<T, 64>(scratch).InclusiveSum(input, output);
+    } break;
+    case 128: {
+      __shared__ typename cub::BlockScan<T, 128>::TempStorage scratch;
+      cub::BlockScan<T, 128>(scratch).InclusiveSum(input, output);
+    } break;
+    case 256: {
+      __shared__ typename cub::BlockScan<T, 256>::TempStorage scratch;
+      cub::BlockScan<T, 256>(scratch).InclusiveSum(input, output);
+    } break;
+    case 512: {
+      __shared__ typename cub::BlockScan<T, 512>::TempStorage scratch;
+      cub::BlockScan<T, 512>(scratch).InclusiveSum(input, output);
+    } break;
+    case 1024: {
+      __shared__ typename cub::BlockScan<T, 1024>::TempStorage scratch;
+      cub::BlockScan<T, 1024>(scratch).InclusiveSum(input, output);
+    } break;
+    default: break;
+  }
+}
+
 //
-template <typename T, int blockDim_x, int stateBitLen, int vecLen>
+template <typename T, int stateBitLen, int vecLen>
 __device__ inline void update_histogram(int itr,
                                         uint32_t thread_id,
                                         uint32_t num_threads,
@@ -183,9 +219,9 @@ __device__ inline void update_histogram(int itr,
                                         uint32_t threshold,
                                         uint32_t& num_bins,
                                         uint32_t& shift,
-                                        const T* x,        // [nx,]
+                                        const T* x,  // [nx,]
                                         uint32_t nx,
-                                        uint32_t* hist,    // [num_bins]
+                                        uint32_t* hist,  // [num_bins]
                                         uint8_t* state,
                                         uint32_t* output,  // [topk]
                                         uint32_t* output_count)
@@ -220,7 +256,7 @@ __device__ inline void update_histogram(int itr,
     return;
   }
   if (itr > 0) {
-    for (int i = threadIdx.x; i < num_bins; i += blockDim_x) {
+    for (int i = threadIdx.x; i < num_bins; i += blockDim.x) {
       hist[i] = 0;
     }
     __syncthreads();
@@ -285,8 +321,53 @@ __device__ inline void update_histogram(int itr,
   __syncthreads();
 }
 
+// Scans hist[] block-wide (several bins per thread) and records this thread's best bin, if any.
+template <unsigned blockDim_x>  // callers instantiate this with the runtime blockDim.x
+__device__ inline void select_best_index_for_next_threshold_core(uint32_t& my_index,
+                                                                 uint32_t& my_csum,
+                                                                 const unsigned num_bins,
+                                                                 const uint32_t* const hist,
+                                                                 const uint32_t nx_below_threshold,
+                                                                 const uint32_t max_threshold,
+                                                                 const uint32_t threshold,
+                                                                 const uint32_t shift,
+                                                                 const uint32_t topk)
+{
+  __shared__ typename cub::BlockScan<uint32_t, blockDim_x>::TempStorage scratch;
+  if (num_bins == 2048) {
+    constexpr int bins_per_thread = 2048 / blockDim_x;  // consecutive bins owned by one thread
+    uint32_t prefix[bins_per_thread];
+    for (int i = 0; i < bins_per_thread; ++i) {
+      prefix[i] = hist[i + (bins_per_thread * threadIdx.x)];
+    }
+    cub::BlockScan<uint32_t, blockDim_x>(scratch).InclusiveSum(prefix, prefix);
+    for (int i = bins_per_thread - 1; i >= 0; --i) {  // highest bin first
+      const uint32_t bin   = i + (bins_per_thread * threadIdx.x);
+      const bool fits_topk = (nx_below_threshold + prefix[i] <= topk);
+      if (!fits_topk || (threshold + (bin << shift) > max_threshold)) { continue; }
+      my_index = bin;
+      my_csum  = prefix[i];
+      break;
+    }
+  } else if (num_bins == 1024) {
+    constexpr int bins_per_thread = 1024 / blockDim_x;  // consecutive bins owned by one thread
+    uint32_t prefix[bins_per_thread];
+    for (int i = 0; i < bins_per_thread; ++i) {
+      prefix[i] = hist[i + (bins_per_thread * threadIdx.x)];
+    }
+    cub::BlockScan<uint32_t, blockDim_x>(scratch).InclusiveSum(prefix, prefix);
+    for (int i = bins_per_thread - 1; i >= 0; --i) {  // highest bin first
+      const uint32_t bin   = i + (bins_per_thread * threadIdx.x);
+      const bool fits_topk = (nx_below_threshold + prefix[i] <= topk);
+      if (!fits_topk || (threshold + (bin << shift) > max_threshold)) { continue; }
+      my_index = bin;
+      my_csum  = prefix[i];
+      break;
+    }
+  }  // any other num_bins: my_index and my_csum keep the values passed in
+}
+
 //
-template <int blockDim_x>
 __device__ inline void select_best_index_for_next_threshold(
   const uint32_t topk,
   const uint32_t threshold,
@@ -302,15 +383,12 @@ __device__ inline void select_best_index_for_next_threshold(
   // index under the condition that the sum of the number of elements found
   // so far ('nx_below_threshold') and the csum value does not exceed the
   // topk value.
-  typedef BlockScan<uint32_t, blockDim_x> BlockScanT;
-  __shared__ typename BlockScanT::TempStorage temp_storage;
-
   uint32_t my_index = 0xffffffff;
   uint32_t my_csum  = 0;
-  if (num_bins <= blockDim_x) {
+  if (num_bins <= blockDim.x) {
     uint32_t csum = 0;
     if (threadIdx.x < num_bins) { csum = hist[threadIdx.x]; }
-    BlockScanT(temp_storage).InclusiveSum(csum, csum);
+    detail::block_scan(csum, csum);
     if (threadIdx.x < num_bins) {
       const uint32_t index = threadIdx.x;
       if ((nx_below_threshold + csum <= topk) && (threshold + (index << shift) <= max_threshold)) {
@@ -319,36 +397,62 @@ __device__ inline void select_best_index_for_next_threshold(
       }
     }
   } else {
-    if (num_bins == 2048) {
-      constexpr int n_data = 2048 / blockDim_x;
-      uint32_t csum[n_data];
-      for (int i = 0; i < n_data; i++) {
-        csum[i] = hist[i + (n_data * threadIdx.x)];
-      }
-      BlockScanT(temp_storage).InclusiveSum(csum, csum);
-      for (int i = n_data - 1; i >= 0; i--) {
-        if (nx_below_threshold + csum[i] > topk) continue;
-        const uint32_t index = i + (n_data * threadIdx.x);
-        if (threshold + (index << shift) > max_threshold) continue;
-        my_index = index;
-        my_csum  = csum[i];
+    switch (blockDim.x) {
+      case 64:
+        select_best_index_for_next_threshold_core<64>(my_index,
+                                                      my_csum,
+                                                      num_bins,
+                                                      hist,
+                                                      nx_below_threshold,
+                                                      max_threshold,
+                                                      threshold,
+                                                      shift,
+                                                      topk);
         break;
-      }
-    } else if (num_bins == 1024) {
-      constexpr int n_data = 1024 / blockDim_x;
-      uint32_t csum[n_data];
-      for (int i = 0; i < n_data; i++) {
-        csum[i] = hist[i + (n_data * threadIdx.x)];
-      }
-      BlockScanT(temp_storage).InclusiveSum(csum, csum);
-      for (int i = n_data - 1; i >= 0; i--) {
-        if (nx_below_threshold + csum[i] > topk) continue;
-        const uint32_t index = i + (n_data * threadIdx.x);
-        if (threshold + (index << shift) > max_threshold) continue;
-        my_index = index;
-        my_csum  = csum[i];
+      case 128:
+        select_best_index_for_next_threshold_core<128>(my_index,
+                                                       my_csum,
+                                                       num_bins,
+                                                       hist,
+                                                       nx_below_threshold,
+                                                       max_threshold,
+                                                       threshold,
+                                                       shift,
+                                                       topk);
+        break;
+      case 256:
+        select_best_index_for_next_threshold_core<256>(my_index,
+                                                       my_csum,
+                                                       num_bins,
+                                                       hist,
+                                                       nx_below_threshold,
+                                                       max_threshold,
+                                                       threshold,
+                                                       shift,
+                                                       topk);
+        break;
+      case 512:
+        select_best_index_for_next_threshold_core<512>(my_index,
+                                                       my_csum,
+                                                       num_bins,
+                                                       hist,
+                                                       nx_below_threshold,
+                                                       max_threshold,
+                                                       threshold,
+                                                       shift,
+                                                       topk);
+        break;
+      case 1024:
+        select_best_index_for_next_threshold_core<1024>(my_index,
+                                                        my_csum,
+                                                        num_bins,
+                                                        hist,
+                                                        nx_below_threshold,
+                                                        max_threshold,
+                                                        threshold,
+                                                        shift,
+                                                        topk);
         break;
-      }
     }
   }
   if (threadIdx.x < num_bins) {
@@ -481,10 +585,14 @@ __device__ inline uint32_t max_value_of<uint32_t>()
   return ~0u;
 }
 
-template <int blockDim_x, int stateBitLen>
+template <int stateBitLen, unsigned BLOCK_SIZE = 0>
 __device__ __host__ inline uint32_t get_state_size(uint32_t len_x)
 {
-  const uint32_t num_threads = blockDim_x;
+#ifdef __CUDA_ARCH__
+  const uint32_t num_threads = blockDim.x;
+#else
+  const uint32_t num_threads = BLOCK_SIZE;
+#endif
   if (stateBitLen == 8) {
     uint32_t numElements_perThread = (len_x + num_threads - 1) / num_threads;
     uint32_t numState_perThread    = (numElements_perThread + stateBitLen - 1) / stateBitLen;
@@ -494,7 +602,7 @@ __device__ __host__ inline uint32_t get_state_size(uint32_t len_x)
 }
 
 //
-template <int blockDim_x, int stateBitLen, int vecLen, int maxTopk, int numSortThreads, class ValT>
+template <int stateBitLen, int vecLen, int maxTopk, int numSortThreads, class ValT>
 __device__ inline void topk_cta_11_core(uint32_t topk,
                                         uint32_t len_x,
                                         const uint32_t* _x,    // [size_batch, ld_x,]
@@ -511,7 +619,7 @@ __device__ inline void topk_cta_11_core(uint32_t topk,
   uint32_t* const best_index    = &(_smem[2 * maxTopk + 2048]);
   uint32_t* const best_csum     = &(_smem[2 * maxTopk + 2048 + 3]);
 
-  const uint32_t num_threads = blockDim_x;
+  const uint32_t num_threads = blockDim.x;
   const uint32_t thread_id   = threadIdx.x;
   uint32_t nx                = len_x;
   const uint32_t* const x    = _x;
@@ -541,29 +649,29 @@ __device__ inline void topk_cta_11_core(uint32_t topk,
   for (int j = 0; j < 3; j += 1) {
     uint32_t num_bins;
     uint32_t shift;
-    update_histogram<uint32_t, blockDim_x, stateBitLen, vecLen>(j,
-                                                                thread_id,
-                                                                num_threads,
-                                                                hint,
-                                                                threshold,
-                                                                num_bins,
-                                                                shift,
-                                                                x,
-                                                                nx,
-                                                                hist,
-                                                                state,
-                                                                smem_out_vals,
-                                                                output_count);
-
-    select_best_index_for_next_threshold<blockDim_x>(topk,
-                                                     threshold,
-                                                     hint,
-                                                     nx_below_threshold,
-                                                     num_bins,
-                                                     shift,
-                                                     hist,
-                                                     best_index + j,
-                                                     best_csum + j);
+
+    update_histogram<uint32_t, stateBitLen, vecLen>(j,
+                                                    thread_id,
+                                                    num_threads,
+                                                    hint,
+                                                    threshold,
+                                                    num_bins,
+                                                    shift,
+                                                    x,
+                                                    nx,
+                                                    hist,
+                                                    state,
+                                                    smem_out_vals,
+                                                    output_count);
+    select_best_index_for_next_threshold(topk,
+                                         threshold,
+                                         hint,
+                                         nx_below_threshold,
+                                         num_bins,
+                                         shift,
+                                         hist,
+                                         best_index + j,
+                                         best_csum + j);
 
     threshold += (best_index[j] << shift);
     nx_below_threshold += best_csum[j];
@@ -601,7 +709,7 @@ __device__ inline void topk_cta_11_core(uint32_t topk,
 #endif
 
   if (!sort) {
-    for (int k = thread_id; k < topk; k += blockDim_x) {
+    for (int k = thread_id; k < topk; k += blockDim.x) {
       const uint32_t i = smem_out_vals[k];
       if (y) { y[k] = x[i]; }
       if (out_vals) {
@@ -756,21 +864,21 @@ int _get_vecLen(uint32_t maxSamples, int maxVecLen = MAX_VEC_LENGTH)
 }
 }  // unnamed namespace
 
-template <int blockDim_x, int stateBitLen, int vecLen, int maxTopk, int numSortThreads, class ValT>
+template <int stateBitLen, int vecLen, int maxTopk, int numSortThreads, class ValT>
 __launch_bounds__(1024, 1) __global__
   void kern_topk_cta_11(uint32_t topk,
                         uint32_t size_batch,
                         uint32_t len_x,
-                        const uint32_t* _x,    // [size_batch, ld_x,]
+                        const uint32_t* _x,  // [size_batch, ld_x,]
                         uint32_t ld_x,
                         const ValT* _in_vals,  // [size_batch, ld_iv,]
                         uint32_t ld_iv,
-                        uint32_t* _y,          // [size_batch, ld_y,]
+                        uint32_t* _y,  // [size_batch, ld_y,]
                         uint32_t ld_y,
-                        ValT* _out_vals,       // [size_batch, ld_ov,]
+                        ValT* _out_vals,  // [size_batch, ld_ov,]
                         uint32_t ld_ov,
-                        uint8_t* _state,       // [size_batch, ...,]
-                        uint32_t* _hints,      // [size_batch,]
+                        uint8_t* _state,   // [size_batch, ...,]
+                        uint32_t* _hints,  // [size_batch,]
                         bool sort)
 {
   const uint32_t i_batch = blockIdx.x;
@@ -781,14 +889,14 @@ __launch_bounds__(1024, 1) __global__
                 "maxTopk * sizeof(ValT) must be smaller or equal to 8192 byte");
   __shared__ uint32_t _smem[smem_len];
 
-  topk_cta_11_core<blockDim_x, stateBitLen, vecLen, maxTopk, numSortThreads, ValT>(
+  topk_cta_11_core<stateBitLen, vecLen, maxTopk, numSortThreads, ValT>(
     topk,
     len_x,
     (_x == NULL ? NULL : _x + i_batch * ld_x),
     (_in_vals == NULL ? NULL : _in_vals + i_batch * ld_iv),
     (_y == NULL ? NULL : _y + i_batch * ld_y),
     (_out_vals == NULL ? NULL : _out_vals + i_batch * ld_ov),
-    (_state == NULL ? NULL : _state + i_batch * get_state_size<blockDim_x, stateBitLen>(len_x)),
+    (_state == NULL ? NULL : _state + i_batch * get_state_size<stateBitLen>(len_x)),
     (_hints == NULL ? NULL : _hints + i_batch),
     sort,
     _smem);
@@ -808,7 +916,7 @@ size_t inline _cuann_find_topk_bufferSize(uint32_t topK,
   // state
   if (stateBitLen == 8) {
     workspaceSize = _cuann_aligned(
-      sizeof(uint8_t) * get_state_size<numThreads, stateBitLen>(numElements) * sizeBatch);
+      sizeof(uint8_t) * get_state_size<stateBitLen, numThreads>(numElements) * sizeBatch);
   }
 
   return workspaceSize;
@@ -862,12 +970,12 @@ inline void _cuann_find_topk(uint32_t topK,
                      bool) = nullptr;
 
   // V:vecLen, K:maxTopk, T:numSortThreads
-#define SET_KERNEL_VKT(V, K, T, ValT)                                      \
-  do {                                                                     \
-    assert(numThreads >= T);                                               \
-    assert((K % T) == 0);                                                  \
-    assert((K / T) <= 4);                                                  \
-    cta_kernel = kern_topk_cta_11<numThreads, stateBitLen, V, K, T, ValT>; \
+#define SET_KERNEL_VKT(V, K, T, ValT)                          \
+  do {                                                         \
+    assert(numThreads >= T);                                   \
+    assert((K % T) == 0);                                      \
+    assert((K / T) <= 4);                                      \
+    cta_kernel = kern_topk_cta_11<stateBitLen, V, K, T, ValT>; \
   } while (0)
 
   // V: vecLen
diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
index 22c7a60647..22cbe6bbac 100644
--- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp
+++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
@@ -20,6 +20,8 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
 #include <type_traits>
 
 namespace raft::neighbors::cagra::detail {
@@ -150,4 +152,97 @@ struct gen_index_msb_1_mask {
 };
 }  // namespace utils
 
+/**
+ * Presents a host_matrix_view as a device_matrix_view, copying only when it has to.
+ *
+ * Host memory can already be addressable from the device (UVM/HMM/ATS). The constructor asks
+ * cudaPointerGetAttributes for a device pointer to the host data: a non-null answer is used
+ * as-is, which avoids a second copy when memory is tight; otherwise a device matrix with the
+ * same extents is allocated and filled by raft::copy on the CUDA stream taken from `res`.
+ *
+ * TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer
+ * have been merged, we should remove this class and switch over to using mdbuffer for this
+ */
+template <typename T, typename IdxT>
+class device_matrix_view_from_host {
+ public:
+  device_matrix_view_from_host(raft::resources const& res, host_matrix_view<T, IdxT> host_view)
+    : host_view_(host_view)
+  {
+    cudaPointerAttributes attributes;
+    RAFT_CUDA_TRY(cudaPointerGetAttributes(&attributes, host_view.data_handle()));
+    device_ptr = static_cast<T*>(attributes.devicePointer);
+    if (device_ptr != nullptr) { return; }  // already reachable from the device
+
+    // Not reachable: allocate a device matrix with the same extents and copy the data over.
+    device_mem_.emplace(
+      raft::make_device_matrix<T, IdxT>(res, host_view.extent(0), host_view.extent(1)));
+    device_ptr = device_mem_->data_handle();
+    raft::copy(device_ptr,
+               host_view.data_handle(),
+               host_view.extent(0) * host_view.extent(1),
+               resource::get_cuda_stream(res));
+  }
+
+  device_matrix_view<T, IdxT> view()
+  {
+    return make_device_matrix_view<T, IdxT>(device_ptr, host_view_.extent(0), host_view_.extent(1));
+  }
+
+  T* data_handle() { return device_ptr; }  // pointer usable on the device
+
+  bool allocated_memory() const { return device_mem_.has_value(); }  // true iff a copy was made
+
+ private:
+  std::optional<device_matrix<T, IdxT>> device_mem_;  // engaged only when a copy was made
+  host_matrix_view<T, IdxT> host_view_;               // source view; supplies the extents
+  T* device_ptr;                                      // what view() and data_handle() expose
+};
+
+/**
+ * Presents a device_matrix_view as a host_matrix_view, copying only when it has to.
+ *
+ * Device memory can already be addressable from the host (UVM/HMM/ATS). The constructor asks
+ * cudaPointerGetAttributes for a host pointer to the device data: a non-null answer is used
+ * as-is, which avoids a second copy when memory is tight; otherwise a host matrix with the same
+ * extents is allocated and filled by raft::copy on the CUDA stream taken from `res`.
+ * NOTE(review): no stream synchronization happens here - confirm the copy is complete on use.
+ * TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer
+ * have been merged, we should remove this class and switch over to using mdbuffer for this
+ */
+template <typename T, typename IdxT>
+class host_matrix_view_from_device {
+ public:
+  host_matrix_view_from_device(raft::resources const& res, device_matrix_view<T, IdxT> device_view)
+    : device_view_(device_view)
+  {
+    cudaPointerAttributes attributes;
+    RAFT_CUDA_TRY(cudaPointerGetAttributes(&attributes, device_view.data_handle()));
+    host_ptr = static_cast<T*>(attributes.hostPointer);
+    if (host_ptr != nullptr) { return; }  // already reachable from the host
+
+    // Not reachable: allocate a host matrix with the same extents and copy the data over.
+    host_mem_.emplace(
+      raft::make_host_matrix<T, IdxT>(device_view.extent(0), device_view.extent(1)));
+    host_ptr = host_mem_->data_handle();
+    raft::copy(host_ptr,
+               device_view.data_handle(),
+               device_view.extent(0) * device_view.extent(1),
+               resource::get_cuda_stream(res));
+  }
+
+  host_matrix_view<T, IdxT> view()
+  {
+    return make_host_matrix_view<T, IdxT>(host_ptr, device_view_.extent(0), device_view_.extent(1));
+  }
+
+  T* data_handle() { return host_ptr; }  // pointer usable on the host
+
+  bool allocated_memory() const { return host_mem_.has_value(); }  // true iff a copy was made
+
+ private:
+  std::optional<host_matrix<T, IdxT>> host_mem_;  // engaged only when a copy was made
+  device_matrix_view<T, IdxT> device_view_;       // source view; supplies the extents
+  T* host_ptr;                                    // what view() and data_handle() expose
+};
 }  // namespace raft::neighbors::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
index 93eeb0dead..c0f856103a 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/core/logger.hpp>                                 // RAFT_LOG_TRACE
+#include <raft/core/logger.hpp>  // RAFT_LOG_TRACE
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>                              // raft::resources
 #include <raft/distance/distance_types.hpp>                     // is_min_close, DistanceType
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 199cb74fbe..47c10de200 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -220,7 +220,7 @@ void select_residuals(raft::resources const& handle,
 template <typename T, typename IdxT>
 void flat_compute_residuals(
   raft::resources const& handle,
-  float* residuals,                                                      // [n_rows, rot_dim]
+  float* residuals,  // [n_rows, rot_dim]
   IdxT n_rows,
   device_matrix_view<const float, uint32_t, row_major> rotation_matrix,  // [rot_dim, dim]
   device_matrix_view<const float, uint32_t, row_major> centers,          // [n_lists, dim_ext]
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
index 2ab216b13b..7c5b523a8b 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
@@ -898,7 +898,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props,
     }
 
     {
-      if (selected_perf.occupancy <= 0.0                 // no candidate yet
+      if (selected_perf.occupancy <= 0.0  // no candidate yet
           || (selected_perf.occupancy < cur.occupancy * kTargetOccupancy &&
               selected_perf.shmem_use >= cur.shmem_use)  // much improved occupancy
       ) {
diff --git a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
index 123a902ef9..be05d5545f 100644
--- a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
+++ b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
@@ -64,10 +64,11 @@ void tiled_brute_force_knn(const raft::resources& handle,
                            ElementType* distances,  // size (m, k)
                            IndexType* indices,      // size (m, k)
                            raft::distance::DistanceType metric,
-                           float metric_arg                   = 2.0,
-                           size_t max_row_tile_size           = 0,
-                           size_t max_col_tile_size           = 0,
-                           DistanceEpilogue distance_epilogue = raft::identity_op())
+                           float metric_arg                           = 2.0,
+                           size_t max_row_tile_size                   = 0,
+                           size_t max_col_tile_size                   = 0,
+                           DistanceEpilogue distance_epilogue         = raft::identity_op(),
+                           const ElementType* precomputed_index_norms = nullptr)
 {
   // Figure out the number of rows/cols to tile for
   size_t tile_rows   = 0;
@@ -97,7 +98,7 @@ void tiled_brute_force_knn(const raft::resources& handle,
       metric == raft::distance::DistanceType::L2SqrtExpanded ||
       metric == raft::distance::DistanceType::CosineExpanded) {
     search_norms.resize(m, stream);
-    index_norms.resize(n, stream);
+    if (!precomputed_index_norms) { index_norms.resize(n, stream); }
     // cosine needs the l2norm, where as l2 distances needs the squared norm
     if (metric == raft::distance::DistanceType::CosineExpanded) {
       raft::linalg::rowNorm(search_norms.data(),
@@ -108,19 +109,24 @@ void tiled_brute_force_knn(const raft::resources& handle,
                             true,
                             stream,
                             raft::sqrt_op{});
-      raft::linalg::rowNorm(index_norms.data(),
-                            index,
-                            d,
-                            n,
-                            raft::linalg::NormType::L2Norm,
-                            true,
-                            stream,
-                            raft::sqrt_op{});
+      if (!precomputed_index_norms) {
+        raft::linalg::rowNorm(index_norms.data(),
+                              index,
+                              d,
+                              n,
+                              raft::linalg::NormType::L2Norm,
+                              true,
+                              stream,
+                              raft::sqrt_op{});
+      }
     } else {
       raft::linalg::rowNorm(
         search_norms.data(), search, d, m, raft::linalg::NormType::L2Norm, true, stream);
-      raft::linalg::rowNorm(
-        index_norms.data(), index, d, n, raft::linalg::NormType::L2Norm, true, stream);
+
+      if (!precomputed_index_norms) {
+        raft::linalg::rowNorm(
+          index_norms.data(), index, d, n, raft::linalg::NormType::L2Norm, true, stream);
+      }
     }
     pairwise_metric = raft::distance::DistanceType::InnerProduct;
   }
@@ -178,7 +184,7 @@ void tiled_brute_force_knn(const raft::resources& handle,
       if (metric == raft::distance::DistanceType::L2Expanded ||
           metric == raft::distance::DistanceType::L2SqrtExpanded) {
         auto row_norms = search_norms.data();
-        auto col_norms = index_norms.data();
+        auto col_norms = precomputed_index_norms ? precomputed_index_norms : index_norms.data();
         auto dist      = temp_distances.data();
 
         raft::linalg::map_offset(
@@ -200,7 +206,7 @@ void tiled_brute_force_knn(const raft::resources& handle,
           });
       } else if (metric == raft::distance::DistanceType::CosineExpanded) {
         auto row_norms = search_norms.data();
-        auto col_norms = index_norms.data();
+        auto col_norms = precomputed_index_norms ? precomputed_index_norms : index_norms.data();
         auto dist      = temp_distances.data();
 
         raft::linalg::map_offset(
@@ -330,7 +336,8 @@ void brute_force_knn_impl(
   std::vector<IdxType>* translations  = nullptr,
   raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
   float metricArg                     = 0,
-  DistanceEpilogue distance_epilogue  = raft::identity_op())
+  DistanceEpilogue distance_epilogue  = raft::identity_op(),
+  std::vector<value_t*>* input_norms  = nullptr)
 {
   auto userStream = resource::get_cuda_stream(handle);
 
@@ -424,7 +431,8 @@ void brute_force_knn_impl(
                  rowMajorIndex,
                  rowMajorQuery,
                  stream,
-                 metric);
+                 metric,
+                 input_norms ? (*input_norms)[i] : nullptr);
 
       // Perform necessary post-processing
       if (metric == raft::distance::DistanceType::L2SqrtExpanded ||
@@ -473,7 +481,8 @@ void brute_force_knn_impl(
                                                   metricArg,
                                                   0,
                                                   0,
-                                                  distance_epilogue);
+                                                  distance_epilogue,
+                                                  input_norms ? (*input_norms)[i] : nullptr);
           break;
       }
     }
diff --git a/cpp/include/raft/neighbors/detail/nn_descent.cuh b/cpp/include/raft/neighbors/detail/nn_descent.cuh
new file mode 100644
index 0000000000..3e4d0409bd
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/nn_descent.cuh
@@ -0,0 +1,1453 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <mma.h>
+#include <omp.h>
+
+#include <cub/cub.cuh>
+#include <limits>
+#include <queue>
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/execution_policy.h>
+#include <thrust/fill.h>
+#include <thrust/host_vector.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/device_memory_resource.h>
+
+#include "../nn_descent_types.hpp"
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/error.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/neighbors/detail/cagra/device_common.hpp>
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+#include <raft/util/arch.cuh>  // raft::util::arch::SM_*
+#include <raft/util/cuda_dev_essentials.cuh>
+#include <raft/util/cuda_rt_essentials.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <raft/util/pow2_utils.cuh>
+
+namespace raft::neighbors::experimental::nn_descent::detail {
+
+using pinned_memory_resource = thrust::universal_host_pinned_memory_resource;  // pinned host
+template <typename T>
+using pinned_memory_allocator = thrust::mr::stateless_resource_allocator<T, pinned_memory_resource>;
+// Element type of every distance value (ResultItem::dist_, the host/device distance buffers).
+using DistData_t = float;
+constexpr int DEGREE_ON_DEVICE{32};   // NOTE(review): presumably the per-node degree kept on GPU
+constexpr int SEGMENT_SIZE{32};       // NOTE(review): presumably entries per list segment
+constexpr int counter_interval{100};  // NOTE(review): use site not visible here - confirm
+template <typename Index_t>
+struct InternalID_t;
+
+// Node id that also carries a new/old mark in its sign: a "new" id is stored unchanged
+// (>= 0), an "old" id is stored as -(id + 1) (< 0). Default value: numeric_limits max.
+template <>
+class InternalID_t<int> {
+  using Index_t = int;
+  Index_t id_{std::numeric_limits<Index_t>::max()};
+
+ public:
+  __host__ __device__ bool is_new() const { return id_ >= 0; }  // non-negative storage == new
+  __host__ __device__ Index_t& id_with_flag() { return id_; }   // raw storage, mark included
+  __host__ __device__ Index_t id() const
+  {
+    // Strip the mark: old ids are decoded back from -(id + 1).
+    return is_new() ? id_ : -id_ - 1;
+  }
+  __host__ __device__ void mark_old()
+  {
+    if (is_new()) { id_ = -id_ - 1; }  // already-old ids are left untouched
+  }
+  __host__ __device__ bool operator==(const InternalID_t<int>& other) const
+  {
+    return other.id() == id();  // equality ignores the new/old mark
+  }
+};
+
+template <typename Index_t>
+struct ResultItem;
+
+// (id, distance) pair; like InternalID_t, a negative stored id encodes an "old" entry.
+template <>
+class ResultItem<int> {
+  using Index_t = int;
+  Index_t id_;
+  DistData_t dist_;
+
+ public:
+  __host__ __device__ ResultItem()
+    : id_(std::numeric_limits<Index_t>::max()), dist_(std::numeric_limits<DistData_t>::max()) {}
+  __host__ __device__ ResultItem(const Index_t id_with_flag, const DistData_t dist)
+    : id_(id_with_flag), dist_(dist) {}
+  __host__ __device__ bool is_new() const { return id_ >= 0; }
+  __host__ __device__ Index_t& id_with_flag() { return id_; }
+  __host__ __device__ Index_t id() const
+  {
+    // Old entries are stored as -(id + 1); decode them back to the plain id.
+    return is_new() ? id_ : -id_ - 1;
+  }
+  __host__ __device__ DistData_t& dist() { return dist_; }
+
+  __host__ __device__ void mark_old()
+  {
+    if (is_new()) { id_ = -id_ - 1; }
+  }
+
+  __host__ __device__ bool operator<(const ResultItem<Index_t>& other) const
+  {
+    // Order by distance; equal distances are ordered by decoded id.
+    return (dist_ == other.dist_) ? (id() < other.id()) : (dist_ < other.dist_);
+  }
+  __host__ __device__ bool operator==(const ResultItem<Index_t>& other) const
+  {
+    return other.id() == id();  // identity is the decoded id only; distance is ignored
+  }
+  __host__ __device__ bool operator>=(const ResultItem<Index_t>& other) const
+  {
+    return !operator<(other);
+  }
+  __host__ __device__ bool operator<=(const ResultItem<Index_t>& other) const
+  {
+    return operator==(other) || operator<(other);
+  }
+  __host__ __device__ bool operator>(const ResultItem<Index_t>& other) const
+  {
+    return !operator<=(other);
+  }
+  __host__ __device__ bool operator!=(const ResultItem<Index_t>& other) const
+  {
+    return !operator==(other);
+  }
+};
+
+using align32 = raft::Pow2<32>;
+
+template <typename T>
+int get_batch_size(const int it_now, const T nrow, const int batch_size)
+{
+  const int last_it = static_cast<int>(ceildiv(nrow, batch_size)) - 1;  // index of final batch
+  return (it_now != last_it) ? batch_size : nrow - it_now * batch_size;  // remainder on the last
+}
+
+// Pad a row length to avoid bank conflicts: round up to a multiple of 4, then add 4 more
+// when the rounded length is a multiple of 32. Only T = float is supported.
+template <typename T>
+constexpr __host__ __device__ __forceinline__ int skew_dim(int ndim)
+{
+  static_assert(std::is_same<T, float>::value, "skew_dim is only implemented for float");
+  // all "4"s are for alignment
+  ndim = ceildiv(ndim, 4) * 4;
+  return ndim + (ndim % 32 == 0) * 4;
+}
+
+template <typename T>
+__device__ __forceinline__ ResultItem<T> xor_swap(ResultItem<T> x, int mask, int dir)
+{
+  ResultItem<T> peer;  // item held by the lane whose id is (this lane ^ mask)
+  peer.dist() = __shfl_xor_sync(raft::warp_full_mask(), x.dist(), mask, raft::warp_size());
+  peer.id_with_flag() =
+    __shfl_xor_sync(raft::warp_full_mask(), x.id_with_flag(), mask, raft::warp_size());
+  return ((x < peer) == dir) ? peer : x;  // dir picks which of the two this lane keeps
+}
+
+__device__ __forceinline__ int xor_swap(int x, int mask, int dir)
+{
+  const int peer = __shfl_xor_sync(raft::warp_full_mask(), x, mask, raft::warp_size());
+  return ((x < peer) == dir) ? peer : x;  // dir == 1: keep the larger; dir == 0: the smaller
+}
+
+// TODO: Move to RAFT utils https://github.com/rapidsai/raft/issues/1827
+__device__ __forceinline__ uint bfe(uint lane_id, uint pos)
+{
+  uint bit;  // receives the single bit of lane_id located at position pos
+  asm("bfe.u32 %0,%1,%2,%3;" : "=r"(bit) : "r"(lane_id), "r"(pos), "r"(1));  // field length 1
+  return bit;
+}
+
+template <typename T>
+__device__ __forceinline__ void warp_bitonic_sort(T* element_ptr, const int lane_id)
+{
+  static_assert(raft::warp_size() == 32);  // the exchange network below is wired for 32 lanes
+  auto& element = *element_ptr;            // each lane contributes and receives one element
+  element       = xor_swap(element, 0x01, bfe(lane_id, 1) ^ bfe(lane_id, 0));  // stage 1
+  element       = xor_swap(element, 0x02, bfe(lane_id, 2) ^ bfe(lane_id, 1));  // stage 2
+  element       = xor_swap(element, 0x01, bfe(lane_id, 2) ^ bfe(lane_id, 0));
+  element       = xor_swap(element, 0x04, bfe(lane_id, 3) ^ bfe(lane_id, 2));  // stage 3
+  element       = xor_swap(element, 0x02, bfe(lane_id, 3) ^ bfe(lane_id, 1));
+  element       = xor_swap(element, 0x01, bfe(lane_id, 3) ^ bfe(lane_id, 0));
+  element       = xor_swap(element, 0x08, bfe(lane_id, 4) ^ bfe(lane_id, 3));  // stage 4
+  element       = xor_swap(element, 0x04, bfe(lane_id, 4) ^ bfe(lane_id, 2));
+  element       = xor_swap(element, 0x02, bfe(lane_id, 4) ^ bfe(lane_id, 1));
+  element       = xor_swap(element, 0x01, bfe(lane_id, 4) ^ bfe(lane_id, 0));
+  element       = xor_swap(element, 0x10, bfe(lane_id, 4));  // final merge: the lane whose
+  element       = xor_swap(element, 0x08, bfe(lane_id, 3));  // mask bit is set keeps the
+  element       = xor_swap(element, 0x04, bfe(lane_id, 2));  // larger item, so the result
+  element       = xor_swap(element, 0x02, bfe(lane_id, 1));  // is ascending by lane id
+  element       = xor_swap(element, 0x01, bfe(lane_id, 0));
+  return;
+}
+
+struct BuildConfig {
+  size_t max_dataset_size;         // no default; presumably an upper bound on rows - confirm
+  size_t dataset_dim;              // no default; presumably the vector dimensionality - confirm
+  size_t node_degree{64};
+  size_t internal_node_degree{0};
+  // If internal_node_degree == 0, the value of node_degree will be assigned to it
+  size_t max_iterations{50};
+  float termination_threshold{0.0001};  // NOTE(review): exact stopping rule is defined elsewhere
+};
+
+// Host-side Bloom filter made of nrow lists; each list owns num_sets_per_list sets of
+// num_bits_per_set_ bits. A key picks its set with key % num_sets_per_list and is probed
+// num_hashs times inside that set.
+template <typename Index_t>
+class BloomFilter {
+ public:
+  BloomFilter(size_t nrow, size_t num_sets_per_list, size_t num_hashs)
+    : bitsets_(nrow * num_bits_per_set_ * num_sets_per_list),  // listed in declaration order
+      nrow_(nrow),
+      num_sets_per_list_(num_sets_per_list),
+      num_hashs_(num_hashs)
+  {
+  }
+
+  void add(size_t list_id, Index_t key)
+  {
+    if (is_cleared) { is_cleared = false; }  // write the flag only on the first add after clear
+    uint32_t hash         = hash_0(key);
+    size_t global_set_idx = list_id * num_bits_per_set_ * num_sets_per_list_ +
+                            key % num_sets_per_list_ * num_bits_per_set_;
+    bitsets_[global_set_idx + hash % num_bits_per_set_] = 1;
+    for (size_t i = 1; i < num_hashs_; i++) {
+      hash                                                = hash + hash_1(key);
+      bitsets_[global_set_idx + hash % num_bits_per_set_] = 1;
+    }
+  }
+
+  bool check(size_t list_id, Index_t key)
+  {
+    uint32_t hash         = hash_0(key);
+    size_t global_set_idx = list_id * num_bits_per_set_ * num_sets_per_list_ +
+                            key % num_sets_per_list_ * num_bits_per_set_;
+    // The key may be present only if every probed bit is set; stop at the first clear bit.
+    if (!bitsets_[global_set_idx + hash % num_bits_per_set_]) return false;
+    for (size_t i = 1; i < num_hashs_; i++) {
+      hash = hash + hash_1(key);
+      if (!bitsets_[global_set_idx + hash % num_bits_per_set_]) return false;
+    }
+    return true;
+  }
+
+  void clear()
+  {
+    if (is_cleared) return;
+    // std::vector<bool> packs many flags into one word, so clearing single bits from several
+    // OpenMP threads can race on a shared word and leave stale bits set. Reset the whole
+    // container with one call instead: the size is unchanged and every flag becomes false.
+    bitsets_.assign(nrow_ * num_bits_per_set_ * num_sets_per_list_, false);
+    is_cleared = true;
+  }
+
+ private:
+  uint32_t hash_0(uint32_t value)
+  {
+    value *= 1103515245;  // linear-congruential step
+    value += 12345;
+    value ^= value << 13;  // xorshift-style mixing (13, 17, 5)
+    value ^= value >> 17;
+    value ^= value << 5;
+    return value;
+  }
+
+  uint32_t hash_1(uint32_t value)
+  {
+    value *= 1664525;  // same shape as hash_0 with different multiplier and increment
+    value += 1013904223;
+    value ^= value << 13;
+    value ^= value >> 17;
+    value ^= value << 5;
+    return value;
+  }
+
+  static constexpr int num_bits_per_set_ = 512;
+  bool is_cleared{true};  // true while no bit has been set since construction or clear()
+  std::vector<bool> bitsets_;
+  size_t nrow_;
+  size_t num_sets_per_list_;
+  size_t num_hashs_;
+};
+
+template <typename Index_t>
+struct GnndGraph {
+  static constexpr int segment_size = 32;
+  InternalID_t<Index_t>* h_graph;  // NOTE(review): raw pointer; ownership is set up elsewhere
+
+  size_t nrow;
+  size_t node_degree;
+  int num_samples;
+  int num_segments;
+
+  raft::host_matrix<DistData_t, size_t, raft::row_major> h_dists;
+  // The four vectors below live in pinned host memory (see pinned_memory_allocator).
+  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_graph_new;
+  thrust::host_vector<int2, pinned_memory_allocator<int2>> h_list_sizes_new;
+
+  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_graph_old;
+  thrust::host_vector<int2, pinned_memory_allocator<int2>> h_list_sizes_old;
+  BloomFilter<Index_t> bloom_filter;
+  // Non-copyable.
+  GnndGraph(const GnndGraph&)            = delete;
+  GnndGraph& operator=(const GnndGraph&) = delete;
+  GnndGraph(const size_t nrow,
+            const size_t node_degree,
+            const size_t internal_node_degree,
+            const size_t num_samples);
+  void init_random_graph();
+  // TODO: Create a generic bloom filter utility https://github.com/rapidsai/raft/issues/1827
+  // Use Bloom filter to sample "new" neighbors for local joining
+  void sample_graph_new(InternalID_t<Index_t>* new_neighbors, const size_t width);
+  void sample_graph(bool sample_new);
+  void update_graph(const InternalID_t<Index_t>* new_neighbors,
+                    const DistData_t* new_dists,
+                    const size_t width,
+                    std::atomic<int64_t>& update_counter);
+  void sort_lists();
+  void clear();
+  ~GnndGraph();  // user-provided; defined out of line
+};
+
+template <typename Data_t = float, typename Index_t = int>
+class GNND {
+ public:
+  GNND(raft::resources const& res, const BuildConfig& build_config);
+  GNND(const GNND&)            = delete;  // non-copyable
+  GNND& operator=(const GNND&) = delete;
+
+  void build(Data_t* data, const Index_t nrow, Index_t* output_graph);
+  ~GNND()    = default;
+  using ID_t = InternalID_t<Index_t>;
+
+ private:
+  void add_reverse_edges(Index_t* graph_ptr,
+                         Index_t* h_rev_graph_ptr,
+                         Index_t* d_rev_graph_ptr,
+                         int2* list_sizes,
+                         cudaStream_t stream = 0);
+  void local_join(cudaStream_t stream = 0);
+
+  raft::resources const& res;  // stored by reference: must outlive this object
+
+  BuildConfig build_config_;
+  GnndGraph<Index_t> graph_;
+  std::atomic<int64_t> update_counter_;
+
+  Index_t nrow_;
+  const int ndim_;
+
+  raft::device_matrix<__half, Index_t, raft::row_major> d_data_;  // dataset held as __half
+  raft::device_vector<DistData_t, Index_t> l2_norms_;
+
+  raft::device_matrix<ID_t, Index_t, raft::row_major> graph_buffer_;
+  raft::device_matrix<DistData_t, Index_t, raft::row_major> dists_buffer_;
+
+  // TODO: Investigate using RMM/RAFT types https://github.com/rapidsai/raft/issues/1827
+  thrust::host_vector<ID_t, pinned_memory_allocator<ID_t>> graph_host_buffer_;
+  thrust::host_vector<DistData_t, pinned_memory_allocator<DistData_t>> dists_host_buffer_;
+
+  raft::device_vector<int, Index_t> d_locks_;
+
+  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_rev_graph_new_;
+  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_graph_old_;
+  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_rev_graph_old_;
+
+  // int2.x is the number of forward edges, int2.y is the number of reverse edges
+  raft::device_vector<int2, Index_t> d_list_sizes_new_;
+  raft::device_vector<int2, Index_t> d_list_sizes_old_;
+};
+
+constexpr int TILE_ROW_WIDTH = 64;
+constexpr int TILE_COL_WIDTH = 128;  // columns of the shared-memory vector tiles in local_join
+
+constexpr int NUM_SAMPLES = 32;
+// For now, the maximum number of samples is 32, so the sample cache size is fixed
+// to 64 (32 * 2).
+constexpr int MAX_NUM_BI_SAMPLES        = 64;
+constexpr int SKEWED_MAX_NUM_BI_SAMPLES = skew_dim<float>(MAX_NUM_BI_SAMPLES);  // 64 + 4 = 68
+constexpr int BLOCK_SIZE                = 512;  // used in local_join_kernel's launch bounds
+constexpr int WMMA_M                    = 16;   // WMMA_*: presumably the wmma fragment shape
+constexpr int WMMA_N                    = 16;
+constexpr int WMMA_K                    = 16;
+
+template <typename Data_t>
+__device__ __forceinline__ void load_vec(Data_t* vec_buffer,
+                                         const Data_t* d_vec,
+                                         const int load_dims,
+                                         const int padding_dims,
+                                         const int lane_id)
+{
+  if constexpr (std::is_same_v<Data_t, float> || std::is_same_v<Data_t, uint8_t> ||
+                std::is_same_v<Data_t, int8_t>) {
+    constexpr int stride = raft::warp_size();  // one element per lane per pass
+    for (int pass = 0; pass < ceildiv(padding_dims, stride); pass++) {
+      const int pos = pass * stride + lane_id;
+      if (pos < load_dims) {
+        vec_buffer[pos] = d_vec[pos];
+      } else if (pos < padding_dims) {
+        vec_buffer[pos] = 0.0f;  // zero-fill the padding tail
+      }
+    }
+  }
+  if constexpr (std::is_same_v<Data_t, __half>) {
+    if ((size_t)d_vec % sizeof(float2) == 0 && (size_t)vec_buffer % sizeof(float2) == 0 &&
+        load_dims % 4 == 0 && padding_dims % 4 == 0) {
+      constexpr int stride = raft::warp_size() * 4;  // four __half values per lane per pass
+#pragma unroll
+      for (int pass = 0; pass < ceildiv(padding_dims, stride); pass++) {
+        const int pos = pass * stride + lane_id * 4;
+        if (pos + 4 <= load_dims) {
+          *(float2*)(vec_buffer + pos) = *(const float2*)(d_vec + pos);  // 8-byte copy
+        } else if (pos + 4 <= padding_dims) {
+          *(float2*)(vec_buffer + pos) = make_float2(0.0f, 0.0f);
+        }
+      }
+    } else {
+      constexpr int stride = raft::warp_size();  // unaligned fallback: element by element
+      for (int pass = 0; pass < ceildiv(padding_dims, stride); pass++) {
+        const int pos = pass * stride + lane_id;
+        if (pos < load_dims) {
+          vec_buffer[pos] = d_vec[pos];
+        } else if (pos < padding_dims) {
+          vec_buffer[pos] = 0.0f;
+        }
+      }
+    }
+  }
+}
+
+// TODO: Replace with RAFT utilities https://github.com/rapidsai/raft/issues/1827
+/** Cast rows to __half; store squared L2 norms, or normalize rows when l2_norms is null. */
+template <typename Data_t>
+__global__ void preprocess_data_kernel(const Data_t* input_data,
+                                       __half* output_data,
+                                       int dim,
+                                       DistData_t* l2_norms,
+                                       size_t list_offset = 0)
+{
+  extern __shared__ char buffer[];
+  __shared__ float l2_norm;
+  Data_t* s_vec  = (Data_t*)buffer;
+  size_t list_id = list_offset + blockIdx.x;  // output row; blockIdx.x is the input row
+  // Widen before multiplying so the row offset cannot wrap in 32-bit arithmetic.
+  const int lane_id = threadIdx.x % raft::warp_size();
+  load_vec(s_vec, input_data + (size_t)blockIdx.x * dim, dim, dim, lane_id);
+  if (threadIdx.x == 0) { l2_norm = 0; }
+  __syncthreads();
+  for (int step = 0; step < ceildiv(dim, raft::warp_size()); step++) {
+    int idx         = step * raft::warp_size() + lane_id;
+    float part_dist = 0;
+    if (idx < dim) {
+      part_dist = s_vec[idx];
+      part_dist = part_dist * part_dist;
+    }
+    __syncwarp();
+    for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) {
+      part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset);
+    }
+    if (lane_id == 0) { l2_norm += part_dist; }  // NOTE(review): assumes one warp per block
+    __syncwarp();
+  }
+  // Write the row out: divided by sqrt(l2_norm) when no norm buffer is given, else unchanged.
+  for (int step = 0; step < ceildiv(dim, raft::warp_size()); step++) {
+    int idx = step * raft::warp_size() + threadIdx.x;
+    if (idx < dim) {
+      if (l2_norms == nullptr) {
+        output_data[list_id * dim + idx] =
+          (float)input_data[(size_t)blockIdx.x * dim + idx] / sqrt(l2_norm);
+      } else {
+        output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx];
+        if (idx == 0) { l2_norms[list_id] = l2_norm; }  // sum of squares, no square root
+      }
+    }
+  }
+}
+
+template <typename Index_t>
+__global__ void add_rev_edges_kernel(const Index_t* graph,
+                                     Index_t* rev_graph,
+                                     int num_samples,
+                                     int2* list_sizes)
+{
+  const size_t src     = blockIdx.x;  // one block per source list
+  const int2 src_sizes = list_sizes[src];
+  // Threads of the block stride over the forward edges (.x of them) of this list.
+  for (int e = threadIdx.x; e < src_sizes.x; e += blockDim.x) {
+    // each node has same number (num_samples) of forward and reverse edges
+    const size_t dst = graph[src * num_samples + e];
+    // Reserve a slot in dst's reverse list by bumping its reverse-edge counter (.y).
+    const int slot = atomicAdd(&list_sizes[dst].y, 1);
+    if (slot < num_samples) {
+      rev_graph[dst * num_samples + slot] = src;
+    } else {
+      atomicExch(&list_sizes[dst].y, num_samples);  // list is full: clamp the counter
+    }
+  }
+}
+
+template <typename Index_t, typename ID_t = InternalID_t<Index_t>>
+__device__ void insert_to_global_graph(ResultItem<Index_t> elem,
+                                       size_t list_id,
+                                       ID_t* graph,
+                                       DistData_t* dists,
+                                       int node_degree,
+                                       int* locks)
+{
+  int tx                 = threadIdx.x;
+  int lane_id            = tx % raft::warp_size();
+  size_t global_idx_base = list_id * node_degree;
+  if (elem.id() == list_id) return;  // never insert a node into its own list
+  // A list is split into warp-sized segments, each guarded by its own lock word.
+  const int num_segments = ceildiv(node_degree, raft::warp_size());
+
+  int loop_flag = 0;
+  do {
+    int segment_id = elem.id() % num_segments;  // the element's id selects the segment
+    if (lane_id == 0) {
+      loop_flag = atomicCAS(&locks[list_id * num_segments + segment_id], 0, 1) == 0;
+    }
+    // Broadcast lane 0's result so the whole warp agrees on whether the lock was taken.
+    loop_flag = __shfl_sync(raft::warp_full_mask(), loop_flag, 0);
+
+    if (loop_flag == 1) {
+      ResultItem<Index_t> knn_list_frag;  // stays at its max-valued default past node_degree
+      int local_idx     = segment_id * raft::warp_size() + lane_id;
+      size_t global_idx = global_idx_base + local_idx;
+      if (local_idx < node_degree) {
+        knn_list_frag.id_with_flag() = graph[global_idx].id_with_flag();
+        knn_list_frag.dist()         = dists[global_idx];
+      }
+
+      int pos_to_insert = -1;
+      ResultItem<Index_t> prev_elem;
+      // Each lane also looks at the entry held by the lane before it.
+      prev_elem.id_with_flag() =
+        __shfl_up_sync(raft::warp_full_mask(), knn_list_frag.id_with_flag(), 1);
+      prev_elem.dist() = __shfl_up_sync(raft::warp_full_mask(), knn_list_frag.dist(), 1);
+      // NOTE(review): id() on the INT_MIN sentinel evaluates -INT_MIN - 1, which overflows int.
+      if (lane_id == 0) {
+        prev_elem = ResultItem<Index_t>{std::numeric_limits<Index_t>::min(),
+                                        std::numeric_limits<DistData_t>::lowest()};
+      }
+      if (elem > prev_elem && elem < knn_list_frag) {
+        pos_to_insert = segment_id * raft::warp_size() + lane_id;
+      } else if (elem == prev_elem || elem == knn_list_frag) {
+        pos_to_insert = -2;  // same id is already stored next to this lane
+      }
+      uint mask = __ballot_sync(raft::warp_full_mask(), pos_to_insert >= 0);
+      if (mask) {
+        uint set_lane_id = __fns(mask, 0, 1);  // lowest lane that found an insert position
+        pos_to_insert    = __shfl_sync(raft::warp_full_mask(), pos_to_insert, set_lane_id);
+      }
+      // Shift entries at and after the insert position one slot right, dropping any overflow.
+      if (pos_to_insert >= 0) {
+        int local_idx = segment_id * raft::warp_size() + lane_id;
+        if (local_idx > pos_to_insert) {
+          local_idx++;
+        } else if (local_idx == pos_to_insert) {
+          graph[global_idx_base + local_idx].id_with_flag() = elem.id_with_flag();
+          dists[global_idx_base + local_idx]                = elem.dist();
+          local_idx++;
+        }
+        size_t global_pos = global_idx_base + local_idx;
+        if (local_idx < (segment_id + 1) * raft::warp_size() && local_idx < node_degree) {
+          graph[global_pos].id_with_flag() = knn_list_frag.id_with_flag();
+          dists[global_pos]                = knn_list_frag.dist();
+        }
+      }
+      __threadfence();  // fence the writes above before the lock word is reset
+      if (loop_flag && lane_id == 0) { atomicExch(&locks[list_id * num_segments + segment_id], 0); }
+    }
+  } while (!loop_flag);  // retry until this warp has held the lock once
+}
+
+template <typename Index_t>
+__device__ ResultItem<Index_t> get_min_item(const Index_t id,
+                                            const int idx_in_list,
+                                            const Index_t* neighbs,
+                                            const DistData_t* distances,
+                                            const bool find_in_row = true)
+{
+  int lane_id = threadIdx.x % raft::warp_size();
+  // Warp-wide arg-min over one row (find_in_row) or one column of the skewed distance matrix.
+  static_assert(MAX_NUM_BI_SAMPLES == 64);
+  int idx[MAX_NUM_BI_SAMPLES / raft::warp_size()];
+  float dist[MAX_NUM_BI_SAMPLES / raft::warp_size()] = {std::numeric_limits<DistData_t>::max(),
+                                                        std::numeric_limits<DistData_t>::max()};
+  idx[0]                                             = lane_id;
+  idx[1]                                             = raft::warp_size() + lane_id;
+  // Each lane owns candidates lane_id and lane_id + 32; a candidate equal to id is skipped.
+  if (neighbs[idx[0]] != id) {
+    dist[0] = find_in_row ? distances[idx_in_list * SKEWED_MAX_NUM_BI_SAMPLES + lane_id]
+                          : distances[idx_in_list + lane_id * SKEWED_MAX_NUM_BI_SAMPLES];
+  }
+
+  if (neighbs[idx[1]] != id) {
+    dist[1] =
+      find_in_row
+        ? distances[idx_in_list * SKEWED_MAX_NUM_BI_SAMPLES + raft::warp_size() + lane_id]
+        : distances[idx_in_list + (raft::warp_size() + lane_id) * SKEWED_MAX_NUM_BI_SAMPLES];
+  }
+  // Keep the better of this lane's two candidates, then reduce across the warp.
+  if (dist[1] < dist[0]) {
+    dist[0] = dist[1];
+    idx[0]  = idx[1];
+  }
+  __syncwarp();
+  for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) {
+    int other_idx    = __shfl_down_sync(raft::warp_full_mask(), idx[0], offset);
+    float other_dist = __shfl_down_sync(raft::warp_full_mask(), dist[0], offset);
+    if (other_dist < dist[0]) {
+      dist[0] = other_dist;
+      idx[0]  = other_idx;
+    }
+  }
+  // Lane 0 holds the reduction result; broadcast it so every lane returns the same item.
+  ResultItem<Index_t> result;
+  result.dist()         = __shfl_sync(raft::warp_full_mask(), dist[0], 0);
+  result.id_with_flag() = neighbs[__shfl_sync(raft::warp_full_mask(), idx[0], 0)];
+  return result;
+}
+
+template <typename T>
+__device__ __forceinline__ void remove_duplicates(
+  T* list_a, int list_a_size, T* list_b, int list_b_size, int& unique_counter, int execute_warp_id)
+{
+  static_assert(raft::warp_size() == 32);
+  if (!(threadIdx.x >= execute_warp_id * raft::warp_size() &&
+        threadIdx.x < execute_warp_id * raft::warp_size() + raft::warp_size())) {
+    return;  // only the warp numbered execute_warp_id does the work
+  }
+  int lane_id = threadIdx.x % raft::warp_size();
+  T elem      = std::numeric_limits<T>::max();
+  if (lane_id < list_a_size) { elem = list_a[lane_id]; }
+  warp_bitonic_sort(&elem, lane_id);
+  // Store list_a back in sorted order; the max-valued padding entries are not written.
+  if (elem != std::numeric_limits<T>::max()) { list_a[lane_id] = elem; }
+
+  T elem_b = std::numeric_limits<T>::max();
+
+  if (lane_id < list_b_size) { elem_b = list_b[lane_id]; }
+  __syncwarp();
+  // Each lane binary-searches the sorted list_a for its own element of list_b.
+  int idx_l    = 0;
+  int idx_r    = list_a_size;
+  bool existed = false;
+  while (idx_l < idx_r) {
+    int idx         = (idx_l + idx_r) / 2;
+    const T mid_val = list_a[idx];  // typed as T so wider element types are not truncated
+    if (mid_val == elem_b) {
+      existed = true;
+      break;
+    }
+    if (elem_b > mid_val) {
+      idx_l = idx + 1;
+    } else {
+      idx_r = idx;
+    }
+  }
+  if (!existed && elem_b != std::numeric_limits<T>::max()) {
+    int idx                   = atomicAdd(&unique_counter, 1);  // slot after list_a's entries
+    list_a[list_a_size + idx] = elem_b;
+  }
+}
+
+// Local-join kernel: one thread block processes the sampled lists of one node
+// (`list_id = blockIdx.x`).
+//
+// Steps, as implemented below:
+//   1. Load the forward and reverse "new" and "old" sampled neighbor ids into shared memory and
+//      merge each reverse list into its forward list with `remove_duplicates`.
+//   2. Accumulate the new x new products with WMMA, tile by tile over the feature dimension,
+//      convert them to distances and, for every new neighbor, pass its closest candidate
+//      (`get_min_item`) to `insert_to_global_graph`.
+//   3. Repeat for new x old, handling both the new neighbors (row-wise minimum) and the old
+//      neighbors (column-wise minimum).
+//
+// When `l2_norms` is nullptr the distance is the negated product; otherwise it is
+// l2_norms[a] + l2_norms[b] - 2 * product(a, b).
+//
+// The whole body is compiled only for __CUDA_ARCH__ >= 700; for older architectures the kernel is
+// empty.
+//
+// launch_bounds here denote BLOCK_SIZE = 512 and MIN_BLOCKS_PER_SM = 4
+// Per
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications,
+// MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048
+// For architectures 750 and 860, the values of MAX_RESIDENT_THREAD_PER_SM
+// are 1024 and 1536 respectively, which means the bounds don't work anymore
+template <typename Index_t, typename ID_t = InternalID_t<Index_t>>
+__global__ void
+#ifdef __CUDA_ARCH__
+#if (__CUDA_ARCH__) == 750 || (__CUDA_ARCH__) == 860
+__launch_bounds__(BLOCK_SIZE)
+#else
+__launch_bounds__(BLOCK_SIZE, 4)
+#endif
+#endif
+  local_join_kernel(const Index_t* graph_new,
+                    const Index_t* rev_graph_new,
+                    const int2* sizes_new,
+                    const Index_t* graph_old,
+                    const Index_t* rev_graph_old,
+                    const int2* sizes_old,
+                    const int width,
+                    const __half* data,
+                    const int data_dim,
+                    ID_t* graph,
+                    DistData_t* dists,
+                    int graph_width,
+                    int* locks,
+                    DistData_t* l2_norms)
+{
+#if (__CUDA_ARCH__ >= 700)
+  using namespace nvcuda;
+  // First half: new neighbor ids, second half: old neighbor ids.
+  __shared__ int s_list[MAX_NUM_BI_SAMPLES * 2];
+
+  constexpr int APAD = 8;
+  constexpr int BPAD = 8;
+  __shared__ __half s_nv[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + APAD];  // New vectors
+  __shared__ __half s_ov[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + BPAD];  // Old vectors
+  static_assert(sizeof(float) * MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES <=
+                sizeof(__half) * MAX_NUM_BI_SAMPLES * (TILE_COL_WIDTH + BPAD));
+  // s_distances: MAX_NUM_BI_SAMPLES x SKEWED_MAX_NUM_BI_SAMPLES, reuse the space of s_ov
+  float* s_distances    = (float*)&s_ov[0][0];
+  // The two de-duplication counters also live at the start of s_ov; they are only needed until
+  // the merged list sizes have been read back below.
+  int* s_unique_counter = (int*)&s_ov[0][0];
+
+  if (threadIdx.x == 0) {
+    s_unique_counter[0] = 0;
+    s_unique_counter[1] = 0;
+  }
+
+  Index_t* new_neighbors = s_list;
+  Index_t* old_neighbors = s_list + MAX_NUM_BI_SAMPLES;
+
+  // .x is the length of the forward list, .y the length of the reverse list.
+  size_t list_id      = blockIdx.x;
+  int2 list_new_size2 = sizes_new[list_id];
+  int list_new_size   = list_new_size2.x + list_new_size2.y;
+  int2 list_old_size2 = sizes_old[list_id];
+  int list_old_size   = list_old_size2.x + list_old_size2.y;
+
+  // Nothing to join without new neighbors. The condition is uniform across the block.
+  if (!list_new_size) return;
+  int tx = threadIdx.x;
+
+  // Forward entries first, followed by the reverse entries.
+  if (tx < list_new_size2.x) {
+    new_neighbors[tx] = graph_new[list_id * width + tx];
+  } else if (tx >= list_new_size2.x && tx < list_new_size) {
+    new_neighbors[tx] = rev_graph_new[list_id * width + tx - list_new_size2.x];
+  }
+
+  if (tx < list_old_size2.x) {
+    old_neighbors[tx] = graph_old[list_id * width + tx];
+  } else if (tx >= list_old_size2.x && tx < list_old_size) {
+    old_neighbors[tx] = rev_graph_old[list_id * width + tx - list_old_size2.x];
+  }
+
+  __syncthreads();
+
+  // Warp 0 merges the new lists, warp 1 the old lists.
+  remove_duplicates(new_neighbors,
+                    list_new_size2.x,
+                    new_neighbors + list_new_size2.x,
+                    list_new_size2.y,
+                    s_unique_counter[0],
+                    0);
+
+  remove_duplicates(old_neighbors,
+                    list_old_size2.x,
+                    old_neighbors + list_old_size2.x,
+                    list_old_size2.y,
+                    s_unique_counter[1],
+                    1);
+  __syncthreads();
+  list_new_size = list_new_size2.x + s_unique_counter[0];
+  list_old_size = list_old_size2.x + s_unique_counter[1];
+
+  int warp_id             = threadIdx.x / raft::warp_size();
+  int lane_id             = threadIdx.x % raft::warp_size();
+  constexpr int num_warps = BLOCK_SIZE / raft::warp_size();
+
+  // Each warp owns one WMMA_M x WMMA_N tile of the result; warps are laid out four per tile row.
+  int warp_id_y = warp_id / 4;
+  int warp_id_x = warp_id % 4;
+
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;
+  wmma::fill_fragment(c_frag, 0.0);
+  // ---- new x new: accumulate over column tiles of the input vectors ----
+  for (int step = 0; step < ceildiv(data_dim, TILE_COL_WIDTH); step++) {
+    // The last tile may be narrower than TILE_COL_WIDTH.
+    int num_load_elems = (step == ceildiv(data_dim, TILE_COL_WIDTH) - 1)
+                           ? data_dim - step * TILE_COL_WIDTH
+                           : TILE_COL_WIDTH;
+#pragma unroll
+    for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) {
+      int idx = i * num_warps + warp_id;
+      if (idx < list_new_size) {
+        size_t neighbor_id = new_neighbors[idx];
+        size_t idx_in_data = neighbor_id * data_dim;
+        load_vec(s_nv[idx],
+                 data + idx_in_data + step * TILE_COL_WIDTH,
+                 num_load_elems,
+                 TILE_COL_WIDTH,
+                 lane_id);
+      }
+    }
+    __syncthreads();
+
+    for (int i = 0; i < TILE_COL_WIDTH / WMMA_K; i++) {
+      wmma::load_matrix_sync(a_frag, s_nv[warp_id_y * WMMA_M] + i * WMMA_K, TILE_COL_WIDTH + APAD);
+      wmma::load_matrix_sync(b_frag, s_nv[warp_id_x * WMMA_N] + i * WMMA_K, TILE_COL_WIDTH + BPAD);
+      wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+      __syncthreads();
+    }
+  }
+
+  wmma::store_matrix_sync(
+    s_distances + warp_id_y * WMMA_M * SKEWED_MAX_NUM_BI_SAMPLES + warp_id_x * WMMA_N,
+    c_frag,
+    SKEWED_MAX_NUM_BI_SAMPLES,
+    wmma::mem_row_major);
+  __syncthreads();
+
+  // Convert the accumulated products to distances; entries outside the valid
+  // list_new_size x list_new_size region are set to the maximum float.
+  for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
+    if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size &&
+        i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) {
+      if (l2_norms == nullptr) {
+        s_distances[i] = -s_distances[i];
+      } else {
+        s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] +
+                         l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] -
+                         2.0 * s_distances[i];
+      }
+    } else {
+      s_distances[i] = std::numeric_limits<float>::max();
+    }
+  }
+  __syncthreads();
+
+  // One warp per new neighbor: find its closest new neighbor and try to insert that edge.
+  for (int step = 0; step < ceildiv(list_new_size, num_warps); step++) {
+    int idx_in_list = step * num_warps + tx / raft::warp_size();
+    if (idx_in_list >= list_new_size) continue;
+    auto min_elem = get_min_item(s_list[idx_in_list], idx_in_list, new_neighbors, s_distances);
+    // gridDim.x equals the number of lists, so this rejects ids that are not valid rows.
+    if (min_elem.id() < gridDim.x) {
+      insert_to_global_graph(min_elem, s_list[idx_in_list], graph, dists, graph_width, locks);
+    }
+  }
+
+  // Without old neighbors there is no new x old join. The condition is uniform across the block.
+  if (!list_old_size) return;
+
+  // s_distances aliases s_ov, so all reads above must finish before s_ov is refilled.
+  __syncthreads();
+
+  // ---- new x old ----
+  wmma::fill_fragment(c_frag, 0.0);
+  for (int step = 0; step < ceildiv(data_dim, TILE_COL_WIDTH); step++) {
+    int num_load_elems = (step == ceildiv(data_dim, TILE_COL_WIDTH) - 1)
+                           ? data_dim - step * TILE_COL_WIDTH
+                           : TILE_COL_WIDTH;
+    // When the vectors fit in a single tile, s_nv still holds them from the first phase and does
+    // not need to be reloaded.
+    if (TILE_COL_WIDTH < data_dim) {
+#pragma unroll
+      for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) {
+        int idx = i * num_warps + warp_id;
+        if (idx < list_new_size) {
+          size_t neighbor_id = new_neighbors[idx];
+          size_t idx_in_data = neighbor_id * data_dim;
+          load_vec(s_nv[idx],
+                   data + idx_in_data + step * TILE_COL_WIDTH,
+                   num_load_elems,
+                   TILE_COL_WIDTH,
+                   lane_id);
+        }
+      }
+    }
+#pragma unroll
+    for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) {
+      int idx = i * num_warps + warp_id;
+      if (idx < list_old_size) {
+        size_t neighbor_id = old_neighbors[idx];
+        size_t idx_in_data = neighbor_id * data_dim;
+        load_vec(s_ov[idx],
+                 data + idx_in_data + step * TILE_COL_WIDTH,
+                 num_load_elems,
+                 TILE_COL_WIDTH,
+                 lane_id);
+      }
+    }
+    __syncthreads();
+
+    for (int i = 0; i < TILE_COL_WIDTH / WMMA_K; i++) {
+      wmma::load_matrix_sync(a_frag, s_nv[warp_id_y * WMMA_M] + i * WMMA_K, TILE_COL_WIDTH + APAD);
+      wmma::load_matrix_sync(b_frag, s_ov[warp_id_x * WMMA_N] + i * WMMA_K, TILE_COL_WIDTH + BPAD);
+      wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+      __syncthreads();
+    }
+  }
+
+  wmma::store_matrix_sync(
+    s_distances + warp_id_y * WMMA_M * SKEWED_MAX_NUM_BI_SAMPLES + warp_id_x * WMMA_N,
+    c_frag,
+    SKEWED_MAX_NUM_BI_SAMPLES,
+    wmma::mem_row_major);
+  __syncthreads();
+
+  // Rows index new neighbors, columns index old neighbors.
+  for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
+    if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_old_size &&
+        i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) {
+      if (l2_norms == nullptr) {
+        s_distances[i] = -s_distances[i];
+      } else {
+        s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] +
+                         l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] -
+                         2.0 * s_distances[i];
+      }
+    } else {
+      s_distances[i] = std::numeric_limits<float>::max();
+    }
+  }
+  __syncthreads();
+
+  // Indices [0, MAX_NUM_BI_SAMPLES) address new neighbors (search along a row for the closest old
+  // neighbor); indices [MAX_NUM_BI_SAMPLES, 2 * MAX_NUM_BI_SAMPLES) address old neighbors (search
+  // along a column for the closest new neighbor). Slots beyond either list are skipped.
+  for (int step = 0; step < ceildiv(MAX_NUM_BI_SAMPLES * 2, num_warps); step++) {
+    int idx_in_list = step * num_warps + tx / raft::warp_size();
+    if (idx_in_list >= list_new_size && idx_in_list < MAX_NUM_BI_SAMPLES) continue;
+    if (idx_in_list >= MAX_NUM_BI_SAMPLES + list_old_size && idx_in_list < MAX_NUM_BI_SAMPLES * 2)
+      continue;
+    ResultItem<Index_t> min_elem{std::numeric_limits<Index_t>::max(),
+                                 std::numeric_limits<DistData_t>::max()};
+    if (idx_in_list < MAX_NUM_BI_SAMPLES) {
+      auto temp_min_item =
+        get_min_item(s_list[idx_in_list], idx_in_list, old_neighbors, s_distances);
+      if (temp_min_item.dist() < min_elem.dist()) { min_elem = temp_min_item; }
+    } else {
+      auto temp_min_item = get_min_item(
+        s_list[idx_in_list], idx_in_list - MAX_NUM_BI_SAMPLES, new_neighbors, s_distances, false);
+      if (temp_min_item.dist() < min_elem.dist()) { min_elem = temp_min_item; }
+    }
+
+    if (min_elem.id() < gridDim.x) {
+      insert_to_global_graph(min_elem, s_list[idx_in_list], graph, dists, graph_width, locks);
+    }
+  }
+#endif
+}
+
+namespace {
+// Inserts (`neighb_id`, `dist`) into a list of `width` entries that is kept ordered by ascending
+// distance, dropping the current last entry to make room.
+//
+// Returns the position at which the entry was stored, or `width` when nothing was inserted:
+// the distance is larger than the current last distance, the id is already present in the list,
+// or no stored distance is strictly greater than `dist`.
+template <typename Index_t>
+int insert_to_ordered_list(InternalID_t<Index_t>* list,
+                           DistData_t* dist_list,
+                           const int width,
+                           const InternalID_t<Index_t> neighb_id,
+                           const DistData_t dist)
+{
+  // Cheap rejection: worse than the worst entry currently stored.
+  if (dist > dist_list[width - 1]) { return width; }
+
+  // `pos == width` doubles as the "no insertion point found yet" marker. The scan always covers
+  // the whole list so that a duplicate id located after the insertion point is still detected.
+  int pos = width;
+  for (int i = 0; i < width; i++) {
+    if (list[i].id() == neighb_id.id()) { return width; }
+    if (pos == width && dist_list[i] > dist) { pos = i; }
+  }
+  if (pos == width) { return width; }
+
+  // Shift the tail one slot to the right; the last entry falls off the end.
+  const size_t num_shifted = static_cast<size_t>(width - pos - 1);
+  memmove(list + pos + 1, list + pos, sizeof(*list) * num_shifted);
+  memmove(dist_list + pos + 1, dist_list + pos, sizeof(*dist_list) * num_shifted);
+
+  dist_list[pos] = dist;
+  list[pos]      = neighb_id;
+  return pos;
+}
+
+}  // namespace
+
+// Allocates the host-side sampling buffers for a graph of `nrow` lists with `node_degree`
+// entries each. `internal_node_degree` sizes the bloom filter and `num_samples` is the capacity
+// of every per-row new/old sample list. Both degrees must be multiples of `segment_size`.
+template <typename Index_t>
+GnndGraph<Index_t>::GnndGraph(const size_t nrow,
+                              const size_t node_degree,
+                              const size_t internal_node_degree,
+                              const size_t num_samples)
+  : nrow(nrow),
+    node_degree(node_degree),
+    num_samples(num_samples),
+    bloom_filter(nrow, internal_node_degree / segment_size, 3),
+    h_dists{raft::make_host_matrix<DistData_t, size_t, raft::row_major>(nrow, node_degree)},
+    h_graph_new{nrow * num_samples},
+    h_list_sizes_new{nrow},
+    h_graph_old{nrow * num_samples},
+    h_list_sizes_old{nrow}
+{
+  // The adjacency storage is not allocated here; an external buffer is attached later so that no
+  // second copy of the graph is kept in host memory.
+  h_graph = nullptr;
+
+  // Every list is split into segments of `segment_size` entries, so both degrees have to be
+  // whole multiples of it.
+  assert(internal_node_degree % segment_size == 0);
+  assert(node_degree % segment_size == 0);
+  num_segments = node_degree / segment_size;
+}
+
+// Rebuilds the per-row "new" sample lists from `new_neighbors` (`nrow` rows of `width` entries).
+// A candidate is sampled only if the bloom filter has not seen the (row, id) pair before; sampled
+// candidates are recorded in the filter and marked old in `new_neighbors`.
+//
+// This is the only CPU operation that cannot be overlapped with GPU work, so it should be as
+// fast as possible.
+template <typename Index_t>
+void GnndGraph<Index_t>::sample_graph_new(InternalID_t<Index_t>* new_neighbors, const size_t width)
+{
+#pragma omp parallel for
+  for (size_t row = 0; row < nrow; row++) {
+    auto candidates = new_neighbors + row * width;
+    auto sampled    = h_graph_new.data() + row * num_samples;
+
+    h_list_sizes_new[row].x = 0;
+    h_list_sizes_new[row].y = 0;
+
+    for (size_t col = 0; col < width; col++) {
+      auto candidate_id = candidates[col].id();
+      // The first id outside [0, nrow) ends the scan of this row.
+      if ((size_t)candidate_id >= nrow) { break; }
+
+      if (!bloom_filter.check(row, candidate_id)) {
+        bloom_filter.add(row, candidate_id);
+        candidates[col].mark_old();
+        sampled[h_list_sizes_new[row].x++] = candidate_id;
+        // Stop as soon as the sample list is full.
+        if (h_list_sizes_new[row].x == num_samples) { break; }
+      }
+    }
+  }
+}
+
+// Fills `h_graph` with pseudo-random neighbors and resets every distance to the maximum value.
+// Segment `s` of each list only receives ids with `id % num_segments == s`.
+//
+// NOTE(review): `std::random_shuffle` is deprecated since C++14 and removed in C++17; consider
+// `std::shuffle` with an explicit engine (requires <random>).
+// NOTE(review): when `nrow < num_segments` the shuffled sequence is empty and the modulo below
+// divides by zero - confirm callers never build such small graphs.
+template <typename Index_t>
+void GnndGraph<Index_t>::init_random_graph()
+{
+  const size_t segment_count = static_cast<size_t>(num_segments);
+  for (size_t seg_idx = 0; seg_idx < segment_count; seg_idx++) {
+    // Shuffled sequence 0 .. nrow / num_segments - 1; entry v is mapped to the id
+    // v * num_segments + seg_idx, which belongs to this segment by construction.
+    std::vector<Index_t> shuffled(nrow / num_segments);
+    std::iota(shuffled.begin(), shuffled.end(), 0);
+    std::random_shuffle(shuffled.begin(), shuffled.end());
+    const size_t seq_len = shuffled.size();
+
+#pragma omp parallel for
+    for (size_t row = 0; row < nrow; row++) {
+      const size_t segment_begin = row * node_degree + seg_idx * segment_size;
+      auto neighbor_segment      = h_graph + segment_begin;
+      auto dist_segment          = h_dists.data_handle() + segment_begin;
+
+      for (size_t slot = 0; slot < static_cast<size_t>(segment_size); slot++) {
+        const size_t flat_idx = segment_begin + slot;
+        Index_t id            = shuffled[flat_idx % seq_len] * num_segments + seg_idx;
+        // Avoid a self loop by taking the entry one segment further along the sequence.
+        if ((size_t)id == row) {
+          id = shuffled[(flat_idx + segment_size) % seq_len] * num_segments + seg_idx;
+        }
+        dist_segment[slot]                    = std::numeric_limits<DistData_t>::max();
+        neighbor_segment[slot].id_with_flag() = id;
+      }
+    }
+  }
+}
+
+// Rebuilds the per-row sample lists from `h_graph`.
+//
+// Entries are visited slot by slot across all segments (slot 0 of every segment, then slot 1,
+// ...). Entries that are not flagged new go to the old list. Entries flagged new go to the new
+// list only when `sample_new` is true, and are then marked old in `h_graph`. Each list holds at
+// most `num_samples` ids; ids outside [0, nrow) are skipped.
+template <typename Index_t>
+void GnndGraph<Index_t>::sample_graph(bool sample_new)
+{
+#pragma omp parallel for
+  for (size_t row = 0; row < nrow; row++) {
+    h_list_sizes_new[row].x = 0;
+    h_list_sizes_new[row].y = 0;
+    h_list_sizes_old[row].x = 0;
+    h_list_sizes_old[row].y = 0;
+
+    auto row_graph   = h_graph + row * node_degree;
+    auto old_samples = h_graph_old.data() + row * num_samples;
+    auto new_samples = h_graph_new.data() + row * num_samples;
+
+    // Set once both sample lists are full; ends both loops.
+    bool lists_full = false;
+    for (int slot = 0; slot < segment_size && !lists_full; slot++) {
+      for (int seg = 0; seg < num_segments && !lists_full; seg++) {
+        auto neighbor = row_graph[seg * segment_size + slot];
+        if ((size_t)neighbor.id() < nrow) {
+          if (neighbor.is_new()) {
+            if (sample_new && h_list_sizes_new[row].x < num_samples) {
+              row_graph[seg * segment_size + slot].mark_old();
+              new_samples[h_list_sizes_new[row].x++] = neighbor.id();
+            }
+          } else if (h_list_sizes_old[row].x < num_samples) {
+            old_samples[h_list_sizes_old[row].x++] = neighbor.id();
+          }
+          lists_full =
+            h_list_sizes_old[row].x == num_samples && h_list_sizes_new[row].x == num_samples;
+        }
+      }
+    }
+  }
+}
+
+// Merges candidate edges into the host graph.
+//
+// `new_neighbors` / `new_dists` hold `nrow` rows of `width` candidates. A row is read until the
+// first candidate whose distance is the maximum value; self references are skipped. Each
+// candidate is inserted into the segment selected by `id % num_segments`. For rows whose index is
+// a multiple of `counter_interval`, every successful insertion increments `update_counter`.
+template <typename Index_t>
+void GnndGraph<Index_t>::update_graph(const InternalID_t<Index_t>* new_neighbors,
+                                      const DistData_t* new_dists,
+                                      const size_t width,
+                                      std::atomic<int64_t>& update_counter)
+{
+#pragma omp parallel for
+  for (size_t row = 0; row < nrow; row++) {
+    auto row_candidates  = new_neighbors + row * width;
+    auto row_cand_dists  = new_dists + row * width;
+    auto row_graph       = h_graph + row * node_degree;
+    auto row_graph_dists = h_dists.data_handle() + row * node_degree;
+    // Only a subset of the rows contributes to the update statistics.
+    const bool row_is_counted = (row % counter_interval == 0);
+
+    for (size_t col = 0; col < width; col++) {
+      auto candidate      = row_candidates[col];
+      auto candidate_dist = row_cand_dists[col];
+      if (candidate_dist == std::numeric_limits<DistData_t>::max()) { break; }
+      if ((size_t)candidate.id() == row) { continue; }
+
+      int seg_idx      = candidate.id() % num_segments;
+      auto segment     = row_graph + seg_idx * segment_size;
+      auto segment_dst = row_graph_dists + seg_idx * segment_size;
+      int insert_pos =
+        insert_to_ordered_list(segment, segment_dst, segment_size, candidate, candidate_dist);
+      if (row_is_counted && insert_pos != segment_size) { update_counter++; }
+    }
+  }
+}
+
+// Sorts every row of the graph by (distance, id) in ascending order. The ids are written back
+// through `id_with_flag()` using the plain id, so flag bits are not carried over.
+template <typename Index_t>
+void GnndGraph<Index_t>::sort_lists()
+{
+  using entry_t = std::pair<DistData_t, Index_t>;
+
+#pragma omp parallel for
+  for (size_t row = 0; row < nrow; row++) {
+    auto row_dists = h_dists.data_handle() + row * node_degree;
+    auto row_graph = h_graph + row * node_degree;
+
+    // Gather (distance, id) pairs so that one sort reorders both arrays consistently.
+    std::vector<entry_t> entries(node_degree);
+    for (size_t col = 0; col < node_degree; col++) {
+      entries[col] = entry_t(row_dists[col], row_graph[col].id());
+    }
+    std::sort(entries.begin(), entries.end());
+
+    for (size_t col = 0; col < node_degree; col++) {
+      row_dists[col]                = entries[col].first;
+      row_graph[col].id_with_flag() = entries[col].second;
+    }
+  }
+}
+
+// Resets the bloom filter that sample_graph_new() uses to recognise (row, neighbor) pairs it has
+// already sampled. No other member is touched.
+template <typename Index_t>
+void GnndGraph<Index_t>::clear()
+{
+  bloom_filter.clear();
+}
+
+// The graph buffer behind `h_graph` is never freed here; the destructor only checks (in builds
+// with assertions enabled) that the pointer has been reset to nullptr before destruction.
+template <typename Index_t>
+GnndGraph<Index_t>::~GnndGraph()
+{
+  assert(h_graph == nullptr);
+}
+
+// Allocates all device and host buffers for a dataset of up to `build_config.max_dataset_size`
+// rows with `build_config.dataset_dim` columns, then initialises the device-side state:
+// distances to the maximum float, graph entries to the maximum Index_t and locks to zero.
+// Node degrees are rounded up with `align32::roundUp`; when `internal_node_degree` is zero the
+// node degree is used in its place.
+template <typename Data_t, typename Index_t>
+GNND<Data_t, Index_t>::GNND(raft::resources const& res, const BuildConfig& build_config)
+  : res(res),
+    build_config_(build_config),
+    graph_(build_config.max_dataset_size,
+           align32::roundUp(build_config.node_degree),
+           align32::roundUp(build_config.internal_node_degree ? build_config.internal_node_degree
+                                                              : build_config.node_degree),
+           NUM_SAMPLES),
+    nrow_(build_config.max_dataset_size),
+    ndim_(build_config.dataset_dim),
+    d_data_{raft::make_device_matrix<__half, Index_t, raft::row_major>(
+      res, nrow_, build_config.dataset_dim)},
+    l2_norms_{raft::make_device_vector<DistData_t, Index_t>(res, nrow_)},
+    graph_buffer_{
+      raft::make_device_matrix<ID_t, Index_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
+    dists_buffer_{
+      raft::make_device_matrix<DistData_t, Index_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
+    graph_host_buffer_{static_cast<size_t>(nrow_ * DEGREE_ON_DEVICE)},
+    dists_host_buffer_{static_cast<size_t>(nrow_ * DEGREE_ON_DEVICE)},
+    d_locks_{raft::make_device_vector<int, Index_t>(res, nrow_)},
+    h_rev_graph_new_{static_cast<size_t>(nrow_ * NUM_SAMPLES)},
+    h_graph_old_{static_cast<size_t>(nrow_ * NUM_SAMPLES)},
+    h_rev_graph_old_{static_cast<size_t>(nrow_ * NUM_SAMPLES)},
+    d_list_sizes_new_{raft::make_device_vector<int2, Index_t>(res, nrow_)},
+    d_list_sizes_old_{raft::make_device_vector<int2, Index_t>(res, nrow_)}
+{
+  static_assert(NUM_SAMPLES <= 32);
+
+  // Every distance starts out as "infinitely far".
+  auto dists_begin = dists_buffer_.data_handle();
+  thrust::fill(thrust::device,
+               dists_begin,
+               dists_begin + dists_buffer_.size(),
+               std::numeric_limits<float>::max());
+
+  // The graph entries are filled through an Index_t view of the buffer.
+  auto graph_begin = reinterpret_cast<Index_t*>(graph_buffer_.data_handle());
+  thrust::fill(thrust::device,
+               graph_begin,
+               graph_begin + graph_buffer_.size(),
+               std::numeric_limits<Index_t>::max());
+
+  // All locks start at zero.
+  auto locks_begin = d_locks_.data_handle();
+  thrust::fill(thrust::device, locks_begin, locks_begin + d_locks_.size(), 0);
+}
+
+// Launches `add_rev_edges_kernel` (one warp-sized block per row) on `stream` with `graph_ptr`,
+// `d_rev_graph_ptr` and `list_sizes`, then enqueues a copy of `nrow_ * NUM_SAMPLES` entries from
+// `d_rev_graph_ptr` to `h_rev_graph_ptr`.
+//
+// The copy is issued on the same `stream` as the kernel, so it is ordered after the kernel and
+// before any later work on that stream that reuses `d_rev_graph_ptr`. No synchronisation is
+// performed here; the caller must sync the stream before reading `h_rev_graph_ptr` on the host.
+template <typename Data_t, typename Index_t>
+void GNND<Data_t, Index_t>::add_reverse_edges(Index_t* graph_ptr,
+                                              Index_t* h_rev_graph_ptr,
+                                              Index_t* d_rev_graph_ptr,
+                                              int2* list_sizes,
+                                              cudaStream_t stream)
+{
+  add_rev_edges_kernel<<<nrow_, raft::warp_size(), 0, stream>>>(
+    graph_ptr, d_rev_graph_ptr, NUM_SAMPLES, list_sizes);
+  // Use the caller's stream rather than the resource stream: the two are only guaranteed to be
+  // ordered with respect to each other when they happen to be the same stream.
+  raft::copy(h_rev_graph_ptr, d_rev_graph_ptr, nrow_ * NUM_SAMPLES, stream);
+}
+
+// Resets the device distance buffer to the maximum float and launches `local_join_kernel` with
+// one block of BLOCK_SIZE threads per row. Both operations are enqueued on `stream`.
+template <typename Data_t, typename Index_t>
+void GNND<Data_t, Index_t>::local_join(cudaStream_t stream)
+{
+  auto dists_begin = dists_buffer_.data_handle();
+  thrust::fill(thrust::device.on(stream),
+               dists_begin,
+               dists_begin + dists_buffer_.size(),
+               std::numeric_limits<float>::max());
+
+  // Sampled forward / reverse lists for the new and the old neighbors.
+  auto graph_new     = thrust::raw_pointer_cast(graph_.h_graph_new.data());
+  auto rev_graph_new = thrust::raw_pointer_cast(h_rev_graph_new_.data());
+  auto graph_old     = thrust::raw_pointer_cast(h_graph_old_.data());
+  auto rev_graph_old = thrust::raw_pointer_cast(h_rev_graph_old_.data());
+
+  local_join_kernel<<<nrow_, BLOCK_SIZE, 0, stream>>>(graph_new,
+                                                      rev_graph_new,
+                                                      d_list_sizes_new_.data_handle(),
+                                                      graph_old,
+                                                      rev_graph_old,
+                                                      d_list_sizes_old_.data_handle(),
+                                                      NUM_SAMPLES,
+                                                      d_data_.data_handle(),
+                                                      ndim_,
+                                                      graph_buffer_.data_handle(),
+                                                      dists_buffer_.data_handle(),
+                                                      DEGREE_ON_DEVICE,
+                                                      d_locks_.data_handle(),
+                                                      l2_norms_.data_handle());
+}
+
+// Runs the GNND iterations on `data` (`nrow` rows of `build_config_.dataset_dim` values) and
+// writes the resulting neighbor ids to `output_graph`.
+//
+// `output_graph` is used as the working graph during the build (row stride `graph_.node_degree`)
+// and is compacted at the end to a row stride of `build_config_.node_degree`.
+//
+// Per iteration, the host thread `update_and_sample_thread` merges the results of the previous
+// iteration into the host graph and re-samples the old lists while the GPU computes reverse edges
+// and the local join for the current samples.
+//
+// Throws (via THROW) when the device code was compiled for an architecture below SM 70 and at
+// least one iteration is requested.
+template <typename Data_t, typename Index_t>
+void GNND<Data_t, Index_t>::build(Data_t* data, const Index_t nrow, Index_t* output_graph)
+{
+  using input_t = typename std::remove_const<Data_t>::type;
+
+  // Tensor operations from `mma.h` are guarded with architecture
+  // __CUDA_ARCH__ >= 700. Since RAFT supports compilation for ARCH 600,
+  // we need to ensure that `local_join_kernel` (which uses tensor operations)
+  // is not only not compiled, but also that a runtime error is presented to the user.
+  //
+  // The check is done once, before any state is modified and before any worker thread exists:
+  // throwing while a joinable std::thread is alive would end in std::terminate instead of an
+  // exception reaching the caller. It is only relevant when the kernel would actually be launched.
+  if (build_config_.max_iterations > 0) {
+    auto kernel       = preprocess_data_kernel<input_t>;
+    void* kernel_ptr  = reinterpret_cast<void*>(kernel);
+    auto runtime_arch = raft::util::arch::kernel_virtual_arch(kernel_ptr);
+    auto wmma_range =
+      raft::util::arch::SM_range(raft::util::arch::SM_70(), raft::util::arch::SM_future());
+
+    if (!wmma_range.contains(runtime_arch)) {
+      THROW("NN_DESCENT cannot be run for __CUDA_ARCH__ < 700");
+    }
+  }
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(res);
+  nrow_               = nrow;
+  graph_.h_graph      = (InternalID_t<Index_t>*)output_graph;
+
+  // Data that is not device-accessible is processed in batches of 100000 rows; otherwise a
+  // single batch covers the whole dataset.
+  cudaPointerAttributes data_ptr_attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&data_ptr_attr, data));
+  size_t batch_size = (data_ptr_attr.devicePointer == nullptr) ? 100000 : nrow_;
+
+  raft::spatial::knn::detail::utils::batch_load_iterator vec_batches{
+    data, static_cast<size_t>(nrow_), build_config_.dataset_dim, batch_size, stream};
+  for (auto const& batch : vec_batches) {
+    preprocess_data_kernel<<<
+      batch.size(),
+      raft::warp_size(),
+      sizeof(Data_t) * ceildiv(build_config_.dataset_dim, static_cast<size_t>(raft::warp_size())) *
+        raft::warp_size(),
+      stream>>>(batch.data(),
+                d_data_.data_handle(),
+                build_config_.dataset_dim,
+                l2_norms_.data_handle(),
+                batch.offset());
+  }
+
+  thrust::fill(thrust::device.on(stream),
+               (Index_t*)graph_buffer_.data_handle(),
+               (Index_t*)graph_buffer_.data_handle() + graph_buffer_.size(),
+               std::numeric_limits<Index_t>::max());
+
+  graph_.clear();
+  graph_.init_random_graph();
+  graph_.sample_graph(true);
+
+  // Host-side half of an iteration. It merges the previous GPU results into the host graph
+  // (skipped in the first iteration, when there are none) and re-samples the old lists.
+  // `update_counter_` is set to -1 to signal convergence.
+  auto update_and_sample = [&](bool update_graph) {
+    if (update_graph) {
+      update_counter_ = 0;
+      graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()),
+                          thrust::raw_pointer_cast(dists_host_buffer_.data()),
+                          DEGREE_ON_DEVICE,
+                          update_counter_);
+      // NOTE(review): the threshold scales with dataset_dim rather than with the node degree -
+      // confirm this is intended.
+      if (update_counter_ < build_config_.termination_threshold * nrow_ *
+                              build_config_.dataset_dim / counter_interval) {
+        update_counter_ = -1;
+      }
+    }
+    graph_.sample_graph(false);
+  };
+
+  for (size_t it = 0; it < build_config_.max_iterations; it++) {
+    raft::copy(d_list_sizes_new_.data_handle(),
+               thrust::raw_pointer_cast(graph_.h_list_sizes_new.data()),
+               nrow_,
+               raft::resource::get_cuda_stream(res));
+    raft::copy(thrust::raw_pointer_cast(h_graph_old_.data()),
+               thrust::raw_pointer_cast(graph_.h_graph_old.data()),
+               nrow_ * NUM_SAMPLES,
+               raft::resource::get_cuda_stream(res));
+    raft::copy(d_list_sizes_old_.data_handle(),
+               thrust::raw_pointer_cast(graph_.h_list_sizes_old.data()),
+               nrow_,
+               raft::resource::get_cuda_stream(res));
+    // Also completes the distance copy enqueued at the end of the previous iteration, which the
+    // worker thread is about to read.
+    raft::resource::sync_stream(res);
+
+    // `it` converts to bool: false in the first iteration, true afterwards.
+    std::thread update_and_sample_thread(update_and_sample, it);
+
+    std::cout << "# GNND iteration: " << it + 1 << "/" << build_config_.max_iterations << "\r";
+    std::fflush(stdout);
+
+    // Reuse dists_buffer_ to save GPU memory. graph_buffer_ cannot be reused, because it
+    // contains some information for local_join.
+    static_assert(DEGREE_ON_DEVICE * sizeof(*(dists_buffer_.data_handle())) >=
+                  NUM_SAMPLES * sizeof(*(graph_buffer_.data_handle())));
+    add_reverse_edges(thrust::raw_pointer_cast(graph_.h_graph_new.data()),
+                      thrust::raw_pointer_cast(h_rev_graph_new_.data()),
+                      (Index_t*)dists_buffer_.data_handle(),
+                      d_list_sizes_new_.data_handle(),
+                      stream);
+    add_reverse_edges(thrust::raw_pointer_cast(h_graph_old_.data()),
+                      thrust::raw_pointer_cast(h_rev_graph_old_.data()),
+                      (Index_t*)dists_buffer_.data_handle(),
+                      d_list_sizes_old_.data_handle(),
+                      stream);
+
+    // The architecture requirement of the local join was verified at the top of this function.
+    local_join(stream);
+
+    update_and_sample_thread.join();
+
+    if (update_counter_ == -1) { break; }
+    raft::copy(thrust::raw_pointer_cast(graph_host_buffer_.data()),
+               graph_buffer_.data_handle(),
+               nrow_ * DEGREE_ON_DEVICE,
+               raft::resource::get_cuda_stream(res));
+    raft::resource::sync_stream(res);
+    // The distances are not needed by sample_graph_new(), so this copy is left in flight; it is
+    // completed by the next stream synchronisation.
+    raft::copy(thrust::raw_pointer_cast(dists_host_buffer_.data()),
+               dists_buffer_.data_handle(),
+               nrow_ * DEGREE_ON_DEVICE,
+               raft::resource::get_cuda_stream(res));
+
+    graph_.sample_graph_new(thrust::raw_pointer_cast(graph_host_buffer_.data()), DEGREE_ON_DEVICE);
+  }
+
+  // The last distance copy may still be in flight when the loop ends after max_iterations;
+  // wait for it before the host reads dists_host_buffer_.
+  raft::resource::sync_stream(res);
+  graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()),
+                      thrust::raw_pointer_cast(dists_host_buffer_.data()),
+                      DEGREE_ON_DEVICE,
+                      update_counter_);
+  graph_.sort_lists();
+
+  // Reuse graph_.h_dists as the buffer for shrinking the lists in graph
+  static_assert(sizeof(decltype(*(graph_.h_dists.data_handle()))) >= sizeof(Index_t));
+  Index_t* graph_shrink_buffer = (Index_t*)graph_.h_dists.data_handle();
+
+  // Keep the first build_config_.node_degree entries of every row; entries whose id is not
+  // below nrow_ are replaced by a pseudo-random row index.
+#pragma omp parallel for
+  for (size_t i = 0; i < (size_t)nrow_; i++) {
+    for (size_t j = 0; j < build_config_.node_degree; j++) {
+      size_t idx = i * graph_.node_degree + j;
+      Index_t id = graph_.h_graph[idx].id();
+      if (id < nrow_) {
+        graph_shrink_buffer[i * build_config_.node_degree + j] = id;
+      } else {
+        graph_shrink_buffer[i * build_config_.node_degree + j] =
+          raft::neighbors::cagra::detail::device::xorshift64(idx) % nrow_;
+      }
+    }
+  }
+  // Detach the caller-owned buffer; ~GnndGraph asserts that this has happened.
+  graph_.h_graph = nullptr;
+
+#pragma omp parallel for
+  for (size_t i = 0; i < (size_t)nrow_; i++) {
+    for (size_t j = 0; j < build_config_.node_degree; j++) {
+      output_graph[i * build_config_.node_degree + j] =
+        graph_shrink_buffer[i * build_config_.node_degree + j];
+    }
+  }
+}
+
+// Builds an NN-descent k-NN graph for `dataset` and stores the first `graph_degree` neighbors of
+// every row in `idx.graph()`.
+//
+// - params.intermediate_graph_degree is reduced to `dataset.extent(0) - 1` when it is not smaller
+//   than the number of rows.
+// - params.graph_degree is reduced to the (possibly reduced) intermediate degree when larger.
+// - Degrees above 32 are inflated by a factor of 1.3 and all degrees are rounded up with
+//   `align32::roundUp` before being handed to GNND.
+//
+// NOTE(review): rows of `idx.graph()` are written with a stride of the locally computed
+// `graph_degree`; this assumes `idx` was created with that same degree - confirm for callers
+// whose degree gets reduced here.
+template <typename T,
+          typename IdxT = uint32_t,
+          typename Accessor =
+            host_device_accessor<std::experimental::default_accessor<T>, memory_type::host>>
+void build(raft::resources const& res,
+           const index_params& params,
+           mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset,
+           index<IdxT>& idx)
+{
+  RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits<int>::max() - 1,
+               "The dataset size for GNND should be less than %d",
+               std::numeric_limits<int>::max() - 1);
+  size_t intermediate_degree = params.intermediate_graph_degree;
+  size_t graph_degree        = params.graph_degree;
+
+  if (intermediate_degree >= static_cast<size_t>(dataset.extent(0))) {
+    // Clamp first so that the warning reports the value that is actually used.
+    intermediate_degree = dataset.extent(0) - 1;
+    RAFT_LOG_WARN(
+      "Intermediate graph degree cannot be larger than dataset size, reducing it to %lu",
+      intermediate_degree);
+  }
+  if (intermediate_degree < graph_degree) {
+    RAFT_LOG_WARN(
+      "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing "
+      "graph_degree.",
+      graph_degree,
+      intermediate_degree);
+    graph_degree = intermediate_degree;
+  }
+
+  // The elements in each knn-list are partitioned into different buckets, and we need more buckets
+  // to mitigate bucket collisions. It is OK for `intermediate_degree` to be larger than
+  // extended_graph_degree.
+  size_t extended_graph_degree =
+    align32::roundUp(static_cast<size_t>(graph_degree * (graph_degree <= 32 ? 1.0 : 1.3)));
+  size_t extended_intermediate_degree = align32::roundUp(
+    static_cast<size_t>(intermediate_degree * (intermediate_degree <= 32 ? 1.0 : 1.3)));
+
+  auto int_graph = raft::make_host_matrix<int, int64_t, row_major>(
+    dataset.extent(0), static_cast<int64_t>(extended_graph_degree));
+
+  BuildConfig build_config{.max_dataset_size      = static_cast<size_t>(dataset.extent(0)),
+                           .dataset_dim           = static_cast<size_t>(dataset.extent(1)),
+                           .node_degree           = extended_graph_degree,
+                           .internal_node_degree  = extended_intermediate_degree,
+                           .max_iterations        = params.max_iterations,
+                           .termination_threshold = params.termination_threshold};
+
+  GNND<const T, int> nnd(res, build_config);
+  nnd.build(dataset.data_handle(), dataset.extent(0), int_graph.data_handle());
+
+  // Copy the leading `graph_degree` columns of every row into the index. The pointers are
+  // loop-invariant, so they are looked up once.
+  auto dst_graph = idx.graph().data_handle();
+  auto src_graph = int_graph.data_handle();
+
+#pragma omp parallel for
+  for (size_t i = 0; i < static_cast<size_t>(dataset.extent(0)); i++) {
+    for (size_t j = 0; j < graph_degree; j++) {
+      dst_graph[i * graph_degree + j] = src_graph[i * extended_graph_degree + j];
+    }
+  }
+}
+
+// Convenience overload: creates an index sized for `dataset` and the requested graph degree,
+// fills it with the overload above and returns it. A graph degree larger than the intermediate
+// graph degree is reduced to the intermediate degree (with a warning) before the index is
+// allocated.
+template <typename T,
+          typename IdxT = uint32_t,
+          typename Accessor =
+            host_device_accessor<std::experimental::default_accessor<T>, memory_type::host>>
+index<IdxT> build(raft::resources const& res,
+                  const index_params& params,
+                  mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset)
+{
+  const size_t intermediate_degree = params.intermediate_graph_degree;
+  size_t graph_degree              = params.graph_degree;
+
+  if (graph_degree > intermediate_degree) {
+    RAFT_LOG_WARN(
+      "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing "
+      "graph_degree.",
+      graph_degree,
+      intermediate_degree);
+    graph_degree = intermediate_degree;
+  }
+
+  // One row per dataset row, `graph_degree` neighbors per row.
+  index<IdxT> idx{res, dataset.extent(0), static_cast<int64_t>(graph_degree)};
+  build(res, params, dataset, idx);
+  return idx;
+}
+
+}  // namespace raft::neighbors::experimental::nn_descent::detail
diff --git a/cpp/include/raft/neighbors/detail/refine_host-ext.hpp b/cpp/include/raft/neighbors/detail/refine_host-ext.hpp
index 3ce2dc3eb5..2a863b47b3 100644
--- a/cpp/include/raft/neighbors/detail/refine_host-ext.hpp
+++ b/cpp/include/raft/neighbors/detail/refine_host-ext.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cstdint>                           // int64_t
+#include <cstdint>  // int64_t
 
 #include <raft/core/host_mdspan.hpp>         // raft::host_matrix_view
 #include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
diff --git a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
index c000a4810b..a6ed17e251 100644
--- a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
@@ -36,7 +36,7 @@ void select_k(const key_t* inK,
               bool select_min,
               int k,
               cudaStream_t stream) RAFT_EXPLICIT;
-};      // namespace raft::neighbors::detail
+};  // namespace raft::neighbors::detail
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
diff --git a/cpp/include/raft/neighbors/ivf_flat-ext.cuh b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
index 848703c9b5..8dbe7587ff 100644
--- a/cpp/include/raft/neighbors/ivf_flat-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
@@ -16,10 +16,10 @@
 
 #pragma once
 
-#include <cstdint>                                // int64_t
+#include <cstdint>  // int64_t
 
-#include <raft/core/device_mdspan.hpp>            // raft::device_matrix_view
-#include <raft/core/resources.hpp>                // raft::resources
+#include <raft/core/device_mdspan.hpp>  // raft::device_matrix_view
+#include <raft/core/resources.hpp>      // raft::resources
 #include <raft/neighbors/ivf_flat_serialize.cuh>
 #include <raft/neighbors/ivf_flat_types.hpp>      // raft::neighbors::ivf_flat::index
 #include <raft/util/raft_explicit.hpp>            // RAFT_EXPLICIT
diff --git a/cpp/include/raft/neighbors/ivf_flat-inl.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
index a18ee065bf..6641346a67 100644
--- a/cpp/include/raft/neighbors/ivf_flat-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
@@ -342,7 +342,7 @@ void extend(raft::resources const& handle,
 /** @} */
 
 /**
- * @brief Search ANN using the constructed index.
+ * @brief Search ANN using the constructed index with the given filter.
  *
  * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
  *
@@ -374,6 +374,8 @@ void extend(raft::resources const& handle,
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices
+ * @tparam IvfSampleFilterT Device filter function, with the signature
+ *         `(uint32_t query_ix, uint32_t cluster_ix, uint32_t sample_ix) -> bool`
  *
  * @param[in] handle
  * @param[in] params configure the search
@@ -386,7 +388,7 @@ void extend(raft::resources const& handle,
  * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
  * @param[in] mr an optional memory resource to use across the searches (you can provide a large
  * enough memory pool here to avoid memory allocations within search).
- * @param[in] sample_filter a filter the greenlights samples for a given query
+ * @param[in] sample_filter a device filter function that greenlights samples for a given query
  */
 template <typename T, typename IdxT, typename IvfSampleFilterT>
 void search_with_filtering(raft::resources const& handle,
@@ -475,7 +477,7 @@ void search(raft::resources const& handle,
  */
 
 /**
- * @brief Search ANN using the constructed index using the given filter.
+ * @brief Search ANN using the constructed index with the given filter.
  *
  * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
  *
@@ -501,6 +503,8 @@ void search(raft::resources const& handle,
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices
+ * @tparam IvfSampleFilterT Device filter function, with the signature
+ *         `(uint32_t query_ix, uint32_t cluster_ix, uint32_t sample_ix) -> bool`
  *
  * @param[in] handle
  * @param[in] params configure the search
@@ -509,7 +513,7 @@ void search(raft::resources const& handle,
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
  * [n_queries, k]
  * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param[in] sample_filter a filter the greenlights samples for a given query
+ * @param[in] sample_filter a device filter function that greenlights samples for a given query
  */
 template <typename T, typename IdxT, typename IvfSampleFilterT>
 void search_with_filtering(raft::resources const& handle,
diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
index fcfe837e2d..4a60cfc09d 100644
--- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cstdint>                                // int64_t
+#include <cstdint>  // int64_t
 
 #include <raft/core/device_mdspan.hpp>            // raft::device_matrix_view
 #include <raft/core/resources.hpp>                // raft::resources
diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
index ccf8717486..9f203d92fb 100644
--- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
@@ -134,7 +134,7 @@ void extend(raft::resources const& handle,
 }
 
 /**
- * @brief Search ANN using the constructed index using the given filter.
+ * @brief Search ANN using the constructed index with the given filter.
  *
  * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example.
  *
@@ -148,6 +148,8 @@ void extend(raft::resources const& handle,
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices
+ * @tparam IvfSampleFilterT Device filter function, with the signature
+ *         `(uint32_t query_ix, uint32_t cluster_ix, uint32_t sample_ix) -> bool`
  *
  * @param[in] handle
  * @param[in] params configure the search
@@ -157,7 +159,7 @@ void extend(raft::resources const& handle,
  * [n_queries, k]
  * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries,
  * k]
- * @param[in] sample_filter a filter the greenlights samples for a given query.
+ * @param[in] sample_filter a device filter function that greenlights samples for a given query.
  */
 template <typename T, typename IdxT, typename IvfSampleFilterT>
 void search_with_filtering(raft::resources const& handle,
@@ -343,7 +345,7 @@ void extend(raft::resources const& handle,
 }
 
 /**
- * @brief Search ANN using the constructed index using the given filter.
+ * @brief Search ANN using the constructed index with the given filter.
  *
  * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example.
  *
@@ -372,6 +374,8 @@ void extend(raft::resources const& handle,
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices
+ * @tparam IvfSampleFilterT Device filter function, with the signature
+ *         `(uint32_t query_ix, uint32_t cluster_ix, uint32_t sample_ix) -> bool`
  *
  * @param[in] handle
  * @param[in] params configure the search
@@ -382,7 +386,7 @@ void extend(raft::resources const& handle,
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
  * [n_queries, k]
  * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param[in] sample_filter a filter the greenlights samples for a given query
+ * @param[in] sample_filter a device filter function that greenlights samples for a given query
  */
 template <typename T, typename IdxT, typename IvfSampleFilterT>
 void search_with_filtering(raft::resources const& handle,
diff --git a/cpp/include/raft/neighbors/nn_descent.cuh b/cpp/include/raft/neighbors/nn_descent.cuh
new file mode 100644
index 0000000000..ceb5ae5643
--- /dev/null
+++ b/cpp/include/raft/neighbors/nn_descent.cuh
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/nn_descent.cuh"
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/host_mdspan.hpp>
+
+namespace raft::neighbors::experimental::nn_descent {
+
+/**
+ * @defgroup nn-descent CUDA gradient descent nearest neighbor
+ * @{
+ */
+
+/**
+ * @brief Build nn-descent Index with dataset in device memory
+ *
+ * The following distance metrics are supported:
+ * - L2
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors::experimental;
+ *   // use default index parameters
+ *   nn_descent::index_params index_params;
+ *   // create and fill the index from a [N, D] raft::device_matrix_view dataset
+ *   auto index = nn_descent::build(res, index_params, dataset);
+ *   // index.graph() provides a raft::host_matrix_view of an
+ *   // all-neighbors knn graph of dimensions [N, k] of the input
+ *   // dataset
+ * @endcode
+ *
+ * @tparam T data-type of the input dataset
+ * @tparam IdxT data-type for the output index
+ * @param[in] res raft::resources is an object managing resources
+ * @param[in] params an instance of nn_descent::index_params that are parameters
+ *               to run the nn-descent algorithm
+ * @param[in] dataset raft::device_matrix_view input dataset expected to be located
+ *                in device memory
+ * @return index<IdxT> index containing all-neighbors knn graph in host memory
+ */
+template <typename T, typename IdxT = uint32_t>
+index<IdxT> build(raft::resources const& res,
+                  index_params const& params,
+                  raft::device_matrix_view<const T, int64_t, row_major> dataset)
+{
+  return detail::build<T, IdxT>(res, params, dataset);
+}
+
+/**
+ * @brief Build nn-descent Index with dataset in device memory
+ *
+ * The following distance metrics are supported:
+ * - L2
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors::experimental;
+ *   // use default index parameters
+ *   nn_descent::index_params index_params;
+ *   // fill a user-allocated [N, k] knn graph from a [N, D] raft::device_matrix_view dataset
+ *   auto knn_graph = raft::make_host_matrix<uint32_t, int64_t>(N, k);
+ *   auto index = nn_descent::index{res, knn_graph.view()};
+ *   nn_descent::build(res, index_params, dataset, index);
+ *   // index.graph() provides a raft::host_matrix_view of an
+ *   // all-neighbors knn graph of dimensions [N, k] of the input
+ *   // dataset
+ * @endcode
+ *
+ * @tparam T data-type of the input dataset
+ * @tparam IdxT data-type for the output index
+ * @param[in] res raft::resources is an object managing resources
+ * @param[in] params an instance of nn_descent::index_params that are parameters
+ *               to run the nn-descent algorithm
+ * @param[in] dataset raft::device_matrix_view input dataset expected to be located
+ *                in device memory
+ * @param[out] idx raft::neighbors::experimental::nn_descent::index containing all-neighbors knn
+ * graph in host memory
+ */
+template <typename T, typename IdxT = uint32_t>
+void build(raft::resources const& res,
+           index_params const& params,
+           raft::device_matrix_view<const T, int64_t, row_major> dataset,
+           index<IdxT>& idx)
+{
+  detail::build<T, IdxT>(res, params, dataset, idx);
+}
+
+/**
+ * @brief Build nn-descent Index with dataset in host memory
+ *
+ * The following distance metrics are supported:
+ * - L2
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors::experimental;
+ *   // use default index parameters
+ *   nn_descent::index_params index_params;
+ *   // create and fill the index from a [N, D] raft::host_matrix_view dataset
+ *   auto index = nn_descent::build(res, index_params, dataset);
+ *   // index.graph() provides a raft::host_matrix_view of an
+ *   // all-neighbors knn graph of dimensions [N, k] of the input
+ *   // dataset
+ * @endcode
+ *
+ * @tparam T data-type of the input dataset
+ * @tparam IdxT data-type for the output index
+ * @param[in] res raft::resources is an object managing resources
+ * @param[in] params an instance of nn_descent::index_params that are parameters
+ *               to run the nn-descent algorithm
+ * @param[in] dataset raft::host_matrix_view input dataset expected to be located
+ *                in host memory
+ * @return index<IdxT> index containing all-neighbors knn graph in host memory
+ */
+template <typename T, typename IdxT = uint32_t>
+index<IdxT> build(raft::resources const& res,
+                  index_params const& params,
+                  raft::host_matrix_view<const T, int64_t, row_major> dataset)
+{
+  return detail::build<T, IdxT>(res, params, dataset);
+}
+
+/**
+ * @brief Build nn-descent Index with dataset in host memory
+ *
+ * The following distance metrics are supported:
+ * - L2
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors::experimental;
+ *   // use default index parameters
+ *   nn_descent::index_params index_params;
+ *   // fill a user-allocated [N, k] knn graph from a [N, D] raft::host_matrix_view dataset
+ *   auto knn_graph = raft::make_host_matrix<uint32_t, int64_t>(N, k);
+ *   auto index = nn_descent::index{res, knn_graph.view()};
+ *   nn_descent::build(res, index_params, dataset, index);
+ *   // index.graph() provides a raft::host_matrix_view of an
+ *   // all-neighbors knn graph of dimensions [N, k] of the input
+ *   // dataset
+ * @endcode
+ *
+ * @tparam T data-type of the input dataset
+ * @tparam IdxT data-type for the output index
+ * @param[in] res raft::resources is an object managing resources
+ * @param[in] params an instance of nn_descent::index_params that are parameters
+ *               to run the nn-descent algorithm
+ * @param[in] dataset raft::host_matrix_view input dataset expected to be located
+ *                in host memory
+ * @param[out] idx raft::neighbors::experimental::nn_descent::index containing all-neighbors knn
+ * graph in host memory
+ */
+template <typename T, typename IdxT = uint32_t>
+void build(raft::resources const& res,
+           index_params const& params,
+           raft::host_matrix_view<const T, int64_t, row_major> dataset,
+           index<IdxT>& idx)
+{
+  detail::build<T, IdxT>(res, params, dataset, idx);
+}
+
+/** @} */  // end group nn-descent
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/include/raft/neighbors/nn_descent_types.hpp b/cpp/include/raft/neighbors/nn_descent_types.hpp
new file mode 100644
index 0000000000..64e464c618
--- /dev/null
+++ b/cpp/include/raft/neighbors/nn_descent_types.hpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "ann_types.hpp"
+#include <raft/core/resource/cuda_stream.hpp>
+
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/core/mdspan_types.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
+
+namespace raft::neighbors::experimental::nn_descent {
+/**
+ * @ingroup nn-descent
+ * @{
+ */
+
+/**
+ * @brief Parameters used to build an nn-descent index
+ *
+ * `graph_degree`: For an input dataset of dimensions (N, D),
+ * determines the final dimensions of the all-neighbors knn graph
+ * which turns out to be of dimensions (N, graph_degree)
+ * `intermediate_graph_degree`: Internally, nn-descent builds an
+ * all-neighbors knn graph of dimensions (N, intermediate_graph_degree)
+ * before selecting the final `graph_degree` neighbors. It's recommended
+ * that `intermediate_graph_degree` >= 1.5 * graph_degree
+ * `max_iterations`: The number of iterations that nn-descent will refine
+ * the graph for. More iterations produce a better quality graph at cost of performance
+ * `termination_threshold`: The delta at which nn-descent will terminate its iterations
+ *
+ */
+struct index_params : ann::index_params {
+  size_t graph_degree              = 64;      // Degree of the output knn graph.
+  size_t intermediate_graph_degree = 128;     // Degree of the internal graph built first.
+  size_t max_iterations            = 20;      // Number of nn-descent iterations.
+  float termination_threshold      = 0.0001;  // Delta at which nn-descent stops iterating.
+};
+
+/**
+ * @brief nn-descent index
+ * The index contains an all-neighbors graph of the input dataset
+ * stored in host memory of dimensions (n_rows, n_cols)
+ *
+ * @tparam IdxT dtype to be used for constructing knn-graph
+ */
+template <typename IdxT>
+struct index : ann::index {
+ public:
+  /**
+   * @brief Construct a new index object
+   *
+   * This constructor creates an nn-descent index which is a knn-graph in host memory.
+   * The type of the knn-graph is a dense raft::host_matrix and dimensions are
+   * (n_rows, n_cols).
+   *
+   * @param res raft::resources is an object managing resources
+   * @param n_rows number of rows in knn-graph
+   * @param n_cols number of cols in knn-graph
+   */
+  index(raft::resources const& res, int64_t n_rows, int64_t n_cols)
+    : ann::index(),
+      res_{res},
+      metric_{raft::distance::DistanceType::L2Expanded},
+      graph_{raft::make_host_matrix<IdxT, int64_t, row_major>(n_rows, n_cols)},
+      graph_view_{graph_.view()}
+  {
+  }
+
+  /**
+   * @brief Construct a new index object
+   *
+   * This constructor creates an nn-descent index using a user allocated host memory knn-graph.
+   * The knn-graph is referenced through the given raft::host_matrix_view, not copied, so the
+   * memory behind the view must stay valid for as long as the index is used.
+   *
+   * @param res raft::resources is an object managing resources
+   * @param graph_view raft::host_matrix_view<IdxT, int64_t, raft::row_major> for storing knn-graph
+   */
+  index(raft::resources const& res,
+        raft::host_matrix_view<IdxT, int64_t, raft::row_major> graph_view)
+    : ann::index(),
+      res_{res},
+      metric_{raft::distance::DistanceType::L2Expanded},
+      graph_{raft::make_host_matrix<IdxT, int64_t, row_major>(0, 0)},
+      graph_view_{graph_view}
+  {
+  }
+
+  /** Distance metric of the index (both constructors set L2Expanded). */
+  [[nodiscard]] constexpr inline auto metric() const noexcept -> raft::distance::DistanceType
+  {
+    return metric_;
+  }
+
+  /** Total length of the index (number of rows of the knn-graph). */
+  [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT
+  {
+    return graph_view_.extent(0);
+  }
+
+  /** Graph degree (number of columns of the knn-graph) */
+  [[nodiscard]] constexpr inline auto graph_degree() const noexcept -> uint32_t
+  {
+    return graph_view_.extent(1);
+  }
+
+  /** neighborhood graph [size, graph-degree] */
+  [[nodiscard]] inline auto graph() noexcept -> host_matrix_view<IdxT, int64_t, row_major>
+  {
+    return graph_view_;
+  }
+
+  // Don't allow copying the index for performance reasons (try avoiding copying data)
+  index(const index&)                    = delete;
+  index(index&&)                         = default;
+  auto operator=(const index&) -> index& = delete;
+  auto operator=(index&&) -> index&      = default;
+  ~index()                               = default;
+
+ private:
+  raft::resources const& res_;  // non-owning; NOTE(review): a reference member deletes move-assign
+  raft::distance::DistanceType metric_;
+  raft::host_matrix<IdxT, int64_t, row_major> graph_;  // owned graph storage (0x0 for user views)
+  raft::host_matrix_view<IdxT, int64_t, row_major>
+    graph_view_;  // view returned by graph(): owned storage or user-provided matrix
+};
+
+/** @} */
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/include/raft/neighbors/refine-ext.cuh b/cpp/include/raft/neighbors/refine-ext.cuh
index c1fd4676dc..3ccd3891b7 100644
--- a/cpp/include/raft/neighbors/refine-ext.cuh
+++ b/cpp/include/raft/neighbors/refine-ext.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cstdint>                           // int64_t
+#include <cstdint>  // int64_t
 
 #include <raft/core/device_mdspan.hpp>       // raft::device_matrix_view
 #include <raft/core/host_mdspan.hpp>         // // raft::host_matrix_view
diff --git a/cpp/include/raft/neighbors/sample_filter_types.hpp b/cpp/include/raft/neighbors/sample_filter_types.hpp
index 5a301e9d2f..10c5e99372 100644
--- a/cpp/include/raft/neighbors/sample_filter_types.hpp
+++ b/cpp/include/raft/neighbors/sample_filter_types.hpp
@@ -37,6 +37,18 @@ struct none_ivf_sample_filter {
   }
 };
 
+/* A filter that filters nothing (every sample is accepted). This is the default behavior. */
+struct none_cagra_sample_filter {
+  inline _RAFT_HOST_DEVICE bool operator()(
+    // query index
+    const uint32_t query_ix,
+    // the index of the current sample
+    const uint32_t sample_ix) const
+  {
+    return true;  // both arguments are ignored; every (query, sample) pair passes
+  }
+};
+
 /**
  * If the filtering depends on the index of a sample, then the following
  * filter template can be used:
diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh
index b60940341a..702846f586 100644
--- a/cpp/include/raft/sparse/distance/distance.cuh
+++ b/cpp/include/raft/sparse/distance/distance.cuh
@@ -218,8 +218,8 @@ void pairwise_distance(raft::resources const& handle,
 
 /** @} */  // end of sparse_distance
 
-};         // namespace distance
-};         // namespace sparse
-};         // namespace raft
+};  // namespace distance
+};  // namespace sparse
+};  // namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh
index 3cb4a3e353..56ca2ebfa7 100644
--- a/cpp/include/raft/sparse/linalg/detail/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh
@@ -49,10 +49,10 @@ __global__ void csr_row_normalize_l1_kernel(
   // over each row and then divide the values in parallel.
   const int* ia,  // csr row ex_scan (sorted by row)
   const T* vals,
-  int nnz,        // array of values and number of non-zeros
-  int m,          // num rows in csr
+  int nnz,  // array of values and number of non-zeros
+  int m,    // num rows in csr
   T* result)
-{                 // output array
+{  // output array
 
   // row-based matrix 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
@@ -95,8 +95,8 @@ __global__ void csr_row_normalize_l1_kernel(
 template <int TPB_X = 64, typename T>
 void csr_row_normalize_l1(const int* ia,  // csr row ex_scan (sorted by row)
                           const T* vals,
-                          int nnz,        // array of values and number of non-zeros
-                          int m,          // num rows in csr
+                          int nnz,  // array of values and number of non-zeros
+                          int m,    // num rows in csr
                           T* result,
                           cudaStream_t stream)
 {  // output array
@@ -115,10 +115,10 @@ __global__ void csr_row_normalize_max_kernel(
   // over each row and then divide the values in parallel.
   const int* ia,  // csr row ind array (sorted by row)
   const T* vals,
-  int nnz,        // array of values and number of non-zeros
-  int m,          // num total rows in csr
+  int nnz,  // array of values and number of non-zeros
+  int m,    // num total rows in csr
   T* result)
-{                 // output array
+{  // output array
 
   // row-based matrix 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
@@ -163,8 +163,8 @@ __global__ void csr_row_normalize_max_kernel(
 template <int TPB_X = 64, typename T>
 void csr_row_normalize_max(const int* ia,  // csr row ind array (sorted by row)
                            const T* vals,
-                           int nnz,        // array of values and number of non-zeros
-                           int m,          // num total rows in csr
+                           int nnz,  // array of values and number of non-zeros
+                           int m,    // num total rows in csr
                            T* result,
                            cudaStream_t stream)
 {
diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
index 545f218e63..51836ca9aa 100644
--- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -69,7 +69,7 @@ void fit_embedding(raft::resources const& handle,
     handle, ro, ci, vs, n, nnz};
 
   index_type neigvs       = n_components + 1;
-  index_type maxiter      = 4000;         // default reset value (when set to 0);
+  index_type maxiter      = 4000;  // default reset value (when set to 0);
   value_type tol          = 0.01;
   index_type restart_iter = 15 + neigvs;  // what cugraph is using
 
diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh
index f7ebf50db0..43dd182fe5 100644
--- a/cpp/include/raft/sparse/linalg/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/norm.cuh
@@ -39,8 +39,8 @@ namespace linalg {
 template <typename T>
 void csr_row_normalize_l1(const int* ia,  // csr row ex_scan (sorted by row)
                           const T* vals,
-                          int nnz,        // array of values and number of non-zeros
-                          int m,          // num rows in csr
+                          int nnz,  // array of values and number of non-zeros
+                          int m,    // num rows in csr
                           T* result,
                           cudaStream_t stream)
 {  // output array
@@ -60,8 +60,8 @@ void csr_row_normalize_l1(const int* ia,  // csr row ex_scan (sorted by row)
 template <typename T>
 void csr_row_normalize_max(const int* ia,  // csr row ind array (sorted by row)
                            const T* vals,
-                           int nnz,        // array of values and number of non-zeros
-                           int m,          // num total rows in csr
+                           int nnz,  // array of values and number of non-zeros
+                           int m,    // num total rows in csr
                            T* result,
                            cudaStream_t stream)
 {
diff --git a/cpp/include/raft/sparse/neighbors/detail/knn.cuh b/cpp/include/raft/sparse/neighbors/detail/knn.cuh
index f2be427367..ff644c000e 100644
--- a/cpp/include/raft/sparse/neighbors/detail/knn.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/knn.cuh
@@ -58,7 +58,7 @@ struct csr_batcher_t {
   void set_batch(int batch_num)
   {
     batch_start_ = batch_num * batch_size_;
-    batch_stop_  = batch_start_ + batch_size_ - 1;                  // zero-based indexing
+    batch_stop_  = batch_start_ + batch_size_ - 1;  // zero-based indexing
 
     if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1;  // zero-based indexing
 
diff --git a/cpp/include/raft/sparse/solver/mst_solver.cuh b/cpp/include/raft/sparse/solver/mst_solver.cuh
index bfedb9ce2a..193431251f 100644
--- a/cpp/include/raft/sparse/solver/mst_solver.cuh
+++ b/cpp/include/raft/sparse/solver/mst_solver.cuh
@@ -78,10 +78,10 @@ class MST_solver {
   rmm::device_uvector<alteration_t> altered_weights;  // weights to be used for mst
   rmm::device_scalar<edge_t> mst_edge_count;  // total number of edges added after every iteration
   rmm::device_scalar<edge_t>
-    prev_mst_edge_count;                      // total number of edges up to the previous iteration
-  rmm::device_uvector<bool> mst_edge;         // mst output -  true if the edge belongs in mst
-  rmm::device_uvector<vertex_t> next_color;   //  next iteration color
-  rmm::device_uvector<vertex_t> color;        // index of color that vertex points to
+    prev_mst_edge_count;                     // total number of edges up to the previous iteration
+  rmm::device_uvector<bool> mst_edge;        // mst output -  true if the edge belongs in mst
+  rmm::device_uvector<vertex_t> next_color;  //  next iteration color
+  rmm::device_uvector<vertex_t> color;       // index of color that vertex points to
 
   // new src-dst pairs found per iteration
   rmm::device_uvector<vertex_t> temp_src;
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
index 95aeca64e5..70c5cec23f 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
@@ -61,7 +61,7 @@ void rbc_low_dim_pass_two(raft::resources const& handle,
                           float weight,
                           value_int* post_dists_counter) RAFT_EXPLICIT;
 
-};      // namespace raft::spatial::knn::detail
+};  // namespace raft::spatial::knn::detail
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
index 390436939f..1a48e1adde 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
@@ -36,7 +36,9 @@ void fusedL2Knn(size_t D,
                 bool rowMajorIndex,
                 bool rowMajorQuery,
                 cudaStream_t stream,
-                raft::distance::DistanceType metric) RAFT_EXPLICIT;
+                raft::distance::DistanceType metric,
+                const value_t* index_norms = NULL,
+                const value_t* query_norms = NULL) RAFT_EXPLICIT;
 
 }  // namespace raft::spatial::knn::detail
 
@@ -56,7 +58,9 @@ void fusedL2Knn(size_t D,
     bool rowMajorIndex,                                                                     \
     bool rowMajorQuery,                                                                     \
     cudaStream_t stream,                                                                    \
-    raft::distance::DistanceType metric)
+    raft::distance::DistanceType metric,                                                    \
+    const Mvalue_t* index_norms,                                                            \
+    const Mvalue_t* query_norms);
 
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, true);
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, false);
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-inl.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-inl.cuh
index 4a571c1447..67abab3d1e 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-inl.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-inl.cuh
@@ -706,6 +706,8 @@ template <typename DataT,
           bool isRowMajor>
 void fusedL2ExpKnnImpl(const DataT* x,
                        const DataT* y,
+                       const DataT* xn,
+                       const DataT* yn,
                        IdxT m,
                        IdxT n,
                        IdxT k,
@@ -787,19 +789,25 @@ void fusedL2ExpKnnImpl(const DataT* x,
       }
     }
 
-    DataT* xn = (DataT*)workspace;
-    DataT* yn = (DataT*)workspace;
-
-    if (x != y) {
-      yn += m;
-      raft::linalg::rowNorm(
-        xn, x, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
-      raft::linalg::rowNorm(
-        yn, y, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
-    } else {
+    // compute L2 row norms into `workspace` for whichever of xn / yn was not passed in (null)
+    if (!xn) {
+      DataT* xn_ = (DataT*)workspace;
+      workspace  = xn_ + m;  // skip the m x-norms so y-norms (below) get their own slice
       raft::linalg::rowNorm(
-        xn, x, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
+        xn_, x, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
+      xn = xn_;
     }
+    if (!yn) {
+      if (x == y) {
+        yn = xn;  // x and y point at the same data: reuse the x norms
+      } else {
+        DataT* yn_ = (DataT*)(workspace);  // NOTE(review): confirm workspace holds n more norms
+        raft::linalg::rowNorm(
+          yn_, y, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
+        yn = yn_;
+      }
+    }
+
     fusedL2ExpKnnRowMajor<<<grid, blk, sharedMemSize, stream>>>(x,
                                                                 y,
                                                                 xn,
@@ -836,6 +844,8 @@ void fusedL2ExpKnn(IdxT m,
                    IdxT ldd,
                    const DataT* x,
                    const DataT* y,
+                   const DataT* xn,
+                   const DataT* yn,
                    bool sqrt,
                    OutT* out_dists,
                    IdxT* out_inds,
@@ -850,6 +860,8 @@ void fusedL2ExpKnn(IdxT m,
     fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), usePrevTopKs, isRowMajor>(
       x,
       y,
+      xn,
+      yn,
       m,
       n,
       k,
@@ -867,6 +879,8 @@ void fusedL2ExpKnn(IdxT m,
     fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), usePrevTopKs, isRowMajor>(
       x,
       y,
+      xn,
+      yn,
       m,
       n,
       k,
@@ -883,6 +897,8 @@ void fusedL2ExpKnn(IdxT m,
   } else {
     fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 1, usePrevTopKs, isRowMajor>(x,
                                                                             y,
+                                                                            xn,
+                                                                            yn,
                                                                             m,
                                                                             n,
                                                                             k,
@@ -927,7 +943,9 @@ void fusedL2Knn(size_t D,
                 bool rowMajorIndex,
                 bool rowMajorQuery,
                 cudaStream_t stream,
-                raft::distance::DistanceType metric)
+                raft::distance::DistanceType metric,
+                const value_t* index_norms = NULL,
+                const value_t* query_norms = NULL)
 {
   // Validate the input data
   ASSERT(k > 0, "l2Knn: k must be > 0");
@@ -968,6 +986,8 @@ void fusedL2Knn(size_t D,
                                                                               ldd,
                                                                               query,
                                                                               index,
+                                                                              query_norms,
+                                                                              index_norms,
                                                                               sqrt,
                                                                               out_dists,
                                                                               out_inds,
@@ -985,6 +1005,8 @@ void fusedL2Knn(size_t D,
                                                                                 ldd,
                                                                                 query,
                                                                                 index,
+                                                                                query_norms,
+                                                                                index_norms,
                                                                                 sqrt,
                                                                                 out_dists,
                                                                                 out_inds,
diff --git a/cpp/include/raft/stats/adjusted_rand_index.cuh b/cpp/include/raft/stats/adjusted_rand_index.cuh
index 7daa1792b1..1f97cd5f76 100644
--- a/cpp/include/raft/stats/adjusted_rand_index.cuh
+++ b/cpp/include/raft/stats/adjusted_rand_index.cuh
@@ -83,7 +83,7 @@ double adjusted_rand_index(raft::resources const& handle,
 
 /** @} */  // end group stats_adj_rand_index
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/completeness_score.cuh b/cpp/include/raft/stats/completeness_score.cuh
index 07fd61411d..b669e0de32 100644
--- a/cpp/include/raft/stats/completeness_score.cuh
+++ b/cpp/include/raft/stats/completeness_score.cuh
@@ -85,7 +85,7 @@ double completeness_score(raft::resources const& handle,
 
 /** @} */  // end group stats_completeness
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/cov.cuh b/cpp/include/raft/stats/cov.cuh
index 67f44b0fde..ad5d233c0e 100644
--- a/cpp/include/raft/stats/cov.cuh
+++ b/cpp/include/raft/stats/cov.cuh
@@ -116,7 +116,7 @@ void cov(raft::resources const& handle,
 
 /** @} */  // end group stats_cov
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/entropy.cuh b/cpp/include/raft/stats/entropy.cuh
index fcc49fefd2..fe432569ee 100644
--- a/cpp/include/raft/stats/entropy.cuh
+++ b/cpp/include/raft/stats/entropy.cuh
@@ -80,7 +80,7 @@ double entropy(raft::resources const& handle,
 
 /** @} */  // end group stats_entropy
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/histogram.cuh b/cpp/include/raft/stats/histogram.cuh
index c374251359..8480d16316 100644
--- a/cpp/include/raft/stats/histogram.cuh
+++ b/cpp/include/raft/stats/histogram.cuh
@@ -115,7 +115,7 @@ void histogram(raft::resources const& handle,
 
 /** @} */  // end group stats_histogram
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/stats/homogeneity_score.cuh b/cpp/include/raft/stats/homogeneity_score.cuh
index ce7872d55d..311cd599f8 100644
--- a/cpp/include/raft/stats/homogeneity_score.cuh
+++ b/cpp/include/raft/stats/homogeneity_score.cuh
@@ -88,7 +88,7 @@ double homogeneity_score(raft::resources const& handle,
 
 /** @} */  // end group stats_homogeneity_score
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/kl_divergence.cuh b/cpp/include/raft/stats/kl_divergence.cuh
index 2e01918d2a..b417cbd509 100644
--- a/cpp/include/raft/stats/kl_divergence.cuh
+++ b/cpp/include/raft/stats/kl_divergence.cuh
@@ -76,7 +76,7 @@ value_t kl_divergence(raft::resources const& handle,
 
 /** @} */  // end group kl_divergence
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh
index 96c9ca3b5c..43d39cfd6c 100644
--- a/cpp/include/raft/stats/mean.cuh
+++ b/cpp/include/raft/stats/mean.cuh
@@ -93,7 +93,7 @@ void mean(raft::resources const& handle,
 
 /** @} */  // end group stats_mean
 
-};         // namespace stats
-};         // namespace raft
+};  // namespace stats
+};  // namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh
index 48f5eb667f..83f9a8a941 100644
--- a/cpp/include/raft/stats/mean_center.cuh
+++ b/cpp/include/raft/stats/mean_center.cuh
@@ -160,7 +160,7 @@ void mean_add(raft::resources const& handle,
 
 /** @} */  // end group stats_mean_center
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/meanvar.cuh b/cpp/include/raft/stats/meanvar.cuh
index f6127df701..5c27a6caf6 100644
--- a/cpp/include/raft/stats/meanvar.cuh
+++ b/cpp/include/raft/stats/meanvar.cuh
@@ -107,6 +107,6 @@ void meanvar(raft::resources const& handle,
 
 /** @} */  // end group stats_mean_var
 
-};         // namespace raft::stats
+};  // namespace raft::stats
 
 #endif
diff --git a/cpp/include/raft/stats/minmax.cuh b/cpp/include/raft/stats/minmax.cuh
index 0c5a62257d..5c5ff346a4 100644
--- a/cpp/include/raft/stats/minmax.cuh
+++ b/cpp/include/raft/stats/minmax.cuh
@@ -139,6 +139,6 @@ void minmax(raft::resources const& handle,
 
 /** @} */  // end group stats_minmax
 
-};         // namespace stats
-};         // namespace raft
+};  // namespace stats
+};  // namespace raft
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/mutual_info_score.cuh b/cpp/include/raft/stats/mutual_info_score.cuh
index 5c4ae43e09..5a334e9280 100644
--- a/cpp/include/raft/stats/mutual_info_score.cuh
+++ b/cpp/include/raft/stats/mutual_info_score.cuh
@@ -86,7 +86,7 @@ double mutual_info_score(raft::resources const& handle,
 
 /** @} */  // end group stats_mutual_info
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/rand_index.cuh b/cpp/include/raft/stats/rand_index.cuh
index 6a208c5492..a21a0c0dc5 100644
--- a/cpp/include/raft/stats/rand_index.cuh
+++ b/cpp/include/raft/stats/rand_index.cuh
@@ -72,7 +72,7 @@ double rand_index(raft::resources const& handle,
 
 /** @} */  // end group stats_rand_index
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh
index 6349f8fd11..0a67bd2325 100644
--- a/cpp/include/raft/stats/stddev.cuh
+++ b/cpp/include/raft/stats/stddev.cuh
@@ -182,7 +182,7 @@ void vars(raft::resources const& handle,
 
 /** @} */  // end group stats_variance
 
-};         // namespace stats
-};         // namespace raft
+};  // namespace stats
+};  // namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh
index 2ac9cd9eb5..2c3ed1b83e 100644
--- a/cpp/include/raft/stats/sum.cuh
+++ b/cpp/include/raft/stats/sum.cuh
@@ -85,7 +85,7 @@ void sum(raft::resources const& handle,
 
 /** @} */  // end group stats_sum
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/v_measure.cuh b/cpp/include/raft/stats/v_measure.cuh
index 8ea5c65600..041adb5e38 100644
--- a/cpp/include/raft/stats/v_measure.cuh
+++ b/cpp/include/raft/stats/v_measure.cuh
@@ -92,7 +92,7 @@ double v_measure(raft::resources const& handle,
 
 /** @} */  // end group stats_vmeasure
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/weighted_mean.cuh b/cpp/include/raft/stats/weighted_mean.cuh
index 7d06d5dff1..da22f0163c 100644
--- a/cpp/include/raft/stats/weighted_mean.cuh
+++ b/cpp/include/raft/stats/weighted_mean.cuh
@@ -186,7 +186,7 @@ void col_weighted_mean(raft::resources const& handle,
 
 /** @} */  // end group stats_weighted_mean
 
-};         // end namespace stats
-};         // end namespace raft
+};  // end namespace stats
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/util/cache.cuh b/cpp/include/raft/util/cache.cuh
index a9cfe64568..255dea16bb 100644
--- a/cpp/include/raft/util/cache.cuh
+++ b/cpp/include/raft/util/cache.cuh
@@ -362,9 +362,9 @@ class Cache {
   int GetSize() const { return cached_keys.size(); }
 
  protected:
-  int n_vec;            //!< Number of elements in a cached vector
-  float cache_size;     //!< in MiB
-  int n_cache_sets;     //!< number of cache sets
+  int n_vec;         //!< Number of elements in a cached vector
+  float cache_size;  //!< in MiB
+  int n_cache_sets;  //!< number of cache sets
 
   const int TPB = 256;  //!< threads per block for kernel launch
   int n_iter    = 0;    //!< Counter for time stamping cache operation
diff --git a/cpp/include/raft/util/cache_util.cuh b/cpp/include/raft/util/cache_util.cuh
index a7dcc22b02..79a94d9563 100644
--- a/cpp/include/raft/util/cache_util.cuh
+++ b/cpp/include/raft/util/cache_util.cuh
@@ -46,7 +46,7 @@ __global__ void get_vecs(
   const math_t* cache, int_t n_vec, const idx_t* cache_idx, int_t n, math_t* out)
 {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  int row = tid % n_vec;             // row idx
+  int row = tid % n_vec;  // row idx
   if (tid < n_vec * n) {
     size_t out_col   = tid / n_vec;  // col idx
     size_t cache_col = cache_idx[out_col];
@@ -93,7 +93,7 @@ __global__ void store_vecs(const math_t* tile,
                            int n_cache_vecs)
 {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  int row = tid % n_vec;          // row idx
+  int row = tid % n_vec;  // row idx
   if (tid < n_vec * n) {
     int tile_col  = tid / n_vec;  // col idx
     int data_col  = tile_idx ? tile_idx[tile_col] : tile_col;
@@ -357,7 +357,7 @@ __global__ void get_cache_idx(int* keys,
       cache_time[cidx] = time;  // update time stamp
       cache_idx[tid]   = cidx;  // exact cache idx
     } else {
-      cache_idx[tid] = sidx;    // assign cache set
+      cache_idx[tid] = sidx;  // assign cache set
     }
   }
 }
diff --git a/cpp/include/raft/util/device_loads_stores.cuh b/cpp/include/raft/util/device_loads_stores.cuh
index e3d54c51f5..65936b2f66 100644
--- a/cpp/include/raft/util/device_loads_stores.cuh
+++ b/cpp/include/raft/util/device_loads_stores.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cstdint>                            // uintX_t
+#include <cstdint>  // uintX_t
 #include <raft/core/device_span.hpp>
 #include <raft/util/cuda_dev_essentials.cuh>  // DI
 
diff --git a/cpp/include/raft/util/memory_pool-inl.hpp b/cpp/include/raft/util/memory_pool-inl.hpp
index 070c8f4e30..ad94ee0096 100644
--- a/cpp/include/raft/util/memory_pool-inl.hpp
+++ b/cpp/include/raft/util/memory_pool-inl.hpp
@@ -25,6 +25,10 @@
 
 namespace raft {
 
+/**
+ * @defgroup memory_pool Memory Pool
+ * @{
+ */
 /**
  * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned
  * unique pointer.
@@ -73,4 +77,5 @@ RAFT_INLINE_CONDITIONAL std::unique_ptr<rmm::mr::device_memory_resource> get_poo
   return pool_res;
 }
 
+/** @} */
 }  // namespace raft
diff --git a/cpp/internal/raft_internal/matrix/select_k.cuh b/cpp/internal/raft_internal/matrix/select_k.cuh
index b72e67580a..1d15c5fc03 100644
--- a/cpp/internal/raft_internal/matrix/select_k.cuh
+++ b/cpp/internal/raft_internal/matrix/select_k.cuh
@@ -33,6 +33,7 @@ struct params {
   bool use_index_input       = true;
   bool use_same_leading_bits = false;
   bool use_memory_pool       = true;
+  double frac_infinities     = 0.0;
 };
 
 inline auto operator<<(std::ostream& os, const params& ss) -> std::ostream&
@@ -41,8 +42,10 @@ inline auto operator<<(std::ostream& os, const params& ss) -> std::ostream&
   os << ", len: " << ss.len;
   os << ", k: " << ss.k;
   os << (ss.select_min ? ", asc" : ", dsc");
-  os << (ss.use_index_input ? "" : ", no-input-index");
-  os << (ss.use_same_leading_bits ? ", same-leading-bits}" : "}");
+  if (!ss.use_index_input) { os << ", no-input-index"; }
+  if (ss.use_same_leading_bits) { os << ", same-leading-bits"; }
+  if (ss.frac_infinities > 0) { os << ", infs: " << ss.frac_infinities; }
+  os << "}";
   return os;
 }
 
diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py
index 49f96aa18b..3d8bbcec4a 100644
--- a/cpp/scripts/run-clang-tidy.py
+++ b/cpp/scripts/run-clang-tidy.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@
 import subprocess
 
 
-EXPECTED_VERSIONS = ("11.1.0",)
+EXPECTED_VERSIONS = ("16.0.6",)
 VERSION_REGEX = re.compile(r"clang version ([0-9.]+)")
 CMAKE_COMPILER_REGEX = re.compile(
     r"^\s*CMAKE_CXX_COMPILER:FILEPATH=(.+)\s*$", re.MULTILINE)
diff --git a/cpp/src/neighbors/brute_force_knn_index_float.cu b/cpp/src/neighbors/brute_force_knn_index_float.cu
new file mode 100644
index 0000000000..f2fda93a97
--- /dev/null
+++ b/cpp/src/neighbors/brute_force_knn_index_float.cu
@@ -0,0 +1,39 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/neighbors/brute_force-inl.cuh>
+// Explicit instantiations: brute_force search (int / int64_t neighbor ids) and build for float.
+template void raft::neighbors::brute_force::search<float, int>(
+  raft::resources const& res,
+  const raft::neighbors::brute_force::index<float>& idx,
+  raft::device_matrix_view<const float, int64_t, row_major> queries,
+  raft::device_matrix_view<int, int64_t, row_major> neighbors,
+  raft::device_matrix_view<float, int64_t, row_major> distances);
+
+template void raft::neighbors::brute_force::search<float, int64_t>(
+  raft::resources const& res,
+  const raft::neighbors::brute_force::index<float>& idx,
+  raft::device_matrix_view<const float, int64_t, row_major> queries,
+  raft::device_matrix_view<int64_t, int64_t, row_major> neighbors,
+  raft::device_matrix_view<float, int64_t, row_major> distances);
+
+template raft::neighbors::brute_force::index<float> raft::neighbors::brute_force::build<float>(
+  raft::resources const& res,
+  raft::device_matrix_view<const float, int64_t, row_major> dataset,
+  raft::distance::DistanceType metric,
+  float metric_arg);
\ No newline at end of file
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
index 784d116503..15eb0a9e65 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
@@ -39,41 +39,45 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \\
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \\
-      raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \\
-      raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \\
-      INDEX_T* const topk_indices_ptr,                                                        \\
-      DISTANCE_T* const topk_distances_ptr,                                                   \\
-      const DATA_T* const queries_ptr,                                                        \\
-      const uint32_t num_queries,                                                             \\
-      const INDEX_T* dev_seed_ptr,                                                            \\
-      uint32_t* const num_executed_iterations,                                                \\
-      uint32_t topk,                                                                          \\
-      uint32_t block_size,                                                                    \\
-      uint32_t result_buffer_size,                                                            \\
-      uint32_t smem_size,                                                                     \\
-      int64_t hash_bitlen,                                                                    \\
-      INDEX_T* hashmap_ptr,                                                                   \\
-      uint32_t num_cta_per_query,                                                             \\
-      uint32_t num_random_samplings,                                                          \\
-      uint64_t rand_xor_mask,                                                                 \\
-      uint32_t num_seeds,                                                                     \\
-      size_t itopk_size,                                                                      \\
-      size_t search_width,                                                                     \\
-      size_t min_iterations,                                                                  \\
-      size_t max_iterations,                                                                  \\
-      cudaStream_t stream);
+#define instantiate_kernel_selection(                                                       \\
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \\
+  template void                                                                             \\
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \\
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \\
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \\
+    INDEX_T* const topk_indices_ptr,                                                        \\
+    DISTANCE_T* const topk_distances_ptr,                                                   \\
+    const DATA_T* const queries_ptr,                                                        \\
+    const uint32_t num_queries,                                                             \\
+    const INDEX_T* dev_seed_ptr,                                                            \\
+    uint32_t* const num_executed_iterations,                                                \\
+    uint32_t topk,                                                                          \\
+    uint32_t block_size,                                                                    \\
+    uint32_t result_buffer_size,                                                            \\
+    uint32_t smem_size,                                                                     \\
+    int64_t hash_bitlen,                                                                    \\
+    INDEX_T* hashmap_ptr,                                                                   \\
+    uint32_t num_cta_per_query,                                                             \\
+    uint32_t num_random_samplings,                                                          \\
+    uint64_t rand_xor_mask,                                                                 \\
+    uint32_t num_seeds,                                                                     \\
+    size_t itopk_size,                                                                      \\
+    size_t search_width,                                                                    \\
+    size_t min_iterations,                                                                  \\
+    size_t max_iterations,                                                                  \\
+    SAMPLE_FILTER_T sample_filter,                                                          \\
+    cudaStream_t stream);
 
 """
 
 trailer = """
 #undef instantiate_kernel_selection
 
-} // namespace raft::neighbors::cagra::detail::namespace multi_cta_search
+}  // namespace raft::neighbors::cagra::detail::multi_cta_search
 """
 
 mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
@@ -97,7 +101,7 @@
         with open(path, "w") as f:
             f.write(header)
             f.write(
-                f"instantiate_kernel_selection({team}, {mxdim}, {data_t}, {idx_t}, {distance_t});\n"
+                f"instantiate_kernel_selection(\n  {team}, {mxdim}, {data_t}, {idx_t}, {distance_t}, raft::neighbors::filtering::none_cagra_sample_filter);\n"
             )
             f.write(trailer)
             # For pasting into CMakeLists.txt
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
index 2a4e7ac607..1a3b2284bd 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_widthhhhhhhhh,                                                              \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 1024, float, uint32_t, float);
+instantiate_kernel_selection(
+  32, 1024, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
index 115ce3b48b..36e86d9ed6 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(8, 128, float, uint32_t, float);
+instantiate_kernel_selection(
+  8, 128, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
index c5e704a85f..6f1af2d93f 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(16, 256, float, uint32_t, float);
+instantiate_kernel_selection(
+  16, 256, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
index 3469facf39..1279f8e415 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 512, float, uint32_t, float);
+instantiate_kernel_selection(
+  32, 512, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
index 327bfc73b4..0dabff0df5 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 1024, float, uint64_t, float);
+instantiate_kernel_selection(
+  32, 1024, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
index 1abe0cd8af..72bb74cdb8 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(8, 128, float, uint64_t, float);
+instantiate_kernel_selection(
+  8, 128, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
index dd61810d06..dceea10b5d 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(16, 256, float, uint64_t, float);
+instantiate_kernel_selection(
+  16, 256, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
index 8e12bab514..acb8bd6a12 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 512, float, uint64_t, float);
+instantiate_kernel_selection(
+  32, 512, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
index d946ac9c79..0254f09ff0 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 1024, int8_t, uint32_t, float);
+instantiate_kernel_selection(
+  32, 1024, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
index e4d7b44d1e..2b67e7e968 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(8, 128, int8_t, uint32_t, float);
+instantiate_kernel_selection(
+  8, 128, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
index b8dc3b38a8..17d6722e58 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(16, 256, int8_t, uint32_t, float);
+instantiate_kernel_selection(
+  16, 256, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
index 749b35bad6..38f02812e2 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 512, int8_t, uint32_t, float);
+instantiate_kernel_selection(
+  32, 512, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
index 428d460ba8..fa111196c6 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_widthh,                                                                     \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 1024, uint8_t, uint32_t, float);
+instantiate_kernel_selection(
+  32, 1024, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
index 28a20b865e..1ef3c28aa3 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(8, 128, uint8_t, uint32_t, float);
+instantiate_kernel_selection(
+  8, 128, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
index e85a84ae8e..d26cb44843 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(16, 256, uint8_t, uint32_t, float);
+instantiate_kernel_selection(
+  16, 256, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
index 232b62ebcd..4d4322f261 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 512, uint8_t, uint32_t, float);
+instantiate_kernel_selection(
+  32, 512, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
index cf61a45b4a..249555082e 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
@@ -39,35 +39,38 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \\
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \\
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \\
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \\
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \\
-    INDEX_T* const topk_indices_ptr,                                                     \\
-    DISTANCE_T* const topk_distances_ptr,                                                \\
-    const DATA_T* const queries_ptr,                                                     \\
-    const uint32_t num_queries,                                                          \\
-    const INDEX_T* dev_seed_ptr,                                                         \\
-    uint32_t* const num_executed_iterations,                                             \\
-    uint32_t topk,                                                                       \\
-    uint32_t num_itopk_candidates,                                                       \\
-    uint32_t block_size,                                                                 \\
-    uint32_t smem_size,                                                                  \\
-    int64_t hash_bitlen,                                                                 \\
-    INDEX_T* hashmap_ptr,                                                                \\
-    size_t small_hash_bitlen,                                                           \\
-    size_t small_hash_reset_interval,                                                    \\
-    uint32_t num_random_samplings,                                                       \\
-    uint64_t rand_xor_mask,                                                              \\
-    uint32_t num_seeds,                                                                  \\
-    size_t itopk_size,                                                                   \\
-    size_t search_width,                                                                  \\
-    size_t min_iterations,                                                               \\
-    size_t max_iterations,                                                               \\
+#define instantiate_single_cta_select_and_run(                                              \\
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \\
+  template void                                                                             \\
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \\
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \\
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \\
+    INDEX_T* const topk_indices_ptr,                                                        \\
+    DISTANCE_T* const topk_distances_ptr,                                                   \\
+    const DATA_T* const queries_ptr,                                                        \\
+    const uint32_t num_queries,                                                             \\
+    const INDEX_T* dev_seed_ptr,                                                            \\
+    uint32_t* const num_executed_iterations,                                                \\
+    uint32_t topk,                                                                          \\
+    uint32_t num_itopk_candidates,                                                          \\
+    uint32_t block_size,                                                                    \\
+    uint32_t smem_size,                                                                     \\
+    int64_t hash_bitlen,                                                                    \\
+    INDEX_T* hashmap_ptr,                                                                   \\
+    size_t small_hash_bitlen,                                                               \\
+    size_t small_hash_reset_interval,                                                       \\
+    uint32_t num_random_samplings,                                                          \\
+    uint64_t rand_xor_mask,                                                                 \\
+    uint32_t num_seeds,                                                                     \\
+    size_t itopk_size,                                                                      \\
+    size_t search_width,                                                                    \\
+    size_t min_iterations,                                                                  \\
+    size_t max_iterations,                                                                  \\
+    SAMPLE_FILTER_T sample_filter,                                                          \\
     cudaStream_t stream);
 
 """
@@ -75,7 +78,7 @@
 trailer = """
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
-} // namespace raft::neighbors::cagra::detail::single_cta_search
+}  // namespace raft::neighbors::cagra::detail::single_cta_search
 """
 
 mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
@@ -102,7 +105,7 @@
         with open(path, "w") as f:
             f.write(header)
             f.write(
-                f"instantiate_single_cta_select_and_run({team}, {mxdim},{data_t}, {idx_t}, {distance_t});\n"
+                f"instantiate_single_cta_select_and_run(\n  {team}, {mxdim}, {data_t}, {idx_t}, {distance_t}, raft::neighbors::filtering::none_cagra_sample_filter);\n"
             )
 
             f.write(trailer)
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
index eb45d4ff08..b8c23103ba 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 1024, float, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 1024, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
index 049715aa20..8ab1897119 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
@@ -25,38 +25,44 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+// Explicit instantiation of select_and_run for a single combination of team size, maximum
+// dataset dimension, data/index/distance types and sample-filter type.
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(8, 128, float, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  8, 128, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
index 6028c283db..9fd36b4cb9 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
@@ -25,38 +25,44 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+// Explicit instantiation of select_and_run for a single combination of team size, maximum
+// dataset dimension, data/index/distance types and sample-filter type.
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(16, 256, float, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  16, 256, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
index 2566e9cbd9..a9ee2c864b 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
@@ -25,38 +25,44 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+// Explicit instantiation of select_and_run for a single combination of team size, maximum
+// dataset dimension, data/index/distance types and sample-filter type.
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 512, float, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 512, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
index 4cd96ad9c0..dadc574b65 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
@@ -25,38 +25,44 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+// Explicit instantiation of select_and_run for a single combination of team size, maximum
+// dataset dimension, data/index/distance types and sample-filter type.
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 1024, float, uint64_t, float);
+instantiate_single_cta_select_and_run(
+  32, 1024, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
index 822a2efb2f..30e043f47e 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
@@ -25,38 +25,44 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+// Explicit instantiation of select_and_run for a single combination of team size, maximum
+// dataset dimension, data/index/distance types and sample-filter type.
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(8, 128, float, uint64_t, float);
+instantiate_single_cta_select_and_run(
+  8, 128, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
index 80d1f76b9b..089e4c930f 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(16, 256, float, uint64_t, float);
+instantiate_single_cta_select_and_run(
+  16, 256, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
index 06c3eaf10b..3e8ffb8bf8 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 512, float, uint64_t, float);
+instantiate_single_cta_select_and_run(
+  32, 512, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
index b4c30ac943..279587738e 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 1024, int8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 1024, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
index c8d0df3ac4..ef127d3f7d 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(8, 128, int8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  8, 128, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
index 19ecee91af..7fcfdcc28e 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(16, 256, int8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  16, 256, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
index 52c4eb7d6b..a6c606d99b 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 512, int8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 512, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_search_kernel
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
index 4675e17084..0b8be56614 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 1024, uint8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 1024, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_search_kernel
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
index e73e1071ee..4c193b9408 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(8, 128, uint8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  8, 128, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_search_kernel
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
index 01e26b5f29..bdf16d2f03 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(16, 256, uint8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  16, 256, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_search_kernel
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
index b0534b555f..93624df4aa 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 512, uint8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 512, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_search_kernel
 
diff --git a/cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu b/cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
index 67b08655e6..b73cf31c58 100644
--- a/cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
+++ b/cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
@@ -32,7 +32,9 @@
     bool rowMajorIndex,                                                                      \
     bool rowMajorQuery,                                                                      \
     cudaStream_t stream,                                                                     \
-    raft::distance::DistanceType metric)
+    raft::distance::DistanceType metric,                                                     \
+    const Mvalue_t* index_norms,                                                             \
+    const Mvalue_t* query_norms)
 
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, true);
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, false);
diff --git a/cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu b/cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
index 3c0d13710e..35ef37c984 100644
--- a/cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
+++ b/cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
@@ -32,7 +32,9 @@
     bool rowMajorIndex,                                                                      \
     bool rowMajorQuery,                                                                      \
     cudaStream_t stream,                                                                     \
-    raft::distance::DistanceType metric)
+    raft::distance::DistanceType metric,                                                     \
+    const Mvalue_t* index_norms,                                                             \
+    const Mvalue_t* query_norms)
 
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int64_t, float, true);
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int64_t, float, false);
diff --git a/cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu b/cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
index e799c5181f..ff23d9c41b 100644
--- a/cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
+++ b/cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
@@ -32,7 +32,9 @@
     bool rowMajorIndex,                                                                      \
     bool rowMajorQuery,                                                                      \
     cudaStream_t stream,                                                                     \
-    raft::distance::DistanceType metric)
+    raft::distance::DistanceType metric,                                                     \
+    const Mvalue_t* index_norms,                                                             \
+    const Mvalue_t* query_norms)
 
 // These are used by brute_force_knn:
 instantiate_raft_spatial_knn_detail_fusedL2Knn(uint32_t, float, true);
diff --git a/cpp/template/CMakeLists.txt b/cpp/template/CMakeLists.txt
index 44b06e1b5f..538eac07ef 100644
--- a/cpp/template/CMakeLists.txt
+++ b/cpp/template/CMakeLists.txt
@@ -34,5 +34,8 @@ rapids_cpm_init()
 include(cmake/thirdparty/get_raft.cmake)
 
 # -------------- compile tasks ----------------- #
-add_executable(TEST_RAFT src/test_vector_search.cu)
-target_link_libraries(TEST_RAFT PRIVATE raft::raft raft::compiled)
+add_executable(CAGRA_EXAMPLE src/cagra_example.cu)
+target_link_libraries(CAGRA_EXAMPLE PRIVATE raft::raft raft::compiled)
+
+add_executable(IVF_FLAT_EXAMPLE src/ivf_flat_example.cu)
+target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE raft::raft raft::compiled)
diff --git a/cpp/template/src/cagra_example.cu b/cpp/template/src/cagra_example.cu
new file mode 100644
index 0000000000..7f3a7d6676
--- /dev/null
+++ b/cpp/template/src/cagra_example.cu
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/neighbors/cagra.cuh>
+#include <raft/random/make_blobs.cuh>
+
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include "common.cuh"
+
+void cagra_build_search_simple(raft::device_resources const& dev_resources,
+                               raft::device_matrix_view<const float, int64_t> dataset,
+                               raft::device_matrix_view<const float, int64_t> queries)
+{
+  using namespace raft::neighbors;
+
+  int64_t topk      = 12;
+  int64_t n_queries = queries.extent(0);
+
+  // Create output arrays: one row of topk neighbor ids / distances per query.
+  auto neighbors = raft::make_device_matrix<uint32_t>(dev_resources, n_queries, topk);
+  auto distances = raft::make_device_matrix<float>(dev_resources, n_queries, topk);
+
+  // Use default index parameters.
+  cagra::index_params index_params;
+
+  std::cout << "Building CAGRA index (search graph)" << std::endl;
+  auto index = cagra::build<float, uint32_t>(dev_resources, index_params, dataset);
+
+  std::cout << "CAGRA index has " << index.size() << " vectors" << std::endl;
+  std::cout << "CAGRA graph has degree " << index.graph_degree() << ", graph size ["
+            << index.graph().extent(0) << ", " << index.graph().extent(1) << "]" << std::endl;
+
+  // Use default search parameters.
+  cagra::search_params search_params;
+  // Search the topk nearest neighbors of every query row.
+  cagra::search<float, uint32_t>(
+    dev_resources, search_params, index, queries, neighbors.view(), distances.view());
+
+  // The call to cagra::search is asynchronous. Before accessing the data, sync by calling
+  // raft::resource::sync_stream(dev_resources); print_results() below does this for us.
+
+  print_results(dev_resources, neighbors.view(), distances.view());
+}
+
+int main()
+{
+  raft::device_resources dev_resources;
+
+  // Set pool memory resource with 1 GiB initial pool size. All allocations use the same pool.
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr(
+    rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull);
+  rmm::mr::set_current_device_resource(&pool_mr);
+
+  // Alternatively, one could define a pool allocator for temporary arrays (used within RAFT
+  // algorithms). In that case only the internal arrays would use the pool; any other allocation
+  // uses the default RMM memory resource. Here is how to change the workspace memory resource to
+  // a pool with 2 GiB upper limit.
+  // raft::resource::set_workspace_to_pool_resource(dev_resources, 2 * 1024 * 1024 * 1024ull);
+
+  // Create input arrays and fill them with synthetic data (see generate_dataset in common.cuh).
+  int64_t n_samples = 10000;
+  int64_t n_dim     = 90;
+  int64_t n_queries = 10;
+  auto dataset      = raft::make_device_matrix<float, int64_t>(dev_resources, n_samples, n_dim);
+  auto queries      = raft::make_device_matrix<float, int64_t>(dev_resources, n_queries, n_dim);
+  generate_dataset(dev_resources, dataset.view(), queries.view());
+
+  // Simple build and search example (the views are passed as read-only mdspans).
+  cagra_build_search_simple(dev_resources,
+                            raft::make_const_mdspan(dataset.view()),
+                            raft::make_const_mdspan(queries.view()));
+}
diff --git a/cpp/template/src/common.cuh b/cpp/template/src/common.cuh
new file mode 100644
index 0000000000..0b72d3bf3b
--- /dev/null
+++ b/cpp/template/src/common.cuh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/matrix/copy.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/random/sample_without_replacement.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+
+// Fill dataset with make_blobs() clusters; fill queries with uniform randoms between -1 and 1.
+void generate_dataset(raft::device_resources const& dev_resources,
+                      raft::device_matrix_view<float, int64_t> dataset,
+                      raft::device_matrix_view<float, int64_t> queries)
+{
+  auto labels = raft::make_device_vector<int64_t, int64_t>(dev_resources, dataset.extent(0));
+  raft::random::make_blobs(dev_resources, dataset, labels.view());
+  raft::random::RngState r(1234ULL);
+  raft::random::uniform(dev_resources,
+                        r,
+                        raft::make_device_vector_view(queries.data_handle(), queries.size()),
+                        -1.0f,
+                        1.0f);
+}
+
+// Copy the results to host and print the first (at most two) queries; safe for n_queries < 2.
+template <typename IdxT>
+void print_results(raft::device_resources const& dev_resources,
+                   raft::device_matrix_view<IdxT, int64_t> neighbors,
+                   raft::device_matrix_view<float, int64_t> distances)
+{
+  int64_t topk        = neighbors.extent(1);
+  auto neighbors_host = raft::make_host_matrix<IdxT, int64_t>(neighbors.extent(0), topk);
+  auto distances_host = raft::make_host_matrix<float, int64_t>(distances.extent(0), topk);
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(dev_resources);
+
+  raft::copy(neighbors_host.data_handle(), neighbors.data_handle(), neighbors.size(), stream);
+  raft::copy(distances_host.data_handle(), distances.data_handle(), distances.size(), stream);
+
+  // The calls to RAFT algorithms and raft::copy are asynchronous.
+  // We need to sync the stream before accessing the data.
+  raft::resource::sync_stream(dev_resources, stream);
+
+  for (int64_t query_id = 0; query_id < 2 && query_id < neighbors.extent(0); query_id++) {
+    std::cout << "Query " << query_id << " neighbor indices: ";
+    raft::print_host_vector("", &neighbors_host(query_id, 0), topk, std::cout);
+    std::cout << "Query " << query_id << " neighbor distances: ";
+    raft::print_host_vector("", &distances_host(query_id, 0), topk, std::cout);
+  }
+}
+
+/** Draw n_samples * fraction rows without replacement (fixed seed) as a training set. */
+raft::device_matrix<float, int64_t> subsample(
+  raft::device_resources const& dev_resources,
+  raft::device_matrix_view<const float, int64_t> dataset,
+  raft::device_vector_view<const int64_t, int64_t> data_indices,
+  float fraction)
+{
+  int64_t n_samples = dataset.extent(0);
+  int64_t n_dim     = dataset.extent(1);
+  int64_t n_train   = n_samples * fraction;  // float product truncated toward zero
+  auto trainset     = raft::make_device_matrix<float, int64_t>(dev_resources, n_train, n_dim);
+
+  int seed = 137;
+  raft::random::RngState rng(seed);
+  auto train_indices = raft::make_device_vector<int64_t>(dev_resources, n_train);
+
+  raft::random::sample_without_replacement(
+    dev_resources, rng, data_indices, std::nullopt, train_indices.view(), std::nullopt);
+
+  raft::matrix::copy_rows(
+    dev_resources, dataset, trainset.view(), raft::make_const_mdspan(train_indices.view()));
+
+  return trainset;
+}
diff --git a/cpp/template/src/ivf_flat_example.cu b/cpp/template/src/ivf_flat_example.cu
new file mode 100644
index 0000000000..5d91f8fe8b
--- /dev/null
+++ b/cpp/template/src/ivf_flat_example.cu
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <optional>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/neighbors/ivf_flat.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+
+#include "common.cuh"
+
+void ivf_flat_build_search_simple(raft::device_resources const& dev_resources,
+                                  raft::device_matrix_view<const float, int64_t> dataset,
+                                  raft::device_matrix_view<const float, int64_t> queries)
+{
+  using namespace raft::neighbors;
+
+  ivf_flat::index_params index_params;
+  index_params.n_lists                  = 1024;
+  index_params.kmeans_trainset_fraction = 0.1;
+  index_params.metric                   = raft::distance::DistanceType::L2Expanded;
+
+  std::cout << "Building IVF-Flat index" << std::endl;
+  auto index = ivf_flat::build(dev_resources, index_params, dataset);
+
+  std::cout << "Number of clusters " << index.n_lists() << ", number of vectors added to index "
+            << index.size() << std::endl;
+
+  // Create output arrays: one row of topk neighbor ids / distances per query.
+  int64_t topk      = 10;
+  int64_t n_queries = queries.extent(0);
+  auto neighbors    = raft::make_device_matrix<int64_t>(dev_resources, n_queries, topk);
+  auto distances    = raft::make_device_matrix<float>(dev_resources, n_queries, topk);
+
+  // Set search parameters.
+  ivf_flat::search_params search_params;
+  search_params.n_probes = 50;
+
+  // Search K nearest neighbors for each of the queries.
+  ivf_flat::search(
+    dev_resources, search_params, index, queries, neighbors.view(), distances.view());
+
+  // The call to ivf_flat::search is asynchronous. Before accessing the data, sync by calling
+  // raft::resource::sync_stream(dev_resources); print_results() below does this for us.
+
+  print_results(dev_resources, neighbors.view(), distances.view());
+}
+
+void ivf_flat_build_extend_search(raft::device_resources const& dev_resources,
+                                  raft::device_matrix_view<const float, int64_t> dataset,
+                                  raft::device_matrix_view<const float, int64_t> queries)
+{
+  using namespace raft::neighbors;
+
+  // Define dataset indices 0..n_rows-1 (used for subsampling and as vector ids in extend).
+  auto data_indices = raft::make_device_vector<int64_t, int64_t>(dev_resources, dataset.extent(0));
+  thrust::counting_iterator<int64_t> first(0);
+  thrust::device_ptr<int64_t> ptr(data_indices.data_handle());
+  thrust::copy(
+    raft::resource::get_thrust_policy(dev_resources), first, first + dataset.extent(0), ptr);
+
+  // Sub-sample the dataset to create a training set.
+  auto trainset =
+    subsample(dev_resources, dataset, raft::make_const_mdspan(data_indices.view()), 0.1);
+
+  ivf_flat::index_params index_params;
+  index_params.n_lists           = 100;
+  index_params.metric            = raft::distance::DistanceType::L2Expanded;
+  index_params.add_data_on_build = false;
+
+  std::cout << "\nRun k-means clustering using the training set" << std::endl;
+  auto index =
+    ivf_flat::build(dev_resources, index_params, raft::make_const_mdspan(trainset.view()));
+
+  std::cout << "Number of clusters " << index.n_lists() << ", number of vectors added to index "
+            << index.size() << std::endl;
+
+  std::cout << "Filling index with the dataset vectors" << std::endl;
+  index = ivf_flat::extend(dev_resources,
+                           dataset,
+                           std::make_optional(raft::make_const_mdspan(data_indices.view())),
+                           index);
+
+  std::cout << "Index size after adding dataset vectors " << index.size() << std::endl;
+
+  // Set search parameters.
+  ivf_flat::search_params search_params;
+  search_params.n_probes = 10;
+
+  // Create output arrays.
+  int64_t topk      = 10;
+  int64_t n_queries = queries.extent(0);
+  auto neighbors    = raft::make_device_matrix<int64_t, int64_t>(dev_resources, n_queries, topk);
+  auto distances    = raft::make_device_matrix<float, int64_t>(dev_resources, n_queries, topk);
+
+  // Search K nearest neighbors for each of the queries.
+  ivf_flat::search(
+    dev_resources, search_params, index, queries, neighbors.view(), distances.view());
+
+  // The call to ivf_flat::search is asynchronous. Before accessing the data, sync using:
+  // raft::resource::sync_stream(dev_resources);
+
+  print_results(dev_resources, neighbors.view(), distances.view());
+}
+
+int main()
+{
+  raft::device_resources dev_resources;
+
+  // Set pool memory resource with 1 GiB initial pool size. All allocations use the same pool.
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr(
+    rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull);
+  rmm::mr::set_current_device_resource(&pool_mr);
+
+  // Alternatively, one could define a pool allocator for temporary arrays (used within RAFT
+  // algorithms). In that case only the internal arrays would use the pool; any other allocation
+  // uses the default RMM memory resource. Here is how to change the workspace memory resource to
+  // a pool with 2 GiB upper limit.
+  // raft::resource::set_workspace_to_pool_resource(dev_resources, 2 * 1024 * 1024 * 1024ull);
+
+  // Create input arrays and fill them with synthetic data (see generate_dataset in common.cuh).
+  int64_t n_samples = 10000;
+  int64_t n_dim     = 3;
+  int64_t n_queries = 10;
+  auto dataset      = raft::make_device_matrix<float, int64_t>(dev_resources, n_samples, n_dim);
+  auto queries      = raft::make_device_matrix<float, int64_t>(dev_resources, n_queries, n_dim);
+  generate_dataset(dev_resources, dataset.view(), queries.view());
+
+  // Simple build and search example.
+  ivf_flat_build_search_simple(dev_resources,
+                               raft::make_const_mdspan(dataset.view()),
+                               raft::make_const_mdspan(queries.view()));
+
+  // Build on a subsample, extend with the full dataset, then search.
+  ivf_flat_build_extend_search(dev_resources,
+                               raft::make_const_mdspan(dataset.view()),
+                               raft::make_const_mdspan(queries.view()));
+}
diff --git a/cpp/template/src/test_vector_search.cu b/cpp/template/src/test_vector_search.cu
deleted file mode 100644
index f54cfc03e7..0000000000
--- a/cpp/template/src/test_vector_search.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/core/device_mdarray.hpp>
-#include <raft/core/device_resources.hpp>
-#include <raft/neighbors/cagra.cuh>
-#include <raft/random/make_blobs.cuh>
-
-int main()
-{
-  using namespace raft::neighbors;
-  raft::device_resources dev_resources;
-  // Use 5 GB of pool memory
-  raft::resource::set_workspace_to_pool_resource(
-    dev_resources, std::make_optional<std::size_t>(5 * 1024 * 1024 * 1024ull));
-
-  int64_t n_samples = 50000;
-  int64_t n_dim     = 90;
-  int64_t topk      = 12;
-  int64_t n_queries = 1;
-
-  // create input and output arrays
-  auto input     = raft::make_device_matrix<float>(dev_resources, n_samples, n_dim);
-  auto labels    = raft::make_device_vector<int64_t>(dev_resources, n_samples);
-  auto queries   = raft::make_device_matrix<float>(dev_resources, n_queries, n_dim);
-  auto neighbors = raft::make_device_matrix<int64_t>(dev_resources, n_queries, topk);
-  auto distances = raft::make_device_matrix<float>(dev_resources, n_queries, topk);
-
-  raft::random::make_blobs(dev_resources, input.view(), labels.view());
-
-  // use default index parameters
-  cagra::index_params index_params;
-  // create and fill the index from a [n_samples, n_dim] input
-  auto index = cagra::build<float, int64_t>(
-    dev_resources, index_params, raft::make_const_mdspan(input.view()));
-  // use default search parameters
-  cagra::search_params search_params;
-  // search K nearest neighbors
-  cagra::search<float, int64_t>(dev_resources,
-                                search_params,
-                                index,
-                                raft::make_const_mdspan(queries.view()),
-                                neighbors.view(),
-                                distances.view());
-}
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index db4c59c807..0651ccac86 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -105,6 +105,7 @@ if(BUILD_TESTS)
     NAME
     CORE_TEST
     PATH
+    test/core/bitset.cu
     test/core/device_resources_manager.cpp
     test/core/device_setter.cpp
     test/core/logger.cpp
@@ -379,6 +380,21 @@ if(BUILD_TESTS)
     100
   )
 
+  ConfigureTest(
+    NAME
+    NEIGHBORS_ANN_NN_DESCENT_TEST
+    PATH
+    test/neighbors/ann_nn_descent/test_float_uint32_t.cu
+    test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
+    test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+    GPUS
+    1
+    PERCENT
+    100
+  )
+
   ConfigureTest(
     NAME NEIGHBORS_SELECTION_TEST PATH test/neighbors/selection.cu LIB EXPLICIT_INSTANTIATE_ONLY
     GPUS 1 PERCENT 50
diff --git a/cpp/test/core/bitset.cu b/cpp/test/core/bitset.cu
new file mode 100644
index 0000000000..215de98aaf
--- /dev/null
+++ b/cpp/test/core/bitset.cu
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <raft/core/bitset.cuh>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/random/rng.cuh>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <numeric>
+
+namespace raft::core {
+
+struct test_spec_bitset {
+  uint64_t bitset_len;  // number of bits in the bitset under test
+  uint64_t mask_len;    // number of random bit indices in each mask
+  uint64_t query_len;   // number of random bit indices queried
+};
+
+auto operator<<(std::ostream& os, const test_spec_bitset& ss) -> std::ostream&
+{
+  os << "bitset{bitset_len: " << ss.bitset_len << ", mask_len: " << ss.mask_len
+     << ", query_len: " << ss.query_len << "}";
+  return os;
+}
+
+template <typename bitset_t, typename index_t>
+void add_cpu_bitset(std::vector<bitset_t>& bitset, const std::vector<index_t>& mask_idx)
+{
+  static size_t constexpr const bitset_element_size = sizeof(bitset_t) * 8;
+  for (size_t i = 0; i < mask_idx.size(); i++) {
+    auto idx = mask_idx[i];  // position of the bit to clear (despite the "add" in the name)
+    bitset[idx / bitset_element_size] &= ~(bitset_t{1} << (idx % bitset_element_size));
+  }
+}
+
+template <typename bitset_t, typename index_t>
+void create_cpu_bitset(std::vector<bitset_t>& bitset, const std::vector<index_t>& mask_idx)
+{
+  for (size_t i = 0; i < bitset.size(); i++) {
+    bitset[i] = ~bitset_t(0x00);  // start with every bit set
+  }
+  add_cpu_bitset(bitset, mask_idx);  // then clear the bits listed in mask_idx
+}
+
+template <typename bitset_t, typename index_t>
+void test_cpu_bitset(const std::vector<bitset_t>& bitset,
+                     const std::vector<index_t>& queries,
+                     std::vector<uint8_t>& result)
+{
+  static size_t constexpr const bitset_element_size = sizeof(bitset_t) * 8;
+  for (size_t i = 0; i < queries.size(); i++) {  // result[i] = 1 iff bit queries[i] is set
+    result[i] = uint8_t((bitset[queries[i] / bitset_element_size] &
+                         (bitset_t{1} << (queries[i] % bitset_element_size))) != 0);
+  }
+}
+
+template <typename bitset_t>
+void flip_cpu_bitset(std::vector<bitset_t>& bitset)
+{
+  for (size_t i = 0; i < bitset.size(); i++) {
+    bitset[i] = ~bitset[i];  // invert all bits of the element
+  }
+}
+
+template <typename bitset_t, typename index_t>
+class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
+ protected:
+  index_t static constexpr const bitset_element_size = sizeof(bitset_t) * 8;
+  const test_spec_bitset spec;
+  std::vector<bitset_t> bitset_result;
+  std::vector<bitset_t> bitset_ref;
+  raft::resources res;
+
+ public:
+  explicit BitsetTest()
+    : spec(testing::TestWithParam<test_spec_bitset>::GetParam()),
+      bitset_result(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size))),
+      bitset_ref(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size)))
+  {
+  }
+
+  void run()
+  {
+    auto stream = resource::get_cuda_stream(res);
+
+    // Generate random mask indices on the device and mirror them on the host.
+    raft::random::RngState rng(42);
+    auto mask_device = raft::make_device_vector<index_t, index_t>(res, spec.mask_len);
+    std::vector<index_t> mask_cpu(spec.mask_len);
+    raft::random::uniformInt(res, rng, mask_device.view(), index_t(0), index_t(spec.bitset_len));
+    update_host(mask_cpu.data(), mask_device.data_handle(), mask_device.extent(0), stream);
+    resource::sync_stream(res, stream);
+
+    // Build the device bitset from the mask and copy its elements to the host.
+    auto my_bitset = raft::core::bitset<bitset_t, index_t>(
+      res, raft::make_const_mdspan(mask_device.view()), index_t(spec.bitset_len));
+    update_host(bitset_result.data(), my_bitset.data_handle(), bitset_result.size(), stream);
+
+    // Compute the CPU reference and compare once the copy has completed.
+    create_cpu_bitset(bitset_ref, mask_cpu);
+    resource::sync_stream(res, stream);
+    ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
+
+    auto query_device  = raft::make_device_vector<index_t, index_t>(res, spec.query_len);
+    auto result_device = raft::make_device_vector<uint8_t, index_t>(res, spec.query_len);
+    auto query_cpu     = std::vector<index_t>(spec.query_len);
+    auto result_cpu    = std::vector<uint8_t>(spec.query_len);
+    auto result_ref    = std::vector<uint8_t>(spec.query_len);
+
+    // Create queries and verify the test results
+    raft::random::uniformInt(res, rng, query_device.view(), index_t(0), index_t(spec.bitset_len));
+    update_host(query_cpu.data(), query_device.data_handle(), query_device.extent(0), stream);
+    my_bitset.test(res, raft::make_const_mdspan(query_device.view()), result_device.view());
+    update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
+    test_cpu_bitset(bitset_ref, query_cpu, result_ref);
+    resource::sync_stream(res, stream);
+    ASSERT_TRUE(hostVecMatch(result_cpu, result_ref, Compare<uint8_t>()));
+
+    // Apply a second random mask via set() and re-compare with the CPU reference.
+    raft::random::uniformInt(res, rng, mask_device.view(), index_t(0), index_t(spec.bitset_len));
+    update_host(mask_cpu.data(), mask_device.data_handle(), mask_device.extent(0), stream);
+    resource::sync_stream(res, stream);
+    my_bitset.set(res, mask_device.view());
+    update_host(bitset_result.data(), my_bitset.data_handle(), bitset_result.size(), stream);
+
+    add_cpu_bitset(bitset_ref, mask_cpu);
+    resource::sync_stream(res, stream);
+    ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
+
+    // Flip the bitset and re-test
+    my_bitset.flip(res);
+    update_host(bitset_result.data(), my_bitset.data_handle(), bitset_result.size(), stream);
+    flip_cpu_bitset(bitset_ref);
+    resource::sync_stream(res, stream);
+    ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
+  }
+};
+
+auto inputs_bitset = ::testing::Values(test_spec_bitset{32, 5, 10},
+                                       test_spec_bitset{100, 30, 10},
+                                       test_spec_bitset{1024, 55, 100},
+                                       test_spec_bitset{10000, 1000, 1000},
+                                       test_spec_bitset{1 << 15, 1 << 3, 1 << 12},
+                                       test_spec_bitset{1 << 15, 1 << 24, 1 << 13},
+                                       test_spec_bitset{1 << 25, 1 << 23, 1 << 14});
+
+using Uint16_32 = BitsetTest<uint16_t, uint32_t>;
+TEST_P(Uint16_32, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint16_32, inputs_bitset);
+
+using Uint32_32 = BitsetTest<uint32_t, uint32_t>;
+TEST_P(Uint32_32, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint32_32, inputs_bitset);
+
+using Uint64_32 = BitsetTest<uint64_t, uint32_t>;
+TEST_P(Uint64_32, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint64_32, inputs_bitset);
+
+using Uint8_64 = BitsetTest<uint8_t, uint64_t>;
+TEST_P(Uint8_64, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint8_64, inputs_bitset);
+
+using Uint32_64 = BitsetTest<uint32_t, uint64_t>;
+TEST_P(Uint32_64, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint32_64, inputs_bitset);
+
+using Uint64_64 = BitsetTest<uint64_t, uint64_t>;
+TEST_P(Uint64_64, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint64_64, inputs_bitset);
+
+}  // namespace raft::core
diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu
index 15c7b2b33a..8e3a9df01b 100644
--- a/cpp/test/core/math_device.cu
+++ b/cpp/test/core/math_device.cu
@@ -21,7 +21,9 @@
 #include <rmm/cuda_stream.hpp>
 #include <rmm/device_scalar.hpp>
 
-#if _RAFT_HAS_CUDA
+#include <cuda/std/type_traits>
+
+#ifdef _RAFT_HAS_CUDA
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #endif
@@ -35,7 +37,7 @@ __global__ void math_eval_kernel(OutT* out, OpT op, Args... args)
 template <typename OpT, typename... Args>
 auto math_eval(OpT op, Args&&... args)
 {
-  typedef decltype(op(args...)) OutT;
+  using OutT  = cuda::std::invoke_result_t<OpT, Args...>;
   auto stream = rmm::cuda_stream_default;
   rmm::device_scalar<OutT> result(stream);
   math_eval_kernel<<<1, 1, 0, stream>>>(result.data(), op, std::forward<Args>(args)...);
diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh
index 20d78c7bb5..8f616ada98 100644
--- a/cpp/test/distance/distance_base.cuh
+++ b/cpp/test/distance/distance_base.cuh
@@ -19,13 +19,13 @@
 #include <raft/common/nvtx.hpp>  // common::nvtx::range
 #include <raft/core/resource/cuda_stream.hpp>
 
-#include <raft/core/device_mdspan.hpp>       // make_device_matrix_view
-#include <raft/core/operators.hpp>           // raft::sqrt
-#include <raft/core/resources.hpp>           // raft::resources
+#include <raft/core/device_mdspan.hpp>  // make_device_matrix_view
+#include <raft/core/operators.hpp>      // raft::sqrt
+#include <raft/core/resources.hpp>      // raft::resources
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
 #include <raft/random/rng.cuh>
-#include <rmm/device_uvector.hpp>            // rmm::device_uvector
+#include <rmm/device_uvector.hpp>  // rmm::device_uvector
 
 namespace raft {
 namespace distance {
diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu
index 0c66f47c7f..b5d10d215c 100644
--- a/cpp/test/linalg/rsvd.cu
+++ b/cpp/test/linalg/rsvd.cu
@@ -160,24 +160,24 @@ class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
 
 const std::vector<RsvdInputs<float>> inputs_fx = {
   // Test with ratios
-  {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},      // Square + BBT
-  {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},     // Tall + BBT
+  {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Square + BBT
+  {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // Tall + BBT
 
-  {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},     // Square + non-BBT
-  {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},    // Tall + non-BBT
+  {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},   // Square + non-BBT
+  {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},  // Tall + non-BBT
 
-  {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},    // Square + BBT
-  {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Tall + BBT
+  {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Square + BBT
+  {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // Tall + BBT
 
-  {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},   // Square + non-BBT
-  {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}   // Tall + non-BBT
+  {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},  // Square + non-BBT
+  {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}  // Tall + non-BBT
 
-  ,                                                                // Test with fixed ranks
-  {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},     // Square + BBT
-  {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},    // Tall + BBT
+  ,                                                              // Test with fixed ranks
+  {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},   // Square + BBT
+  {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},  // Tall + BBT
 
-  {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL},    // Square + non-BBT
-  {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL},   // Tall + non-BBT
+  {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL},   // Square + non-BBT
+  {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL},  // Tall + non-BBT
 
   {0.60f, 2048, 2048, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},   // Square + BBT
   {1.00f, 16384, 2048, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},  // Tall + BBT
@@ -188,14 +188,14 @@ const std::vector<RsvdInputs<float>> inputs_fx = {
 
 const std::vector<RsvdInputs<double>> inputs_dx = {
   // Test with ratios
-  {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},      // Square + BBT
-  {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},     // Tall + BBT
-  {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},     // Square + non-BBT
-  {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},    // Tall + non-BBT
-  {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},    // Square + BBT
-  {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},   // Tall + BBT
-  {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},   // Square + non-BBT
-  {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}   // Tall + non-BBT
+  {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},     // Square + BBT
+  {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},    // Tall + BBT
+  {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},    // Square + non-BBT
+  {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},   // Tall + non-BBT
+  {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},   // Square + BBT
+  {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},  // Tall + BBT
+  {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},  // Square + non-BBT
+  {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}  // Tall + non-BBT
 
   ,                                                             // Test with fixed ranks
   {0.10, 256, 256, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL},     // Square + BBT
diff --git a/cpp/test/matrix/select_k.cu b/cpp/test/matrix/select_k.cu
index 63f020b420..ce4e3e867e 100644
--- a/cpp/test/matrix/select_k.cu
+++ b/cpp/test/matrix/select_k.cu
@@ -70,6 +70,28 @@ auto inputs_random_largek = testing::Values(select::params{100, 100000, 1000, tr
                                             select::params{100, 100000, 2048, false},
                                             select::params{100, 100000, 1237, true});
 
+auto inputs_random_many_infs =
+  testing::Values(select::params{10, 100000, 1, true, false, false, true, 0.9},
+                  select::params{10, 100000, 16, true, false, false, true, 0.9},
+                  select::params{10, 100000, 64, true, false, false, true, 0.9},
+                  select::params{10, 100000, 128, true, false, false, true, 0.9},
+                  select::params{10, 100000, 256, true, false, false, true, 0.9},
+                  select::params{1000, 10000, 1, true, false, false, true, 0.9},
+                  select::params{1000, 10000, 16, true, false, false, true, 0.9},
+                  select::params{1000, 10000, 64, true, false, false, true, 0.9},
+                  select::params{1000, 10000, 128, true, false, false, true, 0.9},
+                  select::params{1000, 10000, 256, true, false, false, true, 0.9},
+                  select::params{10, 100000, 1, true, false, false, true, 0.999},
+                  select::params{10, 100000, 16, true, false, false, true, 0.999},
+                  select::params{10, 100000, 64, true, false, false, true, 0.999},
+                  select::params{10, 100000, 128, true, false, false, true, 0.999},
+                  select::params{10, 100000, 256, true, false, false, true, 0.999},
+                  select::params{1000, 10000, 1, true, false, false, true, 0.999},
+                  select::params{1000, 10000, 16, true, false, false, true, 0.999},
+                  select::params{1000, 10000, 64, true, false, false, true, 0.999},
+                  select::params{1000, 10000, 128, true, false, false, true, 0.999},
+                  select::params{1000, 10000, 256, true, false, false, true, 0.999});
+
 using ReferencedRandomFloatInt =
   SelectK<float, uint32_t, with_ref<select::Algo::kPublicApi>::params_random>;
 TEST_P(ReferencedRandomFloatInt, Run) { run(); }  // NOLINT
@@ -111,4 +133,16 @@ INSTANTIATE_TEST_CASE_P(                                 // NOLINT
                                    select::Algo::kRadix8bits,
                                    select::Algo::kRadix11bits,
                                    select::Algo::kRadix11bitsExtraPass)));
+
+using ReferencedRandomFloatIntkWarpsortAsGT =
+  SelectK<float, uint32_t, with_ref<select::Algo::kWarpImmediate>::params_random>;
+TEST_P(ReferencedRandomFloatIntkWarpsortAsGT, Run) { run(); }  // NOLINT
+INSTANTIATE_TEST_CASE_P(                                       // NOLINT
+  SelectK,
+  ReferencedRandomFloatIntkWarpsortAsGT,
+  testing::Combine(inputs_random_many_infs,
+                   testing::Values(select::Algo::kRadix8bits,
+                                   select::Algo::kRadix11bits,
+                                   select::Algo::kRadix11bitsExtraPass)));
+
 }  // namespace raft::matrix
diff --git a/cpp/test/matrix/select_k.cuh b/cpp/test/matrix/select_k.cuh
index e0e0cad225..e94a6d029e 100644
--- a/cpp/test/matrix/select_k.cuh
+++ b/cpp/test/matrix/select_k.cuh
@@ -49,14 +49,16 @@ auto gen_simple_ids(uint32_t batch_size, uint32_t len) -> std::vector<IdxT>
 template <typename KeyT, typename IdxT>
 struct io_simple {
  public:
-  bool not_supported = false;
+  bool not_supported               = false;
+  std::optional<select::Algo> algo = std::nullopt;
 
   io_simple(const select::params& spec,
             const std::vector<KeyT>& in_dists,
+            const std::optional<std::vector<IdxT>>& in_ids,
             const std::vector<KeyT>& out_dists,
             const std::vector<IdxT>& out_ids)
     : in_dists_(in_dists),
-      in_ids_(gen_simple_ids<IdxT>(spec.batch_size, spec.len)),
+      in_ids_(in_ids.value_or(gen_simple_ids<IdxT>(spec.batch_size, spec.len))),
       out_dists_(out_dists),
       out_ids_(out_ids)
   {
@@ -78,12 +80,14 @@ template <typename KeyT, typename IdxT>
 struct io_computed {
  public:
   bool not_supported = false;
+  select::Algo algo;
 
   io_computed(const select::params& spec,
               const select::Algo& algo,
               const std::vector<KeyT>& in_dists,
               const std::optional<std::vector<IdxT>>& in_ids = std::nullopt)
-    : in_dists_(in_dists),
+    : algo(algo),
+      in_dists_(in_dists),
       in_ids_(in_ids.value_or(gen_simple_ids<IdxT>(spec.batch_size, spec.len))),
       out_dists_(spec.batch_size * spec.k),
       out_ids_(spec.batch_size * spec.k)
@@ -223,32 +227,62 @@ struct SelectK  // NOLINT
     if (ref.not_supported || res.not_supported) { GTEST_SKIP(); }
     ASSERT_TRUE(hostVecMatch(ref.get_out_dists(), res.get_out_dists(), Compare<KeyT>()));
 
-    // If the dists (keys) are the same, different corresponding ids may end up in the selection due
-    // to non-deterministic nature of some implementations.
-    auto& in_ids     = ref.get_in_ids();
-    auto& in_dists   = ref.get_in_dists();
-    auto compare_ids = [&in_ids, &in_dists](const IdxT& i, const IdxT& j) {
+    // If the dists (keys) are the same, different corresponding ids may end up in the selection
+    // due to non-deterministic nature of some implementations.
+    auto compare_ids = [this](const IdxT& i, const IdxT& j) {
       if (i == j) return true;
+      auto& in_ids   = ref.get_in_ids();
+      auto& in_dists = ref.get_in_dists();
       auto ix_i = static_cast<int64_t>(std::find(in_ids.begin(), in_ids.end(), i) - in_ids.begin());
       auto ix_j = static_cast<int64_t>(std::find(in_ids.begin(), in_ids.end(), j) - in_ids.begin());
-      if (static_cast<size_t>(ix_i) >= in_ids.size() || static_cast<size_t>(ix_j) >= in_ids.size())
-        return false;
+      auto forgive_i = forgive_algo(ref.algo, i);
+      auto forgive_j = forgive_algo(res.algo, j);
+      // Some algorithms return invalid indices in special cases.
+      // TODO: https://github.com/rapidsai/raft/issues/1822
+      if (static_cast<size_t>(ix_i) >= in_ids.size()) return forgive_i;
+      if (static_cast<size_t>(ix_j) >= in_ids.size()) return forgive_j;
       auto dist_i = in_dists[ix_i];
       auto dist_j = in_dists[ix_j];
       if (dist_i == dist_j) return true;
+      const auto bound = spec.select_min ? raft::upper_bound<KeyT>() : raft::lower_bound<KeyT>();
+      if (forgive_i && dist_i == bound) return true;
+      if (forgive_j && dist_j == bound) return true;
+      // Otherwise really fail
       std::cout << "ERROR: ref[" << ix_i << "] = " << dist_i << " != "
                 << "res[" << ix_j << "] = " << dist_j << std::endl;
       return false;
     };
     ASSERT_TRUE(hostVecMatch(ref.get_out_ids(), res.get_out_ids(), compare_ids));
   }
+
+  auto forgive_algo(const std::optional<select::Algo>& algo, IdxT ix) const -> bool
+  {
+    if (!algo.has_value()) { return false; }  // no algorithm recorded: never forgive
+    switch (algo.value()) {
+      // public API: presumably any implementation may be picked, so forgive any index.
+      case select::Algo::kPublicApi: return true;
+      // warp-sort-based algos currently return zero index for inf distances.
+      case select::Algo::kWarpAuto:
+      case select::Algo::kWarpImmediate:
+      case select::Algo::kWarpFiltered:
+      case select::Algo::kWarpDistributed:
+      case select::Algo::kWarpDistributedShm: return ix == 0;
+      // FAISS version returns a special invalid value (the maximum of IdxT):
+      case select::Algo::kFaissBlockSelect: return ix == std::numeric_limits<IdxT>::max();
+      // Do not forgive by default
+      default: return false;
+    }
+  }
 };
 
 template <typename KeyT, typename IdxT>
 struct params_simple {
-  using io_t = io_simple<KeyT, IdxT>;
-  using input_t =
-    std::tuple<select::params, std::vector<KeyT>, std::vector<KeyT>, std::vector<IdxT>>;
+  using io_t     = io_simple<KeyT, IdxT>;
+  using input_t  = std::tuple<select::params,
+                             std::vector<KeyT>,
+                             std::optional<std::vector<IdxT>>,
+                             std::vector<KeyT>,
+                             std::vector<IdxT>>;
   using params_t = std::tuple<input_t, select::Algo>;
 
   static auto read(params_t ps) -> Params<io_t>
@@ -259,15 +293,17 @@ struct params_simple {
       std::get<0>(ins),
       algo,
       io_simple<KeyT, IdxT>(
-        std::get<0>(ins), std::get<1>(ins), std::get<2>(ins), std::get<3>(ins)));
+        std::get<0>(ins), std::get<1>(ins), std::get<2>(ins), std::get<3>(ins), std::get<4>(ins)));
   }
 };
 
+auto inf_f           = std::numeric_limits<float>::max();
 auto inputs_simple_f = testing::Values(
   params_simple<float, uint32_t>::input_t(
     {5, 5, 5, true, true},
     {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0,
      1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0},
+    std::nullopt,
     {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0,
      4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0},
     {4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 3, 0, 1, 4, 2, 4, 2, 1, 3, 0, 0, 2, 1, 4, 3}),
@@ -275,12 +311,14 @@ auto inputs_simple_f = testing::Values(
     {5, 5, 3, true, true},
     {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0,
      1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0},
+    std::nullopt,
     {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0},
     {4, 3, 2, 0, 1, 2, 3, 0, 1, 4, 2, 1, 0, 2, 1}),
   params_simple<float, uint32_t>::input_t(
     {5, 5, 5, true, false},
     {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0,
      1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0},
+    std::nullopt,
     {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0,
      4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0},
     {4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 3, 0, 1, 4, 2, 4, 2, 1, 3, 0, 0, 2, 1, 4, 3}),
@@ -288,20 +326,31 @@ auto inputs_simple_f = testing::Values(
     {5, 5, 3, true, false},
     {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0,
      1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0},
+    std::nullopt,
     {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0},
     {4, 3, 2, 0, 1, 2, 3, 0, 1, 4, 2, 1, 0, 2, 1}),
   params_simple<float, uint32_t>::input_t(
     {5, 7, 3, true, true},
     {5.0, 4.0, 3.0, 2.0, 1.3, 7.5, 19.0, 9.0, 2.0, 3.0, 3.0, 5.0, 6.0, 4.0, 2.0, 3.0, 5.0, 1.0,
      4.0, 1.0, 1.0, 5.0, 7.0, 2.5, 4.0,  7.0, 8.0, 8.0, 1.0, 3.0, 2.0, 5.0, 4.0, 1.1, 1.2},
+    std::nullopt,
     {1.3, 2.0, 3.0, 2.0, 3.0, 3.0, 1.0, 1.0, 1.0, 2.5, 4.0, 5.0, 1.0, 1.1, 1.2},
     {4, 3, 2, 1, 2, 3, 3, 5, 6, 2, 3, 0, 0, 5, 6}),
-  params_simple<float, uint32_t>::input_t(
-    {1, 7, 3, true, true}, {2.0, 3.0, 5.0, 1.0, 4.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {3, 5, 6}),
-  params_simple<float, uint32_t>::input_t(
-    {1, 7, 3, false, false}, {2.0, 3.0, 5.0, 1.0, 4.0, 1.0, 1.0}, {5.0, 4.0, 3.0}, {2, 4, 1}),
-  params_simple<float, uint32_t>::input_t(
-    {1, 7, 3, false, true}, {2.0, 3.0, 5.0, 9.0, 4.0, 9.0, 9.0}, {9.0, 9.0, 9.0}, {3, 5, 6}),
+  params_simple<float, uint32_t>::input_t({1, 7, 3, true, true},
+                                          {2.0, 3.0, 5.0, 1.0, 4.0, 1.0, 1.0},
+                                          std::nullopt,
+                                          {1.0, 1.0, 1.0},
+                                          {3, 5, 6}),
+  params_simple<float, uint32_t>::input_t({1, 7, 3, false, false},
+                                          {2.0, 3.0, 5.0, 1.0, 4.0, 1.0, 1.0},
+                                          std::nullopt,
+                                          {5.0, 4.0, 3.0},
+                                          {2, 4, 1}),
+  params_simple<float, uint32_t>::input_t({1, 7, 3, false, true},
+                                          {2.0, 3.0, 5.0, 9.0, 4.0, 9.0, 9.0},
+                                          std::nullopt,
+                                          {9.0, 9.0, 9.0},
+                                          {3, 5, 6}),
   params_simple<float, uint32_t>::input_t(
     {1, 130, 5, false, true},
     {19, 1, 0, 1, 0, 1,  0,  1,  0,  1,  0,  1,  0,  1,  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
@@ -309,6 +358,7 @@ auto inputs_simple_f = testing::Values(
      0,  1, 0, 1, 0, 1,  0,  1,  1,  2,  1,  2,  1,  2,  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
      1,  2, 1, 2, 1, 2,  1,  2,  1,  2,  1,  2,  1,  2,  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4,
      5,  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 4, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 20},
+    std::nullopt,
     {20, 19, 18, 17, 16},
     {129, 0, 117, 116, 115}),
   params_simple<float, uint32_t>::input_t(
@@ -318,8 +368,20 @@ auto inputs_simple_f = testing::Values(
      0,  1, 0, 1, 0, 1,  0,  1,  1,  2,  1,  2,  1,  2,  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
      1,  2, 1, 2, 1, 2,  1,  2,  1,  2,  1,  2,  1,  2,  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4,
      5,  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 4, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 20},
+    std::nullopt,
     {20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6},
-    {129, 0, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105}));
+    {129, 0, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105}),
+  params_simple<float, uint32_t>::input_t(
+    select::params{1, 32, 31, true, true},
+    {0,  1,  2,  3,  inf_f, inf_f, 6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+     16, 17, 18, 19, 20,    21,    22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
+    std::optional{std::vector<uint32_t>{31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21,
+                                        20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,
+                                        9,  8,  7,  6,  75, 74, 3,  2,  1,  0}},
+    {0,  1,  2,  3,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,   17,
+     18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, inf_f},
+    {31, 30, 29, 28, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14,
+     13, 12, 11, 10, 9,  8,  7,  6,  75, 74, 3,  2,  1,  0,  27}));
 
 using SimpleFloatInt = SelectK<float, uint32_t, params_simple>;
 TEST_P(SimpleFloatInt, Run) { run(); }  // NOLINT
@@ -335,6 +397,12 @@ INSTANTIATE_TEST_CASE_P(                // NOLINT
                                    select::Algo::kWarpFiltered,
                                    select::Algo::kWarpDistributed)));
 
+template <typename KeyT>
+struct replace_with_mask {
+  KeyT replacement;
+  constexpr auto inline operator()(KeyT x, uint8_t mask) -> KeyT { return mask ? replacement : x; }
+};
+
 template <select::Algo RefAlgo>
 struct with_ref {
   template <typename KeyT, typename IdxT>
@@ -354,6 +422,19 @@ struct with_ref {
         rmm::device_uvector<KeyT> dists_d(spec.len * spec.batch_size, s);
         raft::random::RngState r(42);
         normal(handle, r, dists_d.data(), dists_d.size(), KeyT(10.0), KeyT(100.0));
+
+        if (spec.frac_infinities > 0.0) {
+          rmm::device_uvector<uint8_t> mask_buf(dists_d.size(), s);
+          auto mask = make_device_vector_view<uint8_t, size_t>(mask_buf.data(), mask_buf.size());
+          raft::random::bernoulli(handle, r, mask, spec.frac_infinities);
+          KeyT bound = spec.select_min ? raft::upper_bound<KeyT>() : raft::lower_bound<KeyT>();
+          auto mask_in =
+            make_device_vector_view<const uint8_t, size_t>(mask_buf.data(), mask_buf.size());
+          auto dists_in  = make_device_vector_view<const KeyT>(dists_d.data(), dists_d.size());
+          auto dists_out = make_device_vector_view<KeyT>(dists_d.data(), dists_d.size());
+          raft::linalg::map(handle, dists_out, replace_with_mask<KeyT>{bound}, dists_in, mask_in);
+        }
+
         update_host(dists.data(), dists_d.data(), dists_d.size(), s);
         s.synchronize();
       }
diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
index ea905d2089..b750372244 100644
--- a/cpp/test/neighbors/ann_cagra.cuh
+++ b/cpp/test/neighbors/ann_cagra.cuh
@@ -15,6 +15,8 @@
  */
 #pragma once
 
+#undef RAFT_EXPLICIT_INSTANTIATE_ONLY  // Search with filter instantiation
+
 #include "../test_utils.cuh"
 #include "ann_utils.cuh"
 #include <raft/core/resource/cuda_stream.hpp>
@@ -25,6 +27,7 @@
 #include <raft/core/device_resources.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/linalg/add.cuh>
 #include <raft/neighbors/cagra.cuh>
 #include <raft/neighbors/cagra_serialize.cuh>
 #include <raft/random/rng.cuh>
@@ -41,8 +44,22 @@
 #include <string>
 #include <vector>
 
-namespace raft::neighbors::experimental::cagra {
+namespace raft::neighbors::cagra {
 namespace {
+
+/* A filter that accepts a sample iff its index is >= `offset`, i.e. excludes all below it. */
+struct test_cagra_sample_filter {
+  static constexpr unsigned offset = 300;
+  inline _RAFT_HOST_DEVICE auto operator()(
+    // query index (not used by this filter)
+    const uint32_t query_ix,
+    // the index of the candidate sample (the test passes returned neighbor indices here)
+    const uint32_t sample_ix) const
+  {
+    return sample_ix >= offset;
+  }
+};
+
 // For sort_knn_graph test
 template <typename IdxT>
 void RandomSuffle(raft::host_matrix_view<IdxT, int64_t> index)
@@ -130,6 +147,7 @@ struct AnnCagraInputs {
   int n_rows;
   int dim;
   int k;
+  graph_build_algo build_algo;
   search_algo algo;
   int max_queries;
   int team_size;
@@ -144,12 +162,13 @@ struct AnnCagraInputs {
 
 inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p)
 {
-  std::vector<std::string> algo = {"single-cta", "multi_cta", "multi_kernel", "auto"};
+  std::vector<std::string> algo       = {"single-cta", "multi_cta", "multi_kernel", "auto"};
+  std::vector<std::string> build_algo = {"IVF_PQ", "NN_DESCENT"};
   os << "{n_queries=" << p.n_queries << ", dataset shape=" << p.n_rows << "x" << p.dim
      << ", k=" << p.k << ", " << algo.at((int)p.algo) << ", max_queries=" << p.max_queries
      << ", itopk_size=" << p.itopk_size << ", search_width=" << p.search_width
-     << ", metric=" << static_cast<int>(p.metric) << (p.host_dataset ? ", host" : ", device") << '}'
-     << std::endl;
+     << ", metric=" << static_cast<int>(p.metric) << (p.host_dataset ? ", host" : ", device")
+     << ", build_algo=" << build_algo.at((int)p.build_algo) << '}' << std::endl;
   return os;
 }
 
@@ -199,6 +218,7 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
         cagra::index_params index_params;
         index_params.metric = ps.metric;  // Note: currently ony the cagra::index_params metric is
                                           // not used for knn_graph building.
+        index_params.build_algo = ps.build_algo;
         cagra::search_params search_params;
         search_params.algo        = ps.algo;
         search_params.max_queries = ps.max_queries;
@@ -323,11 +343,25 @@ class AnnCagraSortTest : public ::testing::TestWithParam<AnnCagraInputs> {
       auto knn_graph =
         raft::make_host_matrix<IdxT, int64_t>(ps.n_rows, index_params.intermediate_graph_degree);
 
-      if (ps.host_dataset) {
-        cagra::build_knn_graph<DataT, IdxT>(handle_, database_host_view, knn_graph.view());
+      if (ps.build_algo == graph_build_algo::IVF_PQ) {
+        if (ps.host_dataset) {
+          cagra::build_knn_graph<DataT, IdxT>(handle_, database_host_view, knn_graph.view());
+        } else {
+          cagra::build_knn_graph<DataT, IdxT>(handle_, database_view, knn_graph.view());
+        }
       } else {
-        cagra::build_knn_graph<DataT, IdxT>(handle_, database_view, knn_graph.view());
-      };
+        auto nn_descent_idx_params                      = experimental::nn_descent::index_params{};
+        nn_descent_idx_params.graph_degree              = index_params.intermediate_graph_degree;
+        nn_descent_idx_params.intermediate_graph_degree = index_params.intermediate_graph_degree;
+
+        if (ps.host_dataset) {  // host case: build the kNN graph from database_host_view
+          cagra::build_knn_graph<DataT, IdxT>(
+            handle_, database_host_view, knn_graph.view(), nn_descent_idx_params);
+        } else {  // mirror the IVF_PQ branch: the non-host case builds from database_view
+          cagra::build_knn_graph<DataT, IdxT>(
+            handle_, database_view, knn_graph.view(), nn_descent_idx_params);
+        }
+      }
 
       handle_.sync_stream();
       ASSERT_TRUE(CheckOrder<DistanceT>(knn_graph.view(), database_host.view()));
@@ -365,6 +399,162 @@ class AnnCagraSortTest : public ::testing::TestWithParam<AnnCagraInputs> {
   rmm::device_uvector<DataT> database;
 };
 
+template <typename DistanceT, typename DataT, typename IdxT>
+class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
+ public:
+  AnnCagraFilterTest()
+    : stream_(resource::get_cuda_stream(handle_)),
+      ps(::testing::TestWithParam<AnnCagraInputs>::GetParam()),
+      database(0, stream_),
+      search_queries(0, stream_)
+  {
+  }
+
+ protected:
+  void testCagraFilter()
+  {
+    size_t queries_size = ps.n_queries * ps.k;
+    std::vector<IdxT> indices_Cagra(queries_size);
+    std::vector<IdxT> indices_naive(queries_size);
+    std::vector<DistanceT> distances_Cagra(queries_size);
+    std::vector<DistanceT> distances_naive(queries_size);
+
+    {
+      rmm::device_uvector<DistanceT> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
+      auto* database_filtered_ptr = database.data() + test_cagra_sample_filter::offset * ps.dim;
+      naive_knn<DistanceT, DataT, IdxT>(handle_,
+                                        distances_naive_dev.data(),
+                                        indices_naive_dev.data(),
+                                        search_queries.data(),
+                                        database_filtered_ptr,
+                                        ps.n_queries,
+                                        ps.n_rows - test_cagra_sample_filter::offset,
+                                        ps.dim,
+                                        ps.k,
+                                        ps.metric);
+      raft::linalg::addScalar(indices_naive_dev.data(),
+                              indices_naive_dev.data(),
+                              IdxT(test_cagra_sample_filter::offset),
+                              queries_size,
+                              stream_);
+      update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
+      update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
+      resource::sync_stream(handle_);
+    }
+
+    {
+      rmm::device_uvector<DistanceT> distances_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_dev(queries_size, stream_);
+
+      {
+        cagra::index_params index_params;
+        index_params.metric = ps.metric;  // Note: currently only the cagra::index_params metric
+                                          // is not used for knn_graph building.
+        cagra::search_params search_params;
+        search_params.algo         = ps.algo;
+        search_params.max_queries  = ps.max_queries;
+        search_params.team_size    = ps.team_size;
+        search_params.hashmap_mode = cagra::hash_mode::HASH;
+
+        auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
+          (const DataT*)database.data(), ps.n_rows, ps.dim);
+
+        cagra::index<DataT, IdxT> index(handle_);
+        if (ps.host_dataset) {
+          auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+          raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+          auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
+            (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
+          index = cagra::build<DataT, IdxT>(handle_, index_params, database_host_view);
+        } else {
+          index = cagra::build<DataT, IdxT>(handle_, index_params, database_view);
+        }
+
+        if (!ps.include_serialized_dataset) { index.update_dataset(handle_, database_view); }
+
+        auto search_queries_view = raft::make_device_matrix_view<const DataT, int64_t>(
+          search_queries.data(), ps.n_queries, ps.dim);
+        auto indices_out_view =
+          raft::make_device_matrix_view<IdxT, int64_t>(indices_dev.data(), ps.n_queries, ps.k);
+        auto dists_out_view = raft::make_device_matrix_view<DistanceT, int64_t>(
+          distances_dev.data(), ps.n_queries, ps.k);
+
+        cagra::search_with_filtering(handle_,
+                                     search_params,
+                                     index,
+                                     search_queries_view,
+                                     indices_out_view,
+                                     dists_out_view,
+                                     test_cagra_sample_filter());
+        update_host(distances_Cagra.data(), distances_dev.data(), queries_size, stream_);
+        update_host(indices_Cagra.data(), indices_dev.data(), queries_size, stream_);
+        resource::sync_stream(handle_);
+      }
+
+      // Test filter
+      bool unacceptable_node = false;
+      for (int q = 0; q < ps.n_queries; q++) {
+        for (int i = 0; i < ps.k; i++) {
+          const auto n      = indices_Cagra[q * ps.k + i];
+          unacceptable_node = unacceptable_node | !test_cagra_sample_filter()(q, n);
+        }
+      }
+      EXPECT_FALSE(unacceptable_node);
+
+      double min_recall = ps.min_recall;
+      EXPECT_TRUE(eval_neighbours(indices_naive,
+                                  indices_Cagra,
+                                  distances_naive,
+                                  distances_Cagra,
+                                  ps.n_queries,
+                                  ps.k,
+                                  0.001,
+                                  min_recall));
+      EXPECT_TRUE(eval_distances(handle_,
+                                 database.data(),
+                                 search_queries.data(),
+                                 indices_dev.data(),
+                                 distances_dev.data(),
+                                 ps.n_rows,
+                                 ps.dim,
+                                 ps.n_queries,
+                                 ps.k,
+                                 ps.metric,
+                                 1.0e-4));
+    }
+  }
+
+  void SetUp() override
+  {
+    database.resize(((size_t)ps.n_rows) * ps.dim, stream_);
+    search_queries.resize(ps.n_queries * ps.dim, stream_);
+    raft::random::Rng r(1234ULL);
+    if constexpr (std::is_same<DataT, float>{}) {
+      r.normal(database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0), stream_);
+      r.normal(search_queries.data(), ps.n_queries * ps.dim, DataT(0.1), DataT(2.0), stream_);
+    } else {
+      r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), stream_);
+      r.uniformInt(search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20), stream_);
+    }
+    resource::sync_stream(handle_);
+  }
+
+  void TearDown() override
+  {
+    resource::sync_stream(handle_);
+    database.resize(0, stream_);
+    search_queries.resize(0, stream_);
+  }
+
+ private:
+  raft::resources handle_;
+  rmm::cuda_stream_view stream_;
+  AnnCagraInputs ps;
+  rmm::device_uvector<DataT> database;
+  rmm::device_uvector<DataT> search_queries;
+};
+
 inline std::vector<AnnCagraInputs> generate_inputs()
 {
   // TODO(tfeher): test MULTI_CTA kernel with search_width > 1 to allow multiple CTA per queries
@@ -372,7 +562,8 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {100},
     {1000},
     {1, 8, 17},
-    {1, 16},          // k
+    {1, 16},  // k
+    {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
     {search_algo::SINGLE_CTA, search_algo::MULTI_CTA, search_algo::MULTI_KERNEL},
     {0, 1, 10, 100},  // query size
     {0},
@@ -388,6 +579,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {1000},
     {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024},  // dim
     {16},                                                         // k
+    {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
     {search_algo::AUTO},
     {10},
     {0},
@@ -398,68 +590,55 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {true},
     {0.995});
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
-  inputs2 =
-    raft::util::itertools::product<AnnCagraInputs>({100},
-                                                   {1000},
-                                                   {64},
-                                                   {16},
-                                                   {search_algo::AUTO},
-                                                   {10},
-                                                   {0, 4, 8, 16, 32},  // team_size
-                                                   {64},
-                                                   {1},
-                                                   {raft::distance::DistanceType::L2Expanded},
-                                                   {false},
-                                                   {false},
-                                                   {0.995});
-  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
-
-  inputs2 =
-    raft::util::itertools::product<AnnCagraInputs>({100},
-                                                   {1000},
-                                                   {64},
-                                                   {16},
-                                                   {search_algo::AUTO},
-                                                   {10},
-                                                   {0},  // team_size
-                                                   {32, 64, 128, 256, 512, 768},
-                                                   {1},
-                                                   {raft::distance::DistanceType::L2Expanded},
-                                                   {false},
-                                                   {true},
-                                                   {0.995});
+  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+    {100},
+    {1000},
+    {64},
+    {16},
+    {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
+    {search_algo::AUTO},
+    {10},
+    {0, 4, 8, 16, 32},  // team_size
+    {64},
+    {1},
+    {raft::distance::DistanceType::L2Expanded},
+    {false},
+    {false},
+    {0.995});
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
 
-  inputs2 =
-    raft::util::itertools::product<AnnCagraInputs>({100},
-                                                   {10000, 20000},
-                                                   {32},
-                                                   {10},
-                                                   {search_algo::AUTO},
-                                                   {10},
-                                                   {0},  // team_size
-                                                   {64},
-                                                   {1},
-                                                   {raft::distance::DistanceType::L2Expanded},
-                                                   {false, true},
-                                                   {false},
-                                                   {0.995});
+  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+    {100},
+    {1000},
+    {64},
+    {16},
+    {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
+    {search_algo::AUTO},
+    {10},
+    {0},  // team_size
+    {32, 64, 128, 256, 512, 768},
+    {1},
+    {raft::distance::DistanceType::L2Expanded},
+    {false},
+    {true},
+    {0.995});
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
 
-  inputs2 =
-    raft::util::itertools::product<AnnCagraInputs>({100},
-                                                   {10000, 20000},
-                                                   {32},
-                                                   {10},
-                                                   {search_algo::AUTO},
-                                                   {10},
-                                                   {0},  // team_size
-                                                   {64},
-                                                   {1},
-                                                   {raft::distance::DistanceType::L2Expanded},
-                                                   {false, true},
-                                                   {true},
-                                                   {0.995});
+  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+    {100},
+    {10000, 20000},
+    {32},
+    {10},
+    {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
+    {search_algo::AUTO},
+    {10},
+    {0},  // team_size
+    {64},
+    {1},
+    {raft::distance::DistanceType::L2Expanded},
+    {false, true},
+    {false},
+    {0.995});
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
 
   return inputs;
@@ -467,4 +646,4 @@ inline std::vector<AnnCagraInputs> generate_inputs()
 
 const std::vector<AnnCagraInputs> inputs = generate_inputs();
 
-}  // namespace raft::neighbors::experimental::cagra
+}  // namespace raft::neighbors::cagra
diff --git a/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh b/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh
index f61e476652..175e4ef483 100644
--- a/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh
+++ b/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh
@@ -1,93 +1,107 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
-
-namespace raft::neighbors::cagra::detail {
-
-namespace multi_cta_search {
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)   \
-  extern template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                     \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                          \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t result_buffer_size,                                                                \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    uint32_t num_cta_per_query,                                                                 \
-    uint32_t num_random_samplings,                                                              \
-    uint64_t rand_xor_mask,                                                                     \
-    uint32_t num_seeds,                                                                         \
-    size_t itopk_size,                                                                          \
-    size_t search_width,                                                                        \
-    size_t min_iterations,                                                                      \
-    size_t max_iterations,                                                                      \
-    cudaStream_t stream);
-
-instantiate_kernel_selection(32, 1024, float, uint64_t, float);
-instantiate_kernel_selection(8, 128, float, uint64_t, float);
-instantiate_kernel_selection(16, 256, float, uint64_t, float);
-instantiate_kernel_selection(32, 512, float, uint64_t, float);
-
-#undef instantiate_kernel_selection
-}  // namespace multi_cta_search
-
-namespace single_cta_search {
-
-#define instantiate_single_cta_select_and_run(                                                  \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                                      \
-  extern template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                     \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                          \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t num_itopk_candidates,                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    size_t small_hash_bitlen,                                                                   \
-    size_t small_hash_reset_interval,                                                           \
-    uint32_t num_random_samplings,                                                              \
-    uint64_t rand_xor_mask,                                                                     \
-    uint32_t num_seeds,                                                                         \
-    size_t itopk_size,                                                                          \
-    size_t search_width,                                                                        \
-    size_t min_iterations,                                                                      \
-    size_t max_iterations,                                                                      \
-    cudaStream_t stream);
-
-instantiate_single_cta_select_and_run(32, 1024, float, uint64_t, float);
-instantiate_single_cta_select_and_run(8, 128, float, uint64_t, float);
-instantiate_single_cta_select_and_run(16, 256, float, uint64_t, float);
-instantiate_single_cta_select_and_run(32, 512, float, uint64_t, float);
-
-}  // namespace single_cta_search
-}  // namespace raft::neighbors::cagra::detail
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/neighbors/sample_filter_types.hpp>  // none_cagra_sample_filter
+#include <raft/util/raft_explicit.hpp>             // RAFT_EXPLICIT
+
+namespace raft::neighbors::cagra::detail {
+
+namespace multi_cta_search {  // extern template declarations of select_and_run (no definitions)
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  extern template void                                                                      \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
+    cudaStream_t stream);
+// Declared for float data, uint64_t indices, float distances and none_cagra_sample_filter.
+instantiate_kernel_selection(
+  32, 1024, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  8, 128, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  16, 256, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 512, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+
+#undef instantiate_kernel_selection
+}  // namespace multi_cta_search
+
+namespace single_cta_search {  // extern template declarations of select_and_run (no definitions)
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  extern template void                                                                      \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
+    cudaStream_t stream);
+
+instantiate_single_cta_select_and_run(
+  32, 1024, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  8, 128, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  16, 256, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 512, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+
+#undef instantiate_single_cta_select_and_run
+}  // namespace single_cta_search
+}  // namespace raft::neighbors::cagra::detail
diff --git a/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu b/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu
index fa3d76d066..6f9e8dbd43 100644
--- a/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu
+++ b/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu
@@ -19,11 +19,11 @@
 #include "../ann_cagra.cuh"
 #include "search_kernel_uint64_t.cuh"
 
-namespace raft::neighbors::experimental::cagra {
+namespace raft::neighbors::cagra {
 
 typedef AnnCagraTest<float, float, std::int64_t> AnnCagraTestF_I64;
 TEST_P(AnnCagraTestF_I64, AnnCagra) { this->testCagra(); }
 
 INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestF_I64, ::testing::ValuesIn(inputs));
 
-}  // namespace raft::neighbors::experimental::cagra
+}  // namespace raft::neighbors::cagra
diff --git a/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu
index dbaf4dedd9..01d7e1e1ea 100644
--- a/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu
+++ b/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu
@@ -18,7 +18,7 @@
 
 #include "../ann_cagra.cuh"
 
-namespace raft::neighbors::experimental::cagra {
+namespace raft::neighbors::cagra {
 
 typedef AnnCagraTest<float, float, std::uint32_t> AnnCagraTestF_U32;
 TEST_P(AnnCagraTestF_U32, AnnCagra) { this->testCagra(); }
@@ -26,7 +26,11 @@ TEST_P(AnnCagraTestF_U32, AnnCagra) { this->testCagra(); }
 typedef AnnCagraSortTest<float, float, std::uint32_t> AnnCagraSortTestF_U32;
 TEST_P(AnnCagraSortTestF_U32, AnnCagraSort) { this->testCagraSort(); }
 
+typedef AnnCagraFilterTest<float, float, std::uint32_t> AnnCagraFilterTestF_U32;
+TEST_P(AnnCagraFilterTestF_U32, AnnCagraFilter) { this->testCagraFilter(); }
+
 INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestF_U32, ::testing::ValuesIn(inputs));
 INSTANTIATE_TEST_CASE_P(AnnCagraSortTest, AnnCagraSortTestF_U32, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(AnnCagraFilterTest, AnnCagraFilterTestF_U32, ::testing::ValuesIn(inputs));
 
-}  // namespace raft::neighbors::experimental::cagra
+}  // namespace raft::neighbors::cagra
diff --git a/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
index ba60131677..ee06d369fa 100644
--- a/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
+++ b/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
@@ -18,14 +18,17 @@
 
 #include "../ann_cagra.cuh"
 
-namespace raft::neighbors::experimental::cagra {
+namespace raft::neighbors::cagra {
 
 typedef AnnCagraTest<float, std::int8_t, std::uint32_t> AnnCagraTestI8_U32;
 TEST_P(AnnCagraTestI8_U32, AnnCagra) { this->testCagra(); }
 typedef AnnCagraSortTest<float, std::int8_t, std::uint32_t> AnnCagraSortTestI8_U32;
 TEST_P(AnnCagraSortTestI8_U32, AnnCagraSort) { this->testCagraSort(); }
+typedef AnnCagraFilterTest<float, std::int8_t, std::uint32_t> AnnCagraFilterTestI8_U32;
+TEST_P(AnnCagraFilterTestI8_U32, AnnCagraFilter) { this->testCagraFilter(); }
 
 INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestI8_U32, ::testing::ValuesIn(inputs));
 INSTANTIATE_TEST_CASE_P(AnnCagraSortTest, AnnCagraSortTestI8_U32, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(AnnCagraFilterTest, AnnCagraFilterTestI8_U32, ::testing::ValuesIn(inputs));
 
-}  // namespace raft::neighbors::experimental::cagra
+}  // namespace raft::neighbors::cagra
diff --git a/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
index cc172e4833..3243e73ccd 100644
--- a/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
+++ b/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
@@ -18,7 +18,7 @@
 
 #include "../ann_cagra.cuh"
 
-namespace raft::neighbors::experimental::cagra {
+namespace raft::neighbors::cagra {
 
 typedef AnnCagraTest<float, std::uint8_t, std::uint32_t> AnnCagraTestU8_U32;
 TEST_P(AnnCagraTestU8_U32, AnnCagra) { this->testCagra(); }
@@ -26,7 +26,11 @@ TEST_P(AnnCagraTestU8_U32, AnnCagra) { this->testCagra(); }
 typedef AnnCagraSortTest<float, std::uint8_t, std::uint32_t> AnnCagraSortTestU8_U32;
 TEST_P(AnnCagraSortTestU8_U32, AnnCagraSort) { this->testCagraSort(); }
 
+typedef AnnCagraFilterTest<float, std::uint8_t, std::uint32_t> AnnCagraFilterTestU8_U32;
+TEST_P(AnnCagraFilterTestU8_U32, AnnCagraFilter) { this->testCagraFilter(); }
+
 INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestU8_U32, ::testing::ValuesIn(inputs));
 INSTANTIATE_TEST_CASE_P(AnnCagraSortTest, AnnCagraSortTestU8_U32, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(AnnCagraFilterTest, AnnCagraFilterTestU8_U32, ::testing::ValuesIn(inputs));
 
-}  // namespace raft::neighbors::experimental::cagra
+}  // namespace raft::neighbors::cagra
diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh
new file mode 100644
index 0000000000..948323cf6e
--- /dev/null
+++ b/cpp/test/neighbors/ann_nn_descent.cuh
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../test_utils.cuh"
+#include "ann_utils.cuh"
+
+#include <raft_internal/neighbors/naive_knn.cuh>
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/neighbors/nn_descent.cuh>
+#include <raft/util/itertools.hpp>
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace raft::neighbors::experimental::nn_descent {
+
+struct AnnNNDescentInputs {
+  int n_rows;                           // number of dataset rows
+  int dim;                              // number of columns per dataset row
+  int graph_degree;                     // built graph degree; also k for the naive reference
+  raft::distance::DistanceType metric;  // metric given to both the build and the reference
+  bool host_dataset;                    // true: build from a host copy, false: from device data
+  double min_recall;                    // threshold passed to eval_recall
+};
+
+// Streams a one-line, std::endl-terminated description of a test case.
+inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& p)
+{
+  const char* residency = p.host_dataset ? ", host" : ", device";
+  return os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree
+            << ", metric=" << static_cast<int>(p.metric) << residency << std::endl;
+}
+
+// Fixture that builds an nn_descent graph for a random dataset and compares it with the
+// brute-force neighbors of the same dataset (see testNNDescent).
+template <typename DistanceT, typename DataT, typename IdxT>
+class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
+ public:
+  AnnNNDescentTest()
+    : stream_(resource::get_cuda_stream(handle_)),
+      ps(::testing::TestWithParam<AnnNNDescentInputs>::GetParam()),
+      database(0, stream_)
+  {
+  }
+
+ protected:
+  // Builds the graph from a host or device dataset (ps.host_dataset) and checks its recall
+  // against naive_knn with k = ps.graph_degree.
+  void testNNDescent()
+  {
+    // Widen before multiplying so the element count is never computed in int.
+    size_t queries_size = static_cast<size_t>(ps.n_rows) * ps.graph_degree;
+    std::vector<IdxT> indices_NNDescent(queries_size);
+    std::vector<IdxT> indices_naive(queries_size);
+
+    {  // Reference result; the scope ends the lifetime of the device temporaries.
+      rmm::device_uvector<DistanceT> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
+      naive_knn<DistanceT, DataT, IdxT>(handle_,
+                                        distances_naive_dev.data(),
+                                        indices_naive_dev.data(),
+                                        database.data(),
+                                        database.data(),
+                                        ps.n_rows,
+                                        ps.n_rows,
+                                        ps.dim,
+                                        ps.graph_degree,
+                                        ps.metric);
+      update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
+      resource::sync_stream(handle_);
+    }
+
+    nn_descent::index_params index_params;
+    index_params.metric                    = ps.metric;
+    index_params.graph_degree              = ps.graph_degree;
+    index_params.intermediate_graph_degree = 2 * ps.graph_degree;
+
+    auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
+      (const DataT*)database.data(), ps.n_rows, ps.dim);
+
+    // Each branch copies the graph to the host before its index goes out of scope.
+    if (ps.host_dataset) {
+      auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+      raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+      auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
+        (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
+      auto index = nn_descent::build<DataT, IdxT>(handle_, index_params, database_host_view);
+      update_host(indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_);
+    } else {
+      auto index = nn_descent::build<DataT, IdxT>(handle_, index_params, database_view);
+      update_host(indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_);
+    }
+    resource::sync_stream(handle_);
+
+    EXPECT_TRUE(eval_recall(
+      indices_naive, indices_NNDescent, ps.n_rows, ps.graph_degree, 0.001, ps.min_recall));
+  }
+
+  // Allocates the dataset and fills it with random values from a fixed-seed generator:
+  // float data uses Rng::normal, any other DataT uses Rng::uniformInt.
+  void SetUp() override
+  {
+    database.resize(((size_t)ps.n_rows) * ps.dim, stream_);
+    raft::random::Rng r(1234ULL);
+    if constexpr (std::is_same<DataT, float>{}) {
+      r.normal(database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0), stream_);
+    } else {
+      r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), stream_);
+    }
+    resource::sync_stream(handle_);
+  }
+
+  // Waits for outstanding stream work, then shrinks the dataset buffer to zero elements.
+  void TearDown() override
+  {
+    resource::sync_stream(handle_);
+    database.resize(0, stream_);
+  }
+
+ private:
+  raft::resources handle_;  // declared before stream_, which is initialized from it
+  rmm::cuda_stream_view stream_;
+  AnnNNDescentInputs ps;
+  rmm::device_uvector<DataT> database;
+};
+
+const std::vector<AnnNNDescentInputs> inputs = raft::util::itertools::product<AnnNNDescentInputs>(
+  {1000, 2000},                                              // n_rows
+  {3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024},  // dim
+  {32, 64},                                                  // graph_degree
+  {raft::distance::DistanceType::L2Expanded},                // metric
+  {false, true},                                             // host_dataset
+  {0.92});                                                   // min_recall
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu
new file mode 100644
index 0000000000..13bff6ac90
--- /dev/null
+++ b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../ann_nn_descent.cuh"
+
+namespace raft::neighbors::experimental::nn_descent {
+
+typedef AnnNNDescentTest<float, float, std::uint32_t> AnnNNDescentTestF_U32;
+TEST_P(AnnNNDescentTestF_U32, AnnNNDescent) { this->testNNDescent(); }
+
+INSTANTIATE_TEST_CASE_P(AnnNNDescentTest, AnnNNDescentTestF_U32, ::testing::ValuesIn(inputs));
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
new file mode 100644
index 0000000000..5895303e09
--- /dev/null
+++ b/cpp/test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../ann_nn_descent.cuh"
+
+namespace raft::neighbors::experimental::nn_descent {
+
+// int8_t data, std::uint32_t indices (per the file name); runs once per entry of `inputs`.
+using AnnNNDescentTestI8_U32 = AnnNNDescentTest<float, int8_t, std::uint32_t>;
+TEST_P(AnnNNDescentTestI8_U32, AnnCagra) { this->testNNDescent(); }
+INSTANTIATE_TEST_CASE_P(AnnNNDescentTest, AnnNNDescentTestI8_U32, ::testing::ValuesIn(inputs));
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
new file mode 100644
index 0000000000..a034e84074
--- /dev/null
+++ b/cpp/test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../ann_nn_descent.cuh"
+
+namespace raft::neighbors::experimental::nn_descent {
+
+// uint8_t data, std::uint32_t indices (per the file name); runs once per entry of `inputs`.
+using AnnNNDescentTestUI8_U32 = AnnNNDescentTest<float, uint8_t, std::uint32_t>;
+TEST_P(AnnNNDescentTestUI8_U32, AnnCagra) { this->testNNDescent(); }
+INSTANTIATE_TEST_CASE_P(AnnNNDescentTest, AnnNNDescentTestUI8_U32, ::testing::ValuesIn(inputs));
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh
index 0e54e29c01..be60ec5b6d 100644
--- a/cpp/test/neighbors/ann_utils.cuh
+++ b/cpp/test/neighbors/ann_utils.cuh
@@ -123,6 +123,49 @@ struct idx_dist_pair {
   idx_dist_pair(IdxT x, DistT y, CompareDist op) : idx(x), dist(y), eq_compare(op) {}
 };
 
+/** Row-wise recall of `actual_idx` against `expected_idx` (both row-major, `rows` x `cols`): the
+ * fraction of actual indices found anywhere in the matching expected row. Fails when the recall is
+ * below `min_recall - eps`, or when either vector holds fewer than `rows * cols` elements. */
+template <typename T>
+auto eval_recall(const std::vector<T>& expected_idx,
+                 const std::vector<T>& actual_idx,
+                 size_t rows,
+                 size_t cols,
+                 double eps,
+                 double min_recall) -> testing::AssertionResult
+{
+  const size_t total_count = rows * cols;
+  if (expected_idx.size() < total_count || actual_idx.size() < total_count) {
+    return testing::AssertionFailure()
+           << "each index vector needs at least rows * cols = " << total_count << " elements. ";
+  }
+  size_t match_count = 0;
+  for (size_t i = 0; i < rows; ++i) {
+    const T* row_begin = expected_idx.data() + i * cols;  // row major assumption!
+    const T* row_end   = row_begin + cols;
+    for (size_t k = 0; k < cols; ++k) {
+      if (std::find(row_begin, row_end, actual_idx[i * cols + k]) != row_end) { ++match_count; }
+    }
+  }
+  double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
+  double error_margin  = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps);
+  RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).",
+                actual_recall,
+                match_count,
+                total_count,
+                std::abs(error_margin * 100.0),
+                error_margin < 0 ? "above" : "below",
+                eps);
+  if (actual_recall < min_recall - eps) {
+    return testing::AssertionFailure()
+           << "actual recall (" << actual_recall << ") is lower than the minimum expected recall ("
+           << min_recall << "); eps = " << eps << ". ";
+  }
+  return testing::AssertionSuccess();
+}
+
+/** same as eval_recall, but in case indices do not match,
+ * then check distances as well, and accept match if actual dist is equal to expected_dist */
 template <typename T, typename DistT>
 auto eval_neighbours(const std::vector<T>& expected_idx,
                      const std::vector<T>& actual_idx,
diff --git a/cpp/test/neighbors/tiled_knn.cu b/cpp/test/neighbors/tiled_knn.cu
index 2ab82b845e..ebde8e6d35 100644
--- a/cpp/test/neighbors/tiled_knn.cu
+++ b/cpp/test/neighbors/tiled_knn.cu
@@ -180,6 +180,36 @@ class TiledKNNTest : public ::testing::TestWithParam<TiledKNNInputs> {
                                                        float(0.001),
                                                        stream_,
                                                        true));
+
+    // Also test out the 'index' api - where we can use precomputed norms
+    if (params_.row_major) {
+      auto idx =
+        raft::neighbors::brute_force::build<T>(handle_,
+                                               raft::make_device_matrix_view<const T, int64_t>(
+                                                 database.data(), params_.num_db_vecs, params_.dim),
+                                               metric,
+                                               metric_arg);
+
+      raft::neighbors::brute_force::search<T, int>(
+        handle_,
+        idx,
+        raft::make_device_matrix_view<const T, int64_t>(
+          search_queries.data(), params_.num_queries, params_.dim),
+        raft::make_device_matrix_view<int, int64_t>(
+          raft_indices_.data(), params_.num_queries, params_.k),
+        raft::make_device_matrix_view<T, int64_t>(
+          raft_distances_.data(), params_.num_queries, params_.k));
+
+      ASSERT_TRUE(raft::spatial::knn::devArrMatchKnnPair(ref_indices_.data(),
+                                                         raft_indices_.data(),
+                                                         ref_distances_.data(),
+                                                         raft_distances_.data(),
+                                                         num_queries,
+                                                         k_,
+                                                         float(0.001),
+                                                         stream_,
+                                                         true));
+    }
   }
 
   void SetUp() override
diff --git a/cpp/test/util/bitonic_sort.cu b/cpp/test/util/bitonic_sort.cu
index d1f03f78b5..2cf5420334 100644
--- a/cpp/test/util/bitonic_sort.cu
+++ b/cpp/test/util/bitonic_sort.cu
@@ -103,12 +103,12 @@ struct bitonic_launch {
 };
 
 template <typename T>
-class BitonicTest : public testing::TestWithParam<test_spec> {     // NOLINT
+class BitonicTest : public testing::TestWithParam<test_spec> {  // NOLINT
  protected:
-  const test_spec spec;                                            // NOLINT
-  std::vector<T> in;                                               // NOLINT
-  std::vector<T> out;                                              // NOLINT
-  std::vector<T> ref;                                              // NOLINT
+  const test_spec spec;  // NOLINT
+  std::vector<T> in;     // NOLINT
+  std::vector<T> out;    // NOLINT
+  std::vector<T> ref;    // NOLINT
 
   void segmented_sort(std::vector<T>& vec, int k, bool ascending)  // NOLINT
   {
@@ -184,13 +184,13 @@ auto inputs = ::testing::Values(test_spec{1, 1, 1, true},
                                 test_spec{70, 1, 64, true},
                                 test_spec{70, 2, 128, false});
 
-using Floats = BitonicTest<float>;                      // NOLINT
-TEST_P(Floats, Run) { run(); }                          // NOLINT
-INSTANTIATE_TEST_CASE_P(BitonicTest, Floats, inputs);   // NOLINT
+using Floats = BitonicTest<float>;                     // NOLINT
+TEST_P(Floats, Run) { run(); }                         // NOLINT
+INSTANTIATE_TEST_CASE_P(BitonicTest, Floats, inputs);  // NOLINT
 
-using Ints = BitonicTest<int>;                          // NOLINT
-TEST_P(Ints, Run) { run(); }                            // NOLINT
-INSTANTIATE_TEST_CASE_P(BitonicTest, Ints, inputs);     // NOLINT
+using Ints = BitonicTest<int>;                       // NOLINT
+TEST_P(Ints, Run) { run(); }                         // NOLINT
+INSTANTIATE_TEST_CASE_P(BitonicTest, Ints, inputs);  // NOLINT
 
 using Doubles = BitonicTest<double>;                    // NOLINT
 TEST_P(Doubles, Run) { run(); }                         // NOLINT
diff --git a/cpp/test/util/reduction.cu b/cpp/test/util/reduction.cu
index 17deaf99eb..548d3b9d53 100644
--- a/cpp/test/util/reduction.cu
+++ b/cpp/test/util/reduction.cu
@@ -147,9 +147,9 @@ struct reduction_launch {
 template <typename T>
 class ReductionTest : public testing::TestWithParam<std::vector<int>> {  // NOLINT
  protected:
-  const std::vector<int> input;                                          // NOLINT
-  rmm::cuda_stream_view stream;                                          // NOLINT
-  rmm::device_uvector<int> arr_d;                                        // NOLINT
+  const std::vector<int> input;    // NOLINT
+  rmm::cuda_stream_view stream;    // NOLINT
+  rmm::device_uvector<int> arr_d;  // NOLINT
 
  public:
   explicit ReductionTest()
@@ -184,8 +184,8 @@ const std::vector<int> binary_test_vector{
 auto reduction_input        = ::testing::Values(test_vector);
 auto binary_reduction_input = ::testing::Values(binary_test_vector);
 
-using ReductionTestInt       = ReductionTest<int>;                            // NOLINT
-using BinaryReductionTestInt = ReductionTest<int>;                            // NOLINT
+using ReductionTestInt       = ReductionTest<int>;  // NOLINT
+using BinaryReductionTestInt = ReductionTest<int>;  // NOLINT
 TEST_P(ReductionTestInt, REDUCTIONS) { run_reduction(); }
 INSTANTIATE_TEST_CASE_P(ReductionTest, ReductionTestInt, reduction_input);    // NOLINT
 TEST_P(BinaryReductionTestInt, BINARY_REDUCTION) { run_binary_reduction(); }  // NOLINT
diff --git a/dependencies.yaml b/dependencies.yaml
index 6f64287f54..3ad51a6377 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -10,12 +10,15 @@ files:
       - build_pylibraft
       - cudatoolkit
       - develop
+      - checks
+      - build_wheels
       - test_libraft
       - docs
       - run_raft_dask
       - run_pylibraft
       - test_python_common
       - test_pylibraft
+      - cupy
   bench_ann:
     output: conda
     matrix:
@@ -38,6 +41,7 @@ files:
       - py_version
       - test_python_common
       - test_pylibraft
+      - cupy
   checks:
     output: none
     includes:
@@ -47,6 +51,7 @@ files:
     output: none
     includes:
       - test_pylibraft
+      - cupy
       - cudatoolkit
       - docs
       - py_version
@@ -75,6 +80,7 @@ files:
     includes:
       - test_python_common
       - test_pylibraft
+      - cupy
   py_build_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
@@ -145,11 +151,37 @@ dependencies:
             packages:
               - gcc_linux-aarch64=11.*
               - sysroot_linux-aarch64==2.17
+      - output_types: conda
+        matrices:
+          - matrix: {cuda: "12.0"}
+            packages: [cuda-version=12.0, cuda-nvcc]
+          - matrix: {cuda: "11.8", arch: x86_64}
+            packages: [nvcc_linux-64=11.8]
+          - matrix: {cuda: "11.8", arch: aarch64}
+            packages: [nvcc_linux-aarch64=11.8]
+          - matrix: {cuda: "11.5", arch: x86_64}
+            packages: [nvcc_linux-64=11.5]
+          - matrix: {cuda: "11.5", arch: aarch64}
+            packages: [nvcc_linux-aarch64=11.5]
+          - matrix: {cuda: "11.4", arch: x86_64}
+            packages: [nvcc_linux-64=11.4]
+          - matrix: {cuda: "11.4", arch: aarch64}
+            packages: [nvcc_linux-aarch64=11.4]
+          - matrix: {cuda: "11.2", arch: x86_64}
+            packages: [nvcc_linux-64=11.2]
+          - matrix: {cuda: "11.2", arch: aarch64}
+            packages: [nvcc_linux-aarch64=11.2]
+
   build_pylibraft:
     common:
-      - output_types: [conda, requirements, pyproject]
+      - output_types: [conda]
         packages:
-          - &rmm rmm==23.10.*
+          - &rmm_conda rmm==23.10.*
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm-cu{11,12}.
+          - --extra-index-url=https://pypi.nvidia.com
     specific:
       - output_types: [conda, requirements, pyproject]
         matrices:
@@ -160,6 +192,20 @@ dependencies:
           - matrix: # All CUDA 11 versions
             packages:
               - &cuda_python11 cuda-python>=11.7.1,<12.0a0
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.2"}
+            packages: &build_pylibraft_packages_cu12
+              - &rmm_cu12 rmm-cu12==23.10.*
+          - {matrix: {cuda: "12.1"}, packages: *build_pylibraft_packages_cu12}
+          - {matrix: {cuda: "12.0"}, packages: *build_pylibraft_packages_cu12}
+          - matrix: {cuda: "11.8"}
+            packages: &build_pylibraft_packages_cu11
+              - &rmm_cu11 rmm-cu11==23.10.*
+          - {matrix: {cuda: "11.5"}, packages: *build_pylibraft_packages_cu11}
+          - {matrix: {cuda: "11.4"}, packages: *build_pylibraft_packages_cu11}
+          - {matrix: {cuda: "11.2"}, packages: *build_pylibraft_packages_cu11}
+          - {matrix: null, packages: [*rmm_conda] }
   checks:
     common:
       - output_types: [conda, requirements]
@@ -167,12 +213,10 @@ dependencies:
           - pre-commit
   develop:
     common:
-      - output_types: [conda, requirements]
-        packages:
-          - clang=16.0.1
-      - output_types: [conda]
+      - output_types: conda
         packages:
-          - clang-tools=16.0.1
+          - clang==16.0.6
+          - clang-tools=16.0.6
   nn_bench:
     common:
       - output_types: [conda, pyproject, requirements]
@@ -265,6 +309,45 @@ dependencies:
               - *libcusolver114
               - *libcusparse_dev114
               - *libcusparse114
+
+  cupy:
+    common:
+      - output_types: conda
+        packages:
+          - cupy>=12.0.0
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          # All CUDA 12 + x86_64 versions
+          - matrix: {cuda: "12.2", arch: x86_64}
+            packages: &cupy_packages_cu12_x86_64
+              - &cupy_cu12_x86_64 cupy-cuda12x>=12.0.0
+          - {matrix: {cuda: "12.1", arch: x86_64}, packages: *cupy_packages_cu12_x86_64}
+          - {matrix: {cuda: "12.0", arch: x86_64}, packages: *cupy_packages_cu12_x86_64}
+          # All CUDA 12 + aarch64 versions
+          - matrix: {cuda: "12.2", arch: aarch64}
+            packages: &cupy_packages_cu12_aarch64
+              - &cupy_cu12_aarch64 cupy-cuda12x -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works.
+          - {matrix: {cuda: "12.1", arch: aarch64}, packages: *cupy_packages_cu12_aarch64}
+          - {matrix: {cuda: "12.0", arch: aarch64}, packages: *cupy_packages_cu12_aarch64}
+
+          # All CUDA 11 + x86_64 versions
+          - matrix: {cuda: "11.8", arch: x86_64}
+            packages: &cupy_packages_cu11_x86_64
+              - cupy-cuda11x>=12.0.0
+          - {matrix: {cuda: "11.5", arch: x86_64}, packages: *cupy_packages_cu11_x86_64}
+          - {matrix: {cuda: "11.4", arch: x86_64}, packages: *cupy_packages_cu11_x86_64}
+          - {matrix: {cuda: "11.2", arch: x86_64}, packages: *cupy_packages_cu11_x86_64}
+
+          # All CUDA 11 + aarch64 versions
+          - matrix: {cuda: "11.8", arch: aarch64}
+            packages: &cupy_packages_cu11_aarch64
+              - cupy-cuda11x -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works.
+          - {matrix: {cuda: "11.5", arch: aarch64}, packages: *cupy_packages_cu11_aarch64}
+          - {matrix: {cuda: "11.4", arch: aarch64}, packages: *cupy_packages_cu11_aarch64}
+          - {matrix: {cuda: "11.2", arch: aarch64}, packages: *cupy_packages_cu11_aarch64}
+          - {matrix: null, packages: [cupy-cuda11x>=12.0.0]}
+
   test_libraft:
     common:
       - output_types: [conda]
@@ -287,7 +370,7 @@ dependencies:
           - sphinx-markdown-tables
   build_wheels:
     common:
-      - output_types: pyproject
+      - output_types: [requirements, pyproject]
         packages:
           - wheel
           - setuptools
@@ -311,7 +394,14 @@ dependencies:
       - output_types: [conda, pyproject]
         packages:
           - &numpy numpy>=1.21
-          - *rmm
+      - output_types: [conda]
+        packages:
+          - *rmm_conda
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm-cu{11,12}.
+          - --extra-index-url=https://pypi.nvidia.com
     specific:
       - output_types: [conda, requirements, pyproject]
         matrices:
@@ -322,25 +412,61 @@ dependencies:
           - matrix: # All CUDA 11 versions
             packages:
               - *cuda_python11
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.2"}
+            packages: &run_pylibraft_packages_cu12
+              - *rmm_cu12
+          - {matrix: {cuda: "12.1"}, packages: *run_pylibraft_packages_cu12}
+          - {matrix: {cuda: "12.0"}, packages: *run_pylibraft_packages_cu12}
+          - matrix: {cuda: "11.8"}
+            packages: &run_pylibraft_packages_cu11
+              - *rmm_cu11
+          - {matrix: {cuda: "11.5"}, packages: *run_pylibraft_packages_cu11}
+          - {matrix: {cuda: "11.4"}, packages: *run_pylibraft_packages_cu11}
+          - {matrix: {cuda: "11.2"}, packages: *run_pylibraft_packages_cu11}
+          - {matrix: null, packages: [*rmm_conda]}
   run_raft_dask:
     common:
       - output_types: [conda, pyproject]
         packages:
-          - dask>=2023.7.1
+          - dask==2023.9.2
           - dask-cuda==23.10.*
-          - distributed>=2023.7.1
+          - distributed==2023.9.2
           - joblib>=0.11
           - numba>=0.57
           - *numpy
-          - ucx-py==0.34.*
       - output_types: conda
         packages:
-          - dask-core>=2023.7.1
+          - dask-core==2023.9.2
           - ucx>=1.13.0
           - ucx-proc=*=gpu
+          - &ucx_py_conda ucx-py==0.34.*
       - output_types: pyproject
         packages:
-          - pylibraft==23.10.*
+          - &pylibraft_conda pylibraft==23.10.*
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for pylibraft-cu{11,12} and ucx-py-cu{11,12}.
+          - --extra-index-url=https://pypi.nvidia.com
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.2"}
+            packages: &run_raft_dask_packages_cu12
+              - &pylibraft_cu12 pylibraft-cu12==23.10.*
+              - &ucx_py_cu12 ucx-py-cu12==0.34.*
+          - {matrix: {cuda: "12.1"}, packages: *run_raft_dask_packages_cu12}
+          - {matrix: {cuda: "12.0"}, packages: *run_raft_dask_packages_cu12}
+          - matrix: {cuda: "11.8"}
+            packages: &run_raft_dask_packages_cu11
+              - &pylibraft_cu11 pylibraft-cu11==23.10.*
+              - &ucx_py_cu11 ucx-py-cu11==0.34.*
+          - {matrix: {cuda: "11.5"}, packages: *run_raft_dask_packages_cu11}
+          - {matrix: {cuda: "11.4"}, packages: *run_raft_dask_packages_cu11}
+          - {matrix: {cuda: "11.2"}, packages: *run_raft_dask_packages_cu11}
+          - {matrix: null, packages: [*pylibraft_conda, *ucx_py_conda]}
   test_python_common:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -353,9 +479,3 @@ dependencies:
         packages:
           - scikit-learn
           - scipy
-      - output_types: conda
-        packages:
-          - cupy>=12.0.0
-      - output_types: pyproject
-        packages:
-          - cupy-cuda11x>=12.0.0
diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md
index 020c2d5ad9..433df2ae2f 100644
--- a/docs/source/ann_benchmarks_param_tuning.md
+++ b/docs/source/ann_benchmarks_param_tuning.md
@@ -1,6 +1,6 @@
 # ANN Benchmarks Parameter Tuning Guide
 
-This guide outlines the various parameter settings that can be specified in [RAFT ANN Benchmark](raft_ann_benchmarks.md) json configuration files and explains the impact they have on corresponding algorithms to help inform their settings for benchmarking across desired levels of recall. 
+This guide outlines the various parameter settings that can be specified in [RAFT ANN Benchmark](raft_ann_benchmarks.md) json configuration files and explains the impact they have on corresponding algorithms to help inform their settings for benchmarking across desired levels of recall.
 
 
 ## RAFT Indexes
@@ -11,41 +11,50 @@ IVF-flat uses an inverted-file index, which partitions the vectors into a series
 
 IVF-flat is a simple algorithm which won't save any space, but it provides competitive search times even at higher levels of recall.
 
-| Parameter | Type             | Required | Data Type           | Default | Description                                                                                                                                                                       |
-|-----------|------------------|----------|---------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `nlists`  | `build_param`    | Y        | Positive Integer >0 |         | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
-| `niter`   | `build_param`    | N        | Positive Integer >0 | 20      | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
-| `ratio`   | `build_param`     | N        | Positive Float >0   | 0.5     | Fraction of the number of training points which should be used to train the clusters.                                                                                             |
-| `nprobe`  | `search_params` | Y        |  Positive Integer >0 |         | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                       |
+| Parameter             | Type             | Required | Data Type                  | Default  | Description                                                                                                                                                                       |
+|-----------------------|------------------|----------|----------------------------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `nlists`              | `build_param`    | Y        | Positive Integer >0        |          | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
+| `niter`               | `build_param`    | N        | Positive Integer >0        | 20       | Number of k-means iterations to use when training the clusters.                                                                                                                   |
+| `ratio`               | `build_param`    | N        | Positive Integer >0        | 2        | `1/ratio` is the fraction of the training points which should be used to train the clusters.                                                                                      |
+| `dataset_memory_type` | `build_param`    | N        | ["device", "host", "mmap"] | "device" | The type of memory in which the dataset should reside.                                                                                                                            |
+| `query_memory_type`   | `search_params`  | N        | ["device", "host", "mmap"] | "device" | The type of memory in which the queries should reside.                                                                                                                            |
+| `nprobe`              | `search_params`  | Y        | Positive Integer >0        |          | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                       |
 
 
 ### `raft_ivf_pq`
 
 IVF-pq is an inverted-file index, which partitions the vectors into a series of clusters, or lists, in a similar way to IVF-flat above. The difference is that IVF-PQ uses product quantization to also compress the vectors, giving the index a smaller memory footprint. Unfortunately, higher levels of compression can also shrink recall, which a refinement step can improve when the original vectors are still available.
 
-| Parameter               | Type           | Required | Data Type                    | Default | Description                                                                                                                                                                     |
-|-------------------------|----------------|---|------------------------------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `nlists`                | `build_param`  | Y | Positive Integer >0          |         | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
-| `niter`                 | `build_param`  | N | Positive Integer >0          | 20      | Number of k-means iterations to use when training the clusters.                                                                                                                 |
+| Parameter               | Type           | Required | Data Type                        | Default | Description                                                                                                                                                                     |
+|-------------------------|----------------|---|----------------------------------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `nlists`                | `build_param`  | Y | Positive Integer >0              |         | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
+| `niter`                 | `build_param`  | N | Positive Integer >0              | 20      | Number of k-means iterations to use when training the clusters.                                                                                                                 |
+| `ratio`                 | `build_param`  | N | Positive Integer >0              | 2       | `1/ratio` is the fraction of the training points which should be used to train the clusters.                                                                                    |
 | `pq_dim`                | `build_param`  | N | Positive Integer. Multiple of 8. | 0       | Dimensionality of the vector after product quantization. When 0, a heuristic is used to select this value. `pq_dim` * `pq_bits` must be a multiple of 8.                        |
-| `pq_bits`               | `build_param`  | N | Positive Integer. [4-8]      | 8       | Bit length of the vector element after quantization.                                                                                                                            |
-| `nprobe`                | `search_params` | Y | Positive Integer >0          |         | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                     |
-| `internalDistanceDtype` | `search_params` | N | [`float`, `half`]            | `half`  | The precision to use for the distance computations. Lower precision can increase performance at the cost of accuracy.                                                           |
-| `smemLutDtype`          | `search_params` | N | [`float`, `half`, `fp8`]     | `half`  | The precision to use for the lookup table in shared memory. Lower precision can increase performance at the cost of accuracy.                                                   |
-| `refine_ratio`          | `search_params` | N| Positive Number >=0          | 0       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.           |
+| `pq_bits`               | `build_param`  | N | Positive Integer. [4-8]          | 8       | Bit length of the vector element after quantization.                                                                                                                            |
+| `codebook_kind`         | `build_param`  | N | ["cluster", "subspace"]          | "subspace" | Type of codebook. See the [API docs](https://docs.rapids.ai/api/raft/nightly/cpp_api/neighbors_ivf_pq/#_CPPv412codebook_gen) for more detail                                 |
+| `dataset_memory_type`   | `build_param`  | N | ["device", "host", "mmap"]       | "device" | The type of memory in which the dataset should reside.                                                                                                                         |
+| `query_memory_type`     | `search_params` | N | ["device", "host", "mmap"]       | "device" | The type of memory in which the queries should reside.                                                                                                                         |
+| `nprobe`                | `search_params` | Y | Positive Integer >0              |         | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                     |
+| `internalDistanceDtype` | `search_params` | N | [`float`, `half`]                | `half`  | The precision to use for the distance computations. Lower precision can increase performance at the cost of accuracy.                                                           |
+| `smemLutDtype`          | `search_params` | N | [`float`, `half`, `fp8`]         | `half`  | The precision to use for the lookup table in shared memory. Lower precision can increase performance at the cost of accuracy.                                                   |
+| `refine_ratio`          | `search_params` | N| Positive Number >=0              | 0       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.           |
 
 
 ### `raft_cagra`
 CAGRA uses a graph-based index, which creates an intermediate, approximate kNN graph using IVF-PQ and then further refining and optimizing to create a final kNN graph. This kNN graph is used by CAGRA as an index for search.
 
-| Parameter | Type           | Required | Data Type           | Default | Description                                                                                                                                                                       |
-|-----------|----------------|----------|---------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `graph_degree`  | `build_param`  | N        | Positive Integer >0 | 64 | Degree of the final kNN graph index. |
-| `intermediate_graph_degree`  | `build_param`  | N        | Positive Integer >0 | 128 | Degree of the intermediate kNN graph. |
-| `itopk`  | `search_wdith`  | N        | Positive Integer >0 | 64 | Number of intermediate search results retained during the search. Higher values improve search accuracy at the cost of speed. |
-| `search_width`  | `search_param`  | N        | Positive Integer >0 | 1 | Number of graph nodes to select as the starting point for the search in each iteration. |
-| `max_iterations`  | `search_param`  | N        | Integer >=0 | 0 | Upper limit of search iterations. Auto select when 0. |
-| `algo`  | `search_param`  | N        | string | "auto" | Algorithm to use for search. Possible values: {"auto", "single_cta", "multi_cta", "multi_kernel"} |
+| Parameter                   | Type           | Required | Data Type                  | Default | Description                                                                                                                                                                       |
+|-----------------------------|----------------|----------|----------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `graph_degree`              | `build_param`  | N        | Positive Integer >0        | 64 | Degree of the final kNN graph index. |
+| `intermediate_graph_degree` | `build_param`  | N        | Positive Integer >0        | 128 | Degree of the intermediate kNN graph. |
+| `graph_build_algo`          | `build_param`  | N | ["IVF_PQ", "NN_DESCENT"]   | "IVF_PQ" | Algorithm to use for building the intermediate kNN graph. |
+| `dataset_memory_type`       | `build_param`  | N | ["device", "host", "mmap"] | "device" | The type of memory in which the dataset should reside. |
+| `query_memory_type`         | `search_params` | N | ["device", "host", "mmap"] | "device" | The type of memory in which the queries should reside. |
+| `itopk`                     | `search_param`  | N        | Positive Integer >0        | 64 | Number of intermediate search results retained during the search. Higher values improve search accuracy at the cost of speed. |
+| `search_width`              | `search_param`  | N        | Positive Integer >0        | 1 | Number of graph nodes to select as the starting point for the search in each iteration. |
+| `max_iterations`            | `search_param`  | N        | Integer >=0                | 0 | Upper limit of search iterations. Auto select when 0. |
+| `algo`                      | `search_param`  | N        | string                     | "auto" | Algorithm to use for search. Possible values: {"auto", "single_cta", "multi_cta", "multi_kernel"} |
 
 
 ## FAISS Indexes
@@ -58,7 +67,8 @@ IVF-flat is a simple algorithm which won't save any space, but it provides compe
 
 | Parameter | Type           | Required | Data Type           | Default | Description                                                                                                                                                                       |
 |-----------|----------------|----------|---------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `nlists`  | `build_param`  | Y        | Positive Integer >0 | | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
+| `nlists`  | `build_param`  | Y        | Positive Integer >0 |         | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
+| `ratio`   | `build_param`  | N        | Positive Integer >0 | 2       | `1/ratio` is the fraction of training points which should be used to train the clusters.                                                                                          |
 | `nprobe`  | `search_params` | Y        | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                       |
 
 ### `faiss_gpu_ivf_pq`
@@ -68,6 +78,7 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
 | Parameter        | Type           | Required | Data Type                        | Default | Description                                                                                                                                                                       |
 |------------------|----------------|----------|----------------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `nlists`         | `build_param`  | Y        | Positive Integer >0              |         | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
+| `ratio`          | `build_param`  | N        | Positive Integer >0              | 2       | `1/ratio` is the fraction of training points which should be used to train the clusters.                                                                                          |
 | `M`              | `build_param`  | Y        | Positive Integer Power of 2 [8-64] |         | Number of chunks or subquantizers for each vector.                                                                                                                                |
 | `usePrecomputed` | `build_param`  | N        | Boolean. Default=`false`         | `false` | Use pre-computed lookup tables to speed up search at the cost of increased memory usage.                                                                                          |
 | `useFloat16`     | `build_param`  | N        | Boolean. Default=`false`         | `false`  | Use half-precision floats for clustering step.                                                                                                                                    |
@@ -81,6 +92,12 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
 
 ### `hnswlib`
 
-## GGNN Index
+| Parameter        | Type            | Required | Data Type                            | Default | Description                                                                                                                                                                                                                                                                                       |
+|------------------|-----------------|----------|--------------------------------------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `efConstruction` | `build_param`   | Y        | Positive Integer >0                  |         | Controls index time and accuracy. Bigger values increase the index quality. At some point, increasing this will no longer improve the quality.                                                                                                                                                    |
+| `M`              | `build_param`   | Y        | Positive Integer often between 2-100 |         | Number of bi-directional links created for every new element during construction. Higher values work for higher intrinsic dimensionality and/or high recall, low values can work for datasets with low intrinsic dimensionality and/or low recalls. Also affects the algorithm's memory consumption. |
+| `numThreads`     | `build_param`   | N        | Positive Integer >0                  | 1       | Number of threads to use to build the index.                                                                                                                                                                                                                                                      |
+| `ef`             | `search_param`  | Y        | Positive Integer >0                  |         | Size of the dynamic list for the nearest neighbors used for search. Higher value leads to more accurate but slower search. Cannot be lower than `k`.                                                                                                                                              |
+| `numThreads`     | `search_params` | N        | Positive Integer >0                  | 1       | Number of threads to use for queries.                                                                                                                                                                                                                                                             |
 
-### `ggnn`
+Please refer to [HNSW algorithm parameters guide](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) from `hnswlib` to learn more about these arguments.
\ No newline at end of file
diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst
index 0e82d81e35..e60ef4e697 100644
--- a/docs/source/cpp_api.rst
+++ b/docs/source/cpp_api.rst
@@ -18,4 +18,5 @@ C++ API
    cpp_api/random.rst
    cpp_api/solver.rst
    cpp_api/sparse.rst
-   cpp_api/stats.rst
\ No newline at end of file
+   cpp_api/stats.rst
+   cpp_api/utils.rst
\ No newline at end of file
diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst
index 7e69f92948..39e57fd69a 100644
--- a/docs/source/cpp_api/core.rst
+++ b/docs/source/cpp_api/core.rst
@@ -20,4 +20,5 @@ expose in public APIs.
    core_nvtx.rst
    core_interruptible.rst
    core_operators.rst
-   core_math.rst
\ No newline at end of file
+   core_math.rst
+   core_bitset.rst
\ No newline at end of file
diff --git a/docs/source/cpp_api/core_bitset.rst b/docs/source/cpp_api/core_bitset.rst
new file mode 100644
index 0000000000..af1cff6d37
--- /dev/null
+++ b/docs/source/cpp_api/core_bitset.rst
@@ -0,0 +1,15 @@
+Bitset
+======
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/core/bitset.cuh>``
+
+namespace *raft::core*
+
+.. doxygengroup:: bitset
+    :project: RAFT
+    :members:
+    :content-only:
\ No newline at end of file
diff --git a/docs/source/cpp_api/utils.rst b/docs/source/cpp_api/utils.rst
new file mode 100644
index 0000000000..4471093c8b
--- /dev/null
+++ b/docs/source/cpp_api/utils.rst
@@ -0,0 +1,21 @@
+Utilities
+=========
+
+RAFT contains numerous utility functions and primitives that are easily usable.
+This page provides C++ API references for the publicly-exposed utility functions.
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Memory Pool
+-----------
+
+``#include <raft/utils/memory_pool.cuh>``
+
+namespace *raft*
+
+.. doxygengroup:: memory_pool
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index e0c02bb7eb..8ae2d2535b 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -8,7 +8,7 @@ The easiest way to install these benchmarks is through conda. We provide package
 
 ```bash
 
-mamba env create --name raft_ann_benchmarks
+mamba create --name raft_ann_benchmarks
 conda activate raft_ann_benchmarks
 
 # to install GPU package:
@@ -25,7 +25,7 @@ Please see the [build instructions](ann_benchmarks_build.md) to build the benchm
 ## Running the benchmarks
 
 ### Usage
-There are 3 general steps to running the benchmarks and vizualizing the results:
+There are 4 general steps to running the benchmarks and visualizing the results:
 1. Prepare Dataset
 2. Build Index and Search Index
 3. Data Export
@@ -34,12 +34,11 @@ There are 3 general steps to running the benchmarks and vizualizing the results:
 We provide a collection of lightweight Python scripts that are wrappers over
 lower level scripts and executables to run our benchmarks. Either Python scripts or
 [low-level scripts and executables](ann_benchmarks_low_level.md) are valid methods to run benchmarks,
-however plots are only provided through our Python scripts. An environment variable `RAFT_HOME` is
-expected to be defined to run these scripts; this variable holds the directory where RAFT is cloned.
+however plots are only provided through our Python scripts.
 
 ### End-to-end example: Million-scale
 
-The steps below demonstrate how to download, install, and run benchmarks on a subset of 10M vectors from the Yandex Deep-1B dataset By default the datasets will be stored and used from the folder indicated by the RAPIDS_DATASET_ROOT_DIR environment variable if defined, otherwise a datasets subfolder from where the script is being called:
+The steps below demonstrate how to download, install, and run benchmarks on a subset of 10M vectors from the Yandex Deep-1B dataset. By default the datasets will be stored and used from the folder indicated by the `RAPIDS_DATASET_ROOT_DIR` environment variable if defined, otherwise a datasets sub-folder from where the script is being called:
 
 ```bash
 
@@ -56,7 +55,7 @@ python -m raft-ann-bench.data_export --dataset deep-image-96-inner
 python -m raft-ann-bench.plot --dataset deep-image-96-inner
 ```
 
-Configuration files already exist for the following list of the million-scale datasets. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `python/raft-ann-bench/src/raft-ann-bench/conf`.
+Configuration files already exist for the following list of the million-scale datasets. Please refer to [ann-benchmarks datasets](https://github.com/erikbern/ann-benchmarks/#data-sets) for more information, including actual train and test sizes. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `$CONDA_PREFIX/lib/python3.xx/site-packages/raft-ann-bench/run/conf`, or you can specify the `--configuration` option to use a specific file.
 - `deep-image-96-angular`
 - `fashion-mnist-784-euclidean`
 - `glove-50-angular`
@@ -80,20 +79,20 @@ mkdir -p datasets/deep-1B
 # (1) prepare dataset
 # download manually "Ground Truth" file of "Yandex DEEP"
 # suppose the file name is deep_new_groundtruth.public.10K.bin
-python python -m raft-ann-bench.split_groundtruth --groundtruth datasets/deep-1B/deep_new_groundtruth.public.10K.bin
+python -m raft-ann-bench.split_groundtruth --groundtruth datasets/deep-1B/deep_new_groundtruth.public.10K.bin
 # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced
 
 # (2) build and search index
-python python -m raft-ann-bench.run --dataset deep-1B
+python -m raft-ann-bench.run --dataset deep-1B
 
 # (3) export data
-python python -m raft-ann-bench.data_export --dataset deep-1B
+python -m raft-ann-bench.data_export --dataset deep-1B
 
 # (4) plot results
-python python -m raft-ann-bench.plot --dataset deep-1B
+python -m raft-ann-bench.plot --dataset deep-1B
 ```
 
-The usage of `python -m raft-ann-bench.split-groundtruth` is:
+The usage of `python -m raft-ann-bench.split_groundtruth` is:
 ```bash
 usage: split_groundtruth.py [-h] --groundtruth GROUNDTRUTH
 
@@ -125,7 +124,7 @@ will be normalized to inner product. So, for example, the dataset `glove-100-ang
 will be written at location `datasets/glove-100-inner/`.
 
 #### Step 2: Build and Search Index
-The script `bench/ann/run.py` will build and search indices for a given dataset and its
+The script `raft-ann-bench.run` will build and search indices for a given dataset and its
 specified configuration.
 To confirgure which algorithms are available, we use `algos.yaml`.
 To configure building/searching indices for a dataset, look at [index configuration](#json-index-config).
@@ -182,7 +181,7 @@ it is assumed both are `True`.
 is available in `algos.yaml` and not disabled, as well as having an associated executable.
 
 #### Step 3: Data Export
-The script `bench/ann/data_export.py` will convert the intermediate JSON outputs produced by `raft-ann-bench.run` to more
+The script `raft-ann-bench.data_export` will convert the intermediate JSON outputs produced by `raft-ann-bench.run` to more
 easily readable CSV files, which are needed to build charts made by `raft-ann-bench.plot`.
 
 ```bash
@@ -198,7 +197,7 @@ Build statistics CSV file is stored in `<dataset-path/<dataset>/result/build/<al
 and index search statistics CSV file in `<dataset-path/<dataset>/result/search/<algo-k{k}-batch_size{batch_size}.csv>`.
 
 #### Step 4: Plot Results
-The script `bench/ann/plot.py` will plot results for all algorithms found in index search statistics
+The script `raft-ann-bench.plot` will plot results for all algorithms found in index search statistics
 CSV file in `<dataset-path/<dataset>/result/search/<-k{k}-batch_size{batch_size}>.csv`.
 
 The usage of this script is:
@@ -262,7 +261,7 @@ The `index` section will contain a list of index objects, each of which will hav
    "algo": "algo_name",
    "file": "sift-128-euclidean/algo_name/param1_val1-param2_val2",
    "build_param": { "param1": "val1", "param2": "val2" },
-   "search_params": { "search_param1": "search_val1" }
+   "search_params": [{ "search_param1": "search_val1" }]
 }
 ```
 
@@ -345,7 +344,7 @@ How to interpret these JSON objects is totally left to the implementation and sh
     }
     ```
 
-2. Next, add corresponding `if` case to functions `create_algo()` (in `bench/ann/) and `create_search_param()` by calling parsing functions. The string literal in `if` condition statement must be the same as the value of `algo` in configuration file. For example,
+2. Next, add corresponding `if` case to functions `create_algo()` (in `cpp/bench/ann/`) and `create_search_param()` by calling parsing functions. The string literal in `if` condition statement must be the same as the value of `algo` in configuration file. For example,
     ```c++
       // JSON configuration file contains a line like:  "algo" : "hnswlib"
       if (algo == "hnswlib") {
diff --git a/notebooks/ivf_flat_example.ipynb b/notebooks/ivf_flat_example.ipynb
new file mode 100644
index 0000000000..08b9d78169
--- /dev/null
+++ b/notebooks/ivf_flat_example.ipynb
@@ -0,0 +1,674 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4f49c5c4-1170-42a7-9d6a-b90acd00c3c3",
+   "metadata": {},
+   "source": [
+    "# RAFT IVF Flat Example Notebook"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bcfe810-f120-422c-b2bb-72cc43d0c4ca",
+   "metadata": {},
+   "source": [
+    "## Introduction\n",
+    "\n",
+    "This notebook demonstrates how to run approximate nearest neighbor search using RAFT IVF-Flat algorithm.\n",
+    "It builds and searches an index using a dataset from the ann-benchmarks million-scale datasets, saves/loads the index to disk, and explores important parameters for fine-tuning the search performance and accuracy of the index."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "fe73ada7-7b7f-4005-9440-85428194311b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import cupy as cp\n",
+    "import numpy as np\n",
+    "from pylibraft.common import DeviceResources\n",
+    "from pylibraft.neighbors import ivf_flat\n",
+    "import matplotlib.pyplot as plt\n",
+    "import tempfile\n",
+    "from utils import BenchmarkTimer, calc_recall, load_dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da9e8615-ea9f-4735-b70f-15ccab36c0d9",
+   "metadata": {},
+   "source": [
+    "For best performance it is recommended to use an RMM pooling allocator, to minimize the overheads of repeated CUDA allocations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "5350e4d9-0993-406a-80af-29538b5677c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import rmm\n",
+    "from rmm.allocators.cupy import rmm_cupy_allocator\n",
+    "mr = rmm.mr.PoolMemoryResource(\n",
+    "     rmm.mr.CudaMemoryResource(),\n",
+    "     initial_pool_size=2**30\n",
+    ")\n",
+    "rmm.mr.set_current_device_resource(mr)\n",
+    "cp.cuda.set_allocator(rmm_cupy_allocator)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b0d935f2-ba24-44fc-bdfe-a769b7fcd8e6",
+   "metadata": {},
+   "source": [
+    "The following GPU is used for this notebook"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a5daa4b4-96de-4e74-bfd6-505b13595f62",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Thu Sep 21 02:30:53 2023       \n",
+      "+---------------------------------------------------------------------------------------+\n",
+      "| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |\n",
+      "|-----------------------------------------+----------------------+----------------------+\n",
+      "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
+      "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
+      "|                                         |                      |               MIG M. |\n",
+      "|=========================================+======================+======================|\n",
+      "|   0  NVIDIA H100 PCIe               On  | 00000000:41:00.0 Off |                    0 |\n",
+      "| N/A   35C    P0              69W / 350W |   1487MiB / 81559MiB |      0%      Default |\n",
+      "|                                         |                      |             Disabled |\n",
+      "+-----------------------------------------+----------------------+----------------------+\n",
+      "                                                                                         \n",
+      "+---------------------------------------------------------------------------------------+\n",
+      "| Processes:                                                                            |\n",
+      "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
+      "|        ID   ID                                                             Usage      |\n",
+      "|=======================================================================================|\n",
+      "|    0   N/A  N/A      3940      C   /opt/conda/envs/rapids/bin/python          1474MiB |\n",
+      "+---------------------------------------------------------------------------------------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Report the GPU in use\n",
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88a654cc-6389-4526-a3e6-826de5606a09",
+   "metadata": {},
+   "source": [
+    "## Load dataset\n",
+    "\n",
+    "The ANN benchmarks website provides the datasets in HDF5 format.\n",
+    "\n",
+    "The list of prepared datasets can be found at https://github.com/erikbern/ann-benchmarks/#data-sets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "5f529ad6-b0bd-495c-bf7c-43f10fb6aa14",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The index and data will be saved in /tmp/raft_example\n"
+     ]
+    }
+   ],
+   "source": [
+    "WORK_FOLDER = os.path.join(tempfile.gettempdir(), \"raft_example\")\n",
+    "f = load_dataset(\"http://ann-benchmarks.com/sift-128-euclidean.hdf5\", work_folder=WORK_FOLDER)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "3d68a7db-bcf4-449c-96c3-1e8ab146c84d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded dataset of size (1000000, 128),  0.5 GiB; metric: 'euclidean'.\n",
+      "Number of test queries: 10000\n"
+     ]
+    }
+   ],
+   "source": [
+    "metric = f.attrs['distance']\n",
+    "\n",
+    "dataset = cp.array(f['train'])\n",
+    "queries = cp.array(f['test'])\n",
+    "gt_neighbors = cp.array(f['neighbors'])\n",
+    "gt_distances = cp.array(f['distances'])\n",
+    "\n",
+    "itemsize = dataset.dtype.itemsize \n",
+    "\n",
+    "print(f\"Loaded dataset of size {dataset.shape}, {dataset.size*itemsize/(1<<30):4.1f} GiB; metric: '{metric}'.\")\n",
+    "print(f\"Number of test queries: {queries.shape[0]}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9f463c50-d1d3-49be-bcfe-952602efa603",
+   "metadata": {},
+   "source": [
+    "## Build index\n",
+    "We set [IndexParams](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#pylibraft.neighbors.ivf_flat.IndexParams) and build the index. The index parameters will be discussed in more detail in later sections of this notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "737f8841-93f9-4c8e-b2e1-787d4474ef94",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 120 ms, sys: 5.33 ms, total: 125 ms\n",
+      "Wall time: 124 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "build_params = ivf_flat.IndexParams(\n",
+    "        n_lists=1024,\n",
+    "        metric=\"euclidean\",\n",
+    "        kmeans_trainset_fraction=0.1,\n",
+    "        kmeans_n_iters=20,\n",
+    "        add_data_on_build=True\n",
+    "    )\n",
+    "\n",
+    "index = ivf_flat.build(build_params, dataset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a16a0cf6-3b05-4afd-9bb8-54431e0d7439",
+   "metadata": {},
+   "source": [
+    "The index is built. We can print some basic information of the index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "1aec7024-6e5d-4d2c-82e6-7b5734aec958",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(type=IVF-FLAT, metric=euclidean, size=1000000, dim=128, n_lists=1024, adaptive_centers=False)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(index)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df7d4958-56a3-48ea-bd64-3486fdb57fb7",
+   "metadata": {},
+   "source": [
+    "## Search neighbors"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "89ba2eaa-4c85-4e1c-b07c-920394e55dce",
+   "metadata": {},
+   "source": [
+    "It is recommended to reuse [device resources](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/common/#pylibraft.common.DeviceResources) across multiple invocations of search, since constructing these can be time consuming. We will reuse the resources by passing the same handle to each RAFT API call."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "46e0421b-9335-47a2-8451-a91f56c2f086",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "handle = DeviceResources()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a6365229-18fd-468f-af30-e24b950cbd6e",
+   "metadata": {},
+   "source": [
+    "After setting [SearchParams](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#pylibraft.neighbors.ivf_flat.SearchParams) we search for `k=10` neighbors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "595454e1-7240-4b43-9a73-963d5670b00c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 171 ms, sys: 52.6 ms, total: 224 ms\n",
+      "Wall time: 236 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "n_queries=10000\n",
+    "# n_probes is the number of clusters we select in the first (coarse) search step. This is the only hyper parameter for search.\n",
+    "search_params = ivf_flat.SearchParams(n_probes=30)\n",
+    "\n",
+    "# Search 10 nearest neighbors.\n",
+    "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, handle=handle)\n",
+    "    \n",
+    "# RAFT calls are asynchronous (when handle arg is provided), we need to sync before accessing the results.\n",
+    "handle.sync()\n",
+    "distances, neighbors = cp.asnumpy(distances), cp.asnumpy(indices)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "43d20ca7-7b9e-4046-bb52-640a2744db75",
+   "metadata": {},
+   "source": [
+    "The returned arrays have shape [n_queries x 10] and store the distance values and the indices of the searched vectors. We check how accurate the search is. The accuracy of the search is quantified as `recall`, which is a value between 0 and 1 and tells us what fraction of the returned neighbors are actual k nearest neighbors. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "8cd9cd20-ca00-4a35-a0a0-86636521b31a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.97406"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "calc_recall(neighbors, gt_neighbors)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cde5079c-9777-45a1-9545-cffbcc59988f",
+   "metadata": {},
+   "source": [
+    "## Save and load the index\n",
+    "You can serialize the index to file using [save](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#pylibraft.neighbors.ivf_flat.save), and [load](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#pylibraft.neighbors.ivf_flat.load) it later."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "bf94e45c-e7fb-4aa3-a611-ddaee7ac41ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index_file = os.path.join(WORK_FOLDER, \"my_ivf_flat_index.bin\")\n",
+    "ivf_flat.save(index_file, index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "1622d9be-be41-4d25-be99-d348c5e54957",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index = ivf_flat.load(index_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15d503e5-05e8-47ce-8501-e13fc512099c",
+   "metadata": {},
+   "source": [
+    "## Tune search parameters\n",
+    "Search has a single hyper parameter: `n_probes`, which describes how many neighboring clusters are searched (probed) for each query. Within a probed cluster, the distance is computed between all the vectors in the cluster and the query point, and the top-k neighbors are selected. Finally, the top-k neighbors are selected from all the neighbor candidates from the probed clusters.\n",
+    "\n",
+    "Let's see how search accuracy and latency change when we change the `n_probes` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "ace0c31f-af75-4352-a438-123a9a03612c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Benchmarking search with n_probes = 10\n",
+      "recall 0.86625\n",
+      "Average search time:   0.026 +/- 0.000259 s\n",
+      "Queries per second (QPS):   384968\n",
+      "\n",
+      "Benchmarking search with n_probes = 20\n",
+      "recall 0.94705\n",
+      "Average search time:   0.050 +/- 5.43e-05 s\n",
+      "Queries per second (QPS):   198880\n",
+      "\n",
+      "Benchmarking search with n_probes = 30\n",
+      "recall 0.97406\n",
+      "Average search time:   0.075 +/- 8.59e-05 s\n",
+      "Queries per second (QPS):   133954\n",
+      "\n",
+      "Benchmarking search with n_probes = 50\n",
+      "recall 0.99169\n",
+      "Average search time:   0.123 +/- 4.78e-05 s\n",
+      "Queries per second (QPS):    80997\n",
+      "\n",
+      "Benchmarking search with n_probes = 100\n",
+      "recall 0.99844\n",
+      "Average search time:   0.244 +/- 0.000249 s\n",
+      "Queries per second (QPS):    40934\n",
+      "\n",
+      "Benchmarking search with n_probes = 200\n",
+      "recall 0.99932\n",
+      "Average search time:   0.468 +/- 0.000367 s\n",
+      "Queries per second (QPS):    21382\n",
+      "\n",
+      "Benchmarking search with n_probes = 500\n",
+      "recall 0.99933\n",
+      "Average search time:   1.039 +/- 0.000209 s\n",
+      "Queries per second (QPS):     9625\n",
+      "\n",
+      "Benchmarking search with n_probes = 1024\n",
+      "recall 0.99935\n",
+      "Average search time:   0.701 +/- 0.00579 s\n",
+      "Queries per second (QPS):    14273\n"
+     ]
+    }
+   ],
+   "source": [
+    "n_probes = np.asarray([10, 20, 30, 50, 100, 200, 500, 1024]);\n",
+    "qps = np.zeros(n_probes.shape);\n",
+    "recall = np.zeros(n_probes.shape);\n",
+    "\n",
+    "for i in range(len(n_probes)):\n",
+    "    print(\"\\nBenchmarking search with n_probes =\", n_probes[i])\n",
+    "    timer = BenchmarkTimer(reps=1, warmup=1)\n",
+    "    for rep in timer.benchmark_runs():\n",
+    "        distances, neighbors = ivf_flat.search(\n",
+    "            ivf_flat.SearchParams(n_probes=n_probes[i]),\n",
+    "            index,\n",
+    "            cp.asarray(queries),\n",
+    "            k=10,\n",
+    "            handle=handle,\n",
+    "        )\n",
+    "        handle.sync()\n",
+    "    \n",
+    "    recall[i] = calc_recall(cp.asnumpy(neighbors), gt_neighbors)\n",
+    "    print(\"recall\", recall[i])\n",
+    "\n",
+    "    timings = np.asarray(timer.timings)\n",
+    "    avg_time = timings.mean()\n",
+    "    std_time = timings.std()\n",
+    "    qps[i] = queries.shape[0] / avg_time\n",
+    "    print(\"Average search time: {0:7.3f} +/- {1:7.3} s\".format(avg_time, std_time))\n",
+    "    print(\"Queries per second (QPS): {0:8.0f}\".format(qps[i]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "20b2498c-7231-4211-990e-600d5c26a9a1",
+   "metadata": {},
+   "source": [
+    "The plots below illustrate how the accuracy (recall) and the throughput (queries per second) depend on the `n_probes` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1ac370f-91c8-4054-95c7-a749df5f16d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plt.figure(figsize=(12,3))\n",
+    "ax = fig.add_subplot(131)\n",
+    "ax.plot(n_probes, recall,'o-')\n",
+    "#ax.set_xticks(bench_k, bench_k)\n",
+    "ax.set_xlabel('n_probes')\n",
+    "ax.grid()\n",
+    "ax.set_ylabel('recall (@k=10)')\n",
+    "\n",
+    "ax = fig.add_subplot(132)\n",
+    "ax.plot(n_probes, qps,'o-')\n",
+    "#ax.set_xticks(bench_k, bench_k)\n",
+    "ax.set_xlabel('n_probes')\n",
+    "ax.grid()\n",
+    "ax.set_ylabel('queries per second');\n",
+    "\n",
+    "ax = fig.add_subplot(133)\n",
+    "ax.plot(recall, qps,'o-')\n",
+    "#ax.set_xticks(bench_k, bench_k)\n",
+    "ax.set_xlabel('recall')\n",
+    "ax.grid()\n",
+    "ax.set_ylabel('queries per second');\n",
+    "#ax.set_yscale('log')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "81e7ad6a-bddc-45de-9cce-0fb913f91efe",
+   "metadata": {},
+   "source": [
+    "## Adjust build parameters\n",
+    "### n_lists\n",
+    "The number of clusters (or lists) is set by the `n_lists` parameter. Let's change it to 100 clusters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "addbfff3-7773-4290-9608-5489edf4886d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "build_params = ivf_flat.IndexParams(\n",
+    "        n_lists=100,\n",
+    "        metric=\"euclidean\",\n",
+    "        kmeans_trainset_fraction=1,\n",
+    "        kmeans_n_iters=20,\n",
+    "        add_data_on_build=True\n",
+    "    )\n",
+    "\n",
+    "index = ivf_flat.build(build_params, dataset, handle=handle)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "48db27f9-54c8-4dac-839b-af94ada8885f",
+   "metadata": {},
+   "source": [
+    "The ratio `n_probes / n_lists` determines how large a fraction of the dataset is searched for each query. The right combination depends on the use case. Here we will search 10 of the 100 clusters for each query."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a0149ad-de38-4195-97a5-ce5d5d877036",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "n_queries=10000\n",
+    "\n",
+    "search_params = ivf_flat.SearchParams(n_probes=10)\n",
+    "\n",
+    "# Search 10 nearest neighbors.\n",
+    "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, handle=handle)\n",
+    "    \n",
+    "handle.sync()\n",
+    "distances, neighbors = cp.asnumpy(distances), cp.asnumpy(indices)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eedc3ec4-06af-42c5-8cdf-490a5c2bc49a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "calc_recall(neighbors, gt_neighbors)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0c44800f-1e9e-4f7b-87fe-0f25e6590faa",
+   "metadata": {},
+   "source": [
+    "### trainset_fraction\n",
+    "During clustering we can sub-sample the dataset. The parameter `kmeans_trainset_fraction` determines what fraction to use. Often we get good results by using only 1/10th of the dataset for clustering."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a54d190-64d4-4cd4-a497-365cbffda871",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "build_params = ivf_flat.IndexParams( \n",
+    "        n_lists=100, \n",
+    "        metric=\"sqeuclidean\", \n",
+    "        kmeans_trainset_fraction=0.1, \n",
+    "        kmeans_n_iters=20 \n",
+    "    ) \n",
+    "index = ivf_flat.build(build_params, dataset, handle=handle)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9d86a213-d6ae-4fca-9082-cb5a4d1dab36",
+   "metadata": {},
+   "source": [
+    "We see only a minimal change in the recall"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4cc992e8-a5e5-4508-b790-0e934160b660",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "search_params = ivf_flat.SearchParams(n_probes=10)\n",
+    "\n",
+    "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, handle=handle)\n",
+    "    \n",
+    "handle.sync()\n",
+    "distances, neighbors = cp.asnumpy(distances), cp.asnumpy(indices)\n",
+    "calc_recall(neighbors, gt_neighbors)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "25289ebc-7d89-4fa6-bc62-e25b6e77750c",
+   "metadata": {},
+   "source": [
+    "### Add vectors on build\n",
+    "Currently you cannot configure how RAFT sub-samples the input. If you want fine control over how the training set is selected, create the index in two steps:\n",
+    "1. Define cluster centers on a training set, but do not add any vector to the index\n",
+    "2. Add vectors to the index (extend)\n",
+    "\n",
+    "This workflow should be familiar to FAISS users. Note that RAFT does not require adding the data in batches; internal batching is used when necessary.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7ebcf970-94ed-4825-9885-277bd984b90c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# subsample the dataset\n",
+    "n_train = 10000\n",
+    "train_set = dataset[cp.random.choice(dataset.shape[0], n_train, replace=False),:]\n",
+    "\n",
+    "# build using training set\n",
+    "build_params = ivf_flat.IndexParams(\n",
+    "        n_lists=1024,\n",
+    "        metric=\"sqeuclidean\",\n",
+    "        kmeans_trainset_fraction=1,\n",
+    "        kmeans_n_iters=20,\n",
+    "        add_data_on_build=False\n",
+    "    )\n",
+    "index = ivf_flat.build(build_params, train_set)\n",
+    "\n",
+    "print(\"Index before adding vectors\", index)\n",
+    "\n",
+    "ivf_flat.extend(index, dataset, cp.arange(dataset.shape[0], dtype=cp.int64))\n",
+    "\n",
+    "print(\"Index after adding vectors\", index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "029d48a9-baf7-4263-af43-9e500ef3cce4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/tutorial_ivf_pq.ipynb b/notebooks/tutorial_ivf_pq.ipynb
index 6aa8cd6495..397e39bfba 100644
--- a/notebooks/tutorial_ivf_pq.ipynb
+++ b/notebooks/tutorial_ivf_pq.ipynb
@@ -79,6 +79,7 @@
     "from pylibraft.common import DeviceResources\n",
     "from pylibraft.neighbors import ivf_pq, refine\n",
     "from adjustText import adjust_text\n",
+    "from utils import calc_recall, load_dataset\n",
     "\n",
     "%matplotlib inline"
    ]
@@ -194,15 +195,18 @@
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The index and data will be saved in /tmp/raft_example\n"
+     ]
+    }
+   ],
    "source": [
     "DATASET_URL = \"http://ann-benchmarks.com/sift-128-euclidean.hdf5\"\n",
-    "DATASET_FILENAME = DATASET_URL.split('/')[-1]\n",
-    "\n",
-    "## download the dataset\n",
-    "dataset_path = os.path.join(WORK_FOLDER, DATASET_FILENAME)\n",
-    "if not os.path.exists(dataset_path):\n",
-    "    urllib.request.urlretrieve(DATASET_URL, dataset_path)"
+    "f = load_dataset(DATASET_URL)"
    ]
   },
   {
@@ -227,8 +231,6 @@
     }
    ],
    "source": [
-    "f = h5py.File(dataset_path, \"r\")\n",
-    "\n",
     "metric = f.attrs['distance']\n",
     "\n",
     "dataset = cp.array(f['train'])\n",
@@ -456,28 +458,6 @@
     }
    ],
    "source": [
-    "## Check the quality of the prediction (recall)\n",
-    "def calc_recall(found_indices, ground_truth):\n",
-    "    found_indices = cp.asarray(found_indices)\n",
-    "    bs, k = found_indices.shape\n",
-    "    if bs != ground_truth.shape[0]:\n",
-    "        raise RuntimeError(\n",
-    "            \"Batch sizes do not match {} vs {}\".format(\n",
-    "                bs, ground_truth.shape[0])\n",
-    "        )\n",
-    "    if k > ground_truth.shape[1]:\n",
-    "        raise RuntimeError(\n",
-    "            \"Not enough indices in the ground truth ({} > {})\".format(\n",
-    "                k, ground_truth.shape[1])\n",
-    "        )\n",
-    "    n = 0\n",
-    "    # Go over the batch\n",
-    "    for i in range(bs):\n",
-    "        # Note, ivf-pq does not guarantee the ordered input, hence the use of intersect1d\n",
-    "        n += cp.intersect1d(found_indices[i, :k], ground_truth[i, :k]).size\n",
-    "    recall = n / found_indices.size\n",
-    "    return recall\n",
-    "\n",
     "recall_first_try = calc_recall(neighbors, gt_neighbors)\n",
     "print(f\"Got recall = {recall_first_try} with the default parameters (k = {k}).\")"
    ]
diff --git a/notebooks/utils.py b/notebooks/utils.py
new file mode 100644
index 0000000000..1c2e44a6ae
--- /dev/null
+++ b/notebooks/utils.py
@@ -0,0 +1,103 @@
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import cupy as cp
+import h5py
+import os
+import tempfile
+import time
+import urllib
+
+## Check the quality of the prediction (recall)
+def calc_recall(found_indices, ground_truth):
+    """Recall: per-row count of indices shared with `ground_truth` (found
+    with intersect1d), summed and divided by `found_indices.size`.
+
+    Raises RuntimeError if the row counts differ or the ground truth has
+    fewer columns than `found_indices`.
+    """
+    found = cp.asarray(found_indices)
+    n_rows, n_cols = found.shape
+    if n_rows != ground_truth.shape[0]:
+        msg = "Batch sizes do not match {} vs {}"
+        raise RuntimeError(msg.format(n_rows, ground_truth.shape[0]))
+    if n_cols > ground_truth.shape[1]:
+        msg = "Not enough indices in the ground truth ({} > {})"
+        raise RuntimeError(msg.format(n_cols, ground_truth.shape[1]))
+    # ivf-pq does not guarantee ordered results, so each row is compared
+    # with intersect1d instead of element by element.
+    matches = sum(
+        cp.intersect1d(found[row, :n_cols], ground_truth[row, :n_cols]).size
+        for row in range(n_rows)
+    )
+    return matches / found.size
+
+
+class BenchmarkTimer:
+    """Runs a code block `warmup + reps` times and records the duration of
+    the last `reps` runs in the instance variable `timings`. Use like:
+    .. code-block:: python
+        timer = BenchmarkTimer(reps=5)
+        for _ in timer.benchmark_runs():
+            ... do something ...
+        print(np.min(timer.timings))
+
+        This class is borrowed from the rapids/cuml benchmark suite
+    """
+
+    def __init__(self, reps=1, warmup=0):
+        self.warmup = warmup  # leading iterations excluded from `timings`
+        self.reps = reps  # iterations whose wall-clock time is recorded
+        self.timings = []  # seconds per timed run, in execution order
+
+    def benchmark_runs(self):
+        """Yield the run index; time the caller's loop body per iteration."""
+        for r in range(self.reps + self.warmup):
+            t0 = time.time()
+            yield r
+            t1 = time.time()
+            if r >= self.warmup:
+                self.timings.append(t1 - t0)
+
+
+def load_dataset(dataset_url, work_folder=None):
+    """Fetch an ann-benchmarks format hdf5 file (cached) and open it.
+
+    Parameters
+    ----------
+      dataset_url address of hdf5 file
+      work_folder name of the local folder to store the dataset; defaults
+                  to "raft_example" under the system temporary directory
+
+    Returns the dataset as an h5py.File opened in read-only mode.
+    """
+    # `import urllib` alone does not load the `request` submodule.
+    import urllib.request
+
+    dataset_filename = dataset_url.split("/")[-1]
+
+    if work_folder is None:
+        work_folder = os.path.join(tempfile.gettempdir(), "raft_example")
+
+    os.makedirs(work_folder, exist_ok=True)
+    print("The index and data will be saved in", work_folder)
+
+    # Download the dataset unless a file with that name is already cached.
+    dataset_path = os.path.join(work_folder, dataset_filename)
+    if not os.path.exists(dataset_path):
+        urllib.request.urlretrieve(dataset_url, dataset_path)
+
+    return h5py.File(dataset_path, "r")
diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
index e0c59a5ed3..c11d933b27 100644
--- a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
+++ b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
@@ -104,11 +104,13 @@ cdef class IndexParams:
 
     graph_degree : int, default = 64
 
-    add_data_on_build : bool, default = True
-        After training the coarse and fine quantizers, we will populate
-        the index with the dataset if add_data_on_build == True, otherwise
-        the index is left empty, and the extend method can be used
-        to add new vectors to the index.
+    build_algo: string denoting the graph building algorithm to use,
+                default = "ivf_pq"
+        Valid values for build_algo: ["ivf_pq", "nn_descent"], where
+        - ivf_pq will use the IVF-PQ algorithm for building the knn graph
+        - nn_descent (experimental) will use the NN-Descent algorithm for
+          building the knn graph. It is expected to be generally
+          faster than ivf_pq.
     """
     cdef c_cagra.index_params params
 
@@ -116,12 +118,15 @@ cdef class IndexParams:
                  metric="sqeuclidean",
                  intermediate_graph_degree=128,
                  graph_degree=64,
-                 add_data_on_build=True):
+                 build_algo="ivf_pq"):
         self.params.metric = _get_metric(metric)
         self.params.metric_arg = 0
         self.params.intermediate_graph_degree = intermediate_graph_degree
         self.params.graph_degree = graph_degree
-        self.params.add_data_on_build = add_data_on_build
+        if build_algo == "ivf_pq":
+            self.params.build_algo = c_cagra.graph_build_algo.IVF_PQ
+        elif build_algo == "nn_descent":
+            self.params.build_algo = c_cagra.graph_build_algo.NN_DESCENT
 
     @property
     def metric(self):
@@ -135,10 +140,6 @@ cdef class IndexParams:
     def graph_degree(self):
         return self.params.graph_degree
 
-    @property
-    def add_data_on_build(self):
-        return self.params.add_data_on_build
-
 
 cdef class Index:
     cdef readonly bool trained
diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd b/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd
index 0c683bcd9b..7e22f274e9 100644
--- a/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd
+++ b/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd
@@ -51,9 +51,14 @@ from pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq cimport (
 cdef extern from "raft/neighbors/cagra_types.hpp" \
         namespace "raft::neighbors::cagra" nogil:
 
+    ctypedef enum graph_build_algo:
+        IVF_PQ "raft::neighbors::cagra::graph_build_algo::IVF_PQ",
+        NN_DESCENT "raft::neighbors::cagra::graph_build_algo::NN_DESCENT"
+
     cpdef cppclass index_params(ann_index_params):
         size_t intermediate_graph_degree
         size_t graph_degree
+        graph_build_algo build_algo
 
     ctypedef enum search_algo:
         SINGLE_CTA "raft::neighbors::cagra::search_algo::SINGLE_CTA",
diff --git a/python/pylibraft/pylibraft/test/test_cagra.py b/python/pylibraft/pylibraft/test/test_cagra.py
index 74e9f53b91..24126c0c5a 100644
--- a/python/pylibraft/pylibraft/test/test_cagra.py
+++ b/python/pylibraft/pylibraft/test/test_cagra.py
@@ -52,6 +52,7 @@ def run_cagra_build_search_test(
     metric="euclidean",
     intermediate_graph_degree=128,
     graph_degree=64,
+    build_algo="ivf_pq",
     array_type="device",
     compare=True,
     inplace=True,
@@ -67,6 +68,7 @@ def run_cagra_build_search_test(
         metric=metric,
         intermediate_graph_degree=intermediate_graph_degree,
         graph_degree=graph_degree,
+        build_algo=build_algo,
     )
 
     if array_type == "device":
@@ -139,13 +141,17 @@ def run_cagra_build_search_test(
 @pytest.mark.parametrize("inplace", [True, False])
 @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
 @pytest.mark.parametrize("array_type", ["device", "host"])
-def test_cagra_dataset_dtype_host_device(dtype, array_type, inplace):
+@pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"])
+def test_cagra_dataset_dtype_host_device(
+    dtype, array_type, inplace, build_algo
+):
     # Note that inner_product tests use normalized input which we cannot
     # represent in int8, therefore we test only sqeuclidean metric here.
     run_cagra_build_search_test(
         dtype=dtype,
         inplace=inplace,
         array_type=array_type,
+        build_algo=build_algo,
     )
 
 
@@ -158,6 +164,7 @@ def test_cagra_dataset_dtype_host_device(dtype, array_type, inplace):
             "add_data_on_build": True,
             "k": 1,
             "metric": "euclidean",
+            "build_algo": "ivf_pq",
         },
         {
             "intermediate_graph_degree": 32,
@@ -165,6 +172,7 @@ def test_cagra_dataset_dtype_host_device(dtype, array_type, inplace):
             "add_data_on_build": False,
             "k": 5,
             "metric": "sqeuclidean",
+            "build_algo": "ivf_pq",
         },
         {
             "intermediate_graph_degree": 128,
@@ -172,6 +180,7 @@ def test_cagra_dataset_dtype_host_device(dtype, array_type, inplace):
             "add_data_on_build": True,
             "k": 10,
             "metric": "inner_product",
+            "build_algo": "nn_descent",
         },
     ],
 )
@@ -184,6 +193,7 @@ def test_cagra_index_params(params):
         graph_degree=params["graph_degree"],
         intermediate_graph_degree=params["intermediate_graph_degree"],
         compare=False,
+        build_algo=params["build_algo"],
     )
 
 
@@ -241,7 +251,7 @@ def test_cagra_index_params(params):
             "search_width": 4,
             "min_iterations": 0,
             "thread_block_size": 0,
-            "hashmap_mode": "small",
+            "hashmap_mode": "auto",
             "hashmap_min_bitlen": 0,
             "hashmap_max_fill_rate": 0.5,
             "num_random_samplings": 1,
diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
index 198d0a2b14..233607c281 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
@@ -254,18 +254,18 @@ def create_plot_build(
     xn = "k-nn"
     yn = "qps"
 
-    # recall_85 = [-1] * len(linestyles)
     qps_85 = [-1] * len(linestyles)
     bt_85 = [0] * len(linestyles)
     i_85 = [-1] * len(linestyles)
-    # recall_90 = [-1] * len(linestyles)
+
     qps_90 = [-1] * len(linestyles)
     bt_90 = [0] * len(linestyles)
     i_90 = [-1] * len(linestyles)
-    # recall_95 = [-1] * len(linestyles)
+
     qps_95 = [-1] * len(linestyles)
     bt_95 = [0] * len(linestyles)
     i_95 = [-1] * len(linestyles)
+
     data = OrderedDict()
     colors = OrderedDict()
 
@@ -303,7 +303,7 @@ def mean_y(algo):
     plt.figure(figsize=(12, 9))
     ax = df.plot.bar(rot=0, color=colors)
     fig = ax.get_figure()
-    print(f"writing search output to {fn_out}")
+    print(f"writing build output to {fn_out}")
     plt.title("Build Time for Highest QPS")
     plt.suptitle(f"{dataset} k={k} batch_size={batch_size}")
     plt.ylabel("Build Time (s)")
@@ -313,35 +313,22 @@ def mean_y(algo):
 def load_lines(results_path, result_files, method, index_key):
     results = dict()
 
-    linebreaker = "name,iterations"
-
     for result_filename in result_files:
         if result_filename.endswith(".csv"):
             with open(os.path.join(results_path, result_filename), "r") as f:
                 lines = f.readlines()
                 lines = lines[:-1] if lines[-1] == "\n" else lines
-                idx = 0
-                for pos, line in enumerate(lines):
-                    if linebreaker in line:
-                        idx = pos
-                        break
 
                 if method == "build":
-                    if "hnswlib" in result_filename:
-                        key_idx = [2]
-                    else:
-                        key_idx = [10]
+                    key_idx = [2]
                 elif method == "search":
-                    if "hnswlib" in result_filename:
-                        key_idx = [10, 6]
-                    else:
-                        key_idx = [12, 10]
+                    key_idx = [2, 3]
 
-                for line in lines[idx + 1 :]:
+                for line in lines[1:]:
                     split_lines = line.split(",")
 
-                    algo_name = split_lines[0].split(".")[0].strip('"')
-                    index_name = split_lines[0].split("/")[0].strip('"')
+                    algo_name = split_lines[0]
+                    index_name = split_lines[1]
 
                     if index_key == "algo":
                         dict_key = algo_name
@@ -394,9 +381,7 @@ def main():
     )
     parser.add_argument(
         "--dataset-path",
-        help="path to dataset folder, by default will look in "
-        "RAPIDS_DATASET_ROOT_DIR if defined, otherwise a datasets "
-        "subdirectory from the calling directory",
+        help="path to dataset folder",
         default=default_dataset_path,
     )
     parser.add_argument(
@@ -460,10 +445,12 @@ def main():
         search = args.search
 
     search_output_filepath = os.path.join(
-        args.output_filepath, f"search-{args.dataset}-{k}-{batch_size}.png"
+        args.output_filepath,
+        f"search-{args.dataset}-k{k}-batch_size{batch_size}.png",
     )
     build_output_filepath = os.path.join(
-        args.output_filepath, f"build-{args.dataset}-{k}-{batch_size}.png"
+        args.output_filepath,
+        f"build-{args.dataset}-k{k}-batch_size{batch_size}.png",
     )
 
     search_results = load_all_results(
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
index d5a65ddfb7..347c68c477 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -145,7 +145,7 @@ def main():
 
     # Read list of allowed algorithms
     try:
-        import pylibraft  # noqa: F401
+        import rmm  # noqa: F401
 
         gpu_present = True
     except ImportError:
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-100M.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-100M.json
index 6bef94c070..bc77b522a8 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-100M.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-100M.json
@@ -248,184 +248,111 @@
         {"max_batch":10000, "max_k":10, "nprobe":1000}
       ]
     },
-
-    {
-      "name": "raft_ivf_pq.dimpq128-cluster1024",
-      "algo": "raft_ivf_pq",
-      "build_param": {"nlist": 1024, "pq_dim": 128, "ratio": 1, "niter": 25
-      },
-      "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024",
-      "search_params": [
-        {"nprobe": 10, "internalDistanceDtype": "half", "smemLutDtype": "half"},
-        {"nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "half"},
-        {"nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "half"},
-        {"nprobe": 200, "internalDistanceDtype": "half", "smemLutDtype": "half"},
-        {"nprobe": 500, "internalDistanceDtype": "half", "smemLutDtype": "half"},
-        {"nprobe": 1024, "internalDistanceDtype": "half", "smemLutDtype": "half"}
-      ],
-      "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024"
-    },
-    {
-      "name": "raft_ivf_pq.dimpq128-cluster1024-float-float",
-      "algo": "raft_ivf_pq",
-      "build_param": {
-        "nlist": 1024,
-        "pq_dim": 128,
-        "ratio": 1,
-        "niter": 25
-      },
-      "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-float",
-      "search_params": [
-        {"nprobe": 1, "internalDistanceDtype": "float", "smemLutDtype": "float"},
-        {"nprobe": 5, "internalDistanceDtype": "float", "smemLutDtype": "float"},
-        {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "float"},
-        {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "float"},
-        {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "float"},
-        {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "float"},
-        {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "float"},
-        {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "float"}
-      ],
-      "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-float"
-    },
-    {
-      "name": "raft_ivf_pq.dimpq128-cluster1024-float-half",
-      "algo": "raft_ivf_pq",
-      "build_param": {
-        "nlist": 1024,
-        "pq_dim": 128,
-        "ratio": 1,
-        "niter": 25
-      },
-      "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-half",
-      "search_params": [
-        {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "half"},
-        {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "half"},
-        {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "half"},
-        {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "half"},
-        {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "half"},
-        {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "half"}
-      ],
-      "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-half"
-    },
-    {
-      "name": "raft_ivf_pq.dimpq128-cluster1024-float-fp8",
-      "algo": "raft_ivf_pq",
-      "build_param": {
-        "nlist": 1024,
-        "pq_dim": 128,
-        "ratio": 1,
-        "niter": 25
-      },
-      "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-fp8",
-      "search_params": [
-        {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}
-      ],
-      "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-fp8"
-    },
-    {
-      "name": "raft_ivf_pq.dimpq64-cluster1024-float-fp8",
-      "algo": "raft_ivf_pq",
-      "build_param": {
-        "nlist": 1024,
-        "pq_dim": 64,
-        "ratio": 1,
-        "niter": 25
-      },
-      "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq64-cluster1024-float-fp8",
-      "search_params": [
-        {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}
-      ],
-      "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq64-cluster1024-float-fp8"
-    },
-    {
-      "name": "raft_ivf_pq.dimpq64-cluster1024-float-half",
-      "algo": "raft_ivf_pq",
-      "build_param": {
-        "nlist": 1024,
-        "pq_dim": 64,
-        "ratio": 1,
-        "niter": 25
-      },
-      "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq64-cluster1024-float-half",
-      "search_params": [
-        {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "half"},
-        {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "half"},
-        {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "half"},
-        {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "half"},
-        {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "half"},
-        {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "half"}
-      ],
-      "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq64-cluster1024-float-half"
-    },
-    {
-      "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8",
-      "algo": "raft_ivf_pq",
-      "build_param": {
-        "nlist": 1024,
-        "pq_dim": 32,
-        "ratio": 1,
-        "niter": 25
-      },
-      "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq32-cluster1024-float-fp8",
-      "search_params": [
-        {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}
-      ],
-      "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq32-cluster1024-float-fp8"
-    },
-    {
-      "name": "raft_ivf_pq.dimpq16-cluster1024-float-fp8",
+{
+      "name": "raft_ivf_pq.d96b5n50K",
       "algo": "raft_ivf_pq",
-      "build_param": {
-        "nlist": 1024,
-        "pq_dim": 16,
-        "ratio": 1,
-        "niter": 25
-      },
-      "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq16-cluster1024-float-fp8",
+      "build_param": {"nlist": 50000, "pq_dim": 96, "pq_bits": 5, "ratio": 10, "niter": 25},
+      "file": "deep-100M/raft_ivf_pq/d96b5n50K",
       "search_params": [
-        {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "fp8"},
-        {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}
-      ],
-      "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq16-cluster1024-float-fp8"
+        { "nprobe": 20, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 },
+        { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 },
+        { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 },
+        { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 },
+        { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 },
+        { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 },
+        { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 },
+        { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 },
+        { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 },
+        { "nprobe": 20, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 20, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 20, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 30, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 40, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 200, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 1000, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 2000, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 5000, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 },
+        { "nprobe": 20, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 30, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 40, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 200, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 1000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 2000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 },
+        { "nprobe": 5000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 }
+      ]
     },
     {
-      "name": "raft_ivf_pq.dimpq128-cluster1024-half-float",
+      "name": "raft_ivf_pq.d64b5n50K",
       "algo": "raft_ivf_pq",
-      "build_param": {
-        "nlist": 1024,
-        "pq_dim": 128,
-        "ratio": 1,
-        "niter": 25
-      },
-      "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-half-float",
+      "build_param": {"nlist": 50000, "pq_dim": 64, "pq_bits": 5, "ratio": 10, "niter": 25},
+      "file": "deep-100M/raft_ivf_pq/d64b5n50K",
       "search_params": [
-        {"nprobe": 10, "internalDistanceDtype": "half", "smemLutDtype": "float"},
-        {"nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "float"},
-        {"nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "float"},
-        {"nprobe": 200, "internalDistanceDtype": "half", "smemLutDtype": "float"},
-        {"nprobe": 500, "internalDistanceDtype": "half", "smemLutDtype": "float"},
-        {"nprobe": 1024, "internalDistanceDtype": "half", "smemLutDtype": "float"}
-      ],
-      "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-half-float"
+        { "nprobe": 20, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 },
+        { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 },
+        { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 },
+        { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 },
+        { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 },
+        { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 },
+        { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 },
+        { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 },
+        { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 },
+        { "nprobe": 20, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 20, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 20, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 30, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 40, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 200, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 1000, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 2000, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 5000, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 },
+        { "nprobe": 20, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 30, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 40, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 200, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 1000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 2000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 },
+        { "nprobe": 5000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 }
+      ]
     },
     {
       "name": "raft_ivf_pq.dimpq512-cluster1024-float-float",
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-image-96-inner.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-image-96-inner.json
index f1c033e415..ab82405439 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-image-96-inner.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-image-96-inner.json
@@ -3,6 +3,7 @@
     "name": "deep-image-96-inner",
     "base_file": "deep-image-96-inner/base.fbin",
     "query_file": "deep-image-96-inner/query.fbin",
+    "groundtruth_neighbors_file": "deep-image-96-inner/groundtruth.neighbors.ibin",
     "distance": "euclidean"
   },
   "search_basic_param": {
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/fashion-mnist-784-euclidean.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/fashion-mnist-784-euclidean.json
index 65f28fc81a..0efe1fc498 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/fashion-mnist-784-euclidean.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/fashion-mnist-784-euclidean.json
@@ -3,6 +3,7 @@
     "name": "fashion-mnist-784-euclidean",
     "base_file": "fashion-mnist-784-euclidean/base.fbin",
     "query_file": "fashion-mnist-784-euclidean/query.fbin",
+    "groundtruth_neighbors_file": "fashion-mnist-784-euclidean/groundtruth.neighbors.ibin",
     "distance": "euclidean"
   },
   "search_basic_param": {
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-angular.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-angular.json
index 526aef2db0..3595084d19 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-angular.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-angular.json
@@ -735,37 +735,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         }
@@ -785,55 +785,55 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 1,
+          "nprobe": 1,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1,
+          "nprobe": 1,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 5,
+          "nprobe": 5,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         }
@@ -853,37 +853,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         }
@@ -903,37 +903,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -953,37 +953,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1003,37 +1003,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         }
@@ -1053,37 +1053,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1103,37 +1103,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1153,37 +1153,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         }
@@ -1203,37 +1203,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         }
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json
index 7c95ceb439..8b9f1cfb35 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json
@@ -3,711 +3,1349 @@
     "name": "glove-100-inner",
     "base_file": "glove-100-inner/base.fbin",
     "query_file": "glove-100-inner/query.fbin",
-    "groundtruth_neighbors_file": "glove-100-inner/groundtruth.neighbors.ibin",
-    "distance": "inner_product"
+    "distance": "euclidean"
   },
-
   "search_basic_param": {
-    "batch_size": 1,
-    "k": 10
+    "batch_size": 5000,
+    "k": 10,
+    "run_count": 3
   },
-
   "index": [
     {
-      "name": "hnswlib.M4",
-      "algo": "hnswlib",
-      "build_param": {"M":4, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M4",
-      "search_params": [
-        {"ef":10, "numThreads":1},
-        {"ef":20, "numThreads":1},
-        {"ef":40, "numThreads":1},
-        {"ef":80, "numThreads":1},
-        {"ef":120, "numThreads":1},
-        {"ef":200, "numThreads":1},
-        {"ef":400, "numThreads":1},
-        {"ef":600, "numThreads":1},
-        {"ef":800, "numThreads":1}
-      ]
-    },
-    {
-      "name": "hnswlib.M8",
-      "algo": "hnswlib",
-      "build_param": {"M":8, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M8",
-      "search_params": [
+      "name" : "hnswlib.M12",
+      "algo" : "hnswlib",
+      "build_param": {"M":12, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-100-inner/hnswlib/M12",
+      "search_params" : [
         {"ef":10, "numThreads":1},
         {"ef":20, "numThreads":1},
         {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
         {"ef":80, "numThreads":1},
         {"ef":120, "numThreads":1},
         {"ef":200, "numThreads":1},
         {"ef":400, "numThreads":1},
         {"ef":600, "numThreads":1},
         {"ef":800, "numThreads":1}
-      ]
+      ],
+      "search_result_file" : "result/glove-100-inner/hnswlib/M12"
     },
     {
-      "name": "hnswlib.M12",
-      "algo": "hnswlib",
-      "build_param": {"M":12, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M12",
-      "search_params": [
+      "name" : "hnswlib.M16",
+      "algo" : "hnswlib",
+      "build_param": {"M":16, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-100-inner/hnswlib/M16",
+      "search_params" : [
         {"ef":10, "numThreads":1},
         {"ef":20, "numThreads":1},
         {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
         {"ef":80, "numThreads":1},
         {"ef":120, "numThreads":1},
         {"ef":200, "numThreads":1},
         {"ef":400, "numThreads":1},
         {"ef":600, "numThreads":1},
         {"ef":800, "numThreads":1}
-      ]
+      ],
+      "search_result_file" : "result/glove-100-inner/hnswlib/M16"
     },
     {
-      "name": "hnswlib.M16",
-      "algo": "hnswlib",
-      "build_param": {"M":16, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M16",
-      "search_params": [
+      "name" : "hnswlib.M24",
+      "algo" : "hnswlib",
+      "build_param": {"M":24, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-100-inner/hnswlib/M24",
+      "search_params" : [
         {"ef":10, "numThreads":1},
         {"ef":20, "numThreads":1},
         {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
         {"ef":80, "numThreads":1},
         {"ef":120, "numThreads":1},
         {"ef":200, "numThreads":1},
         {"ef":400, "numThreads":1},
         {"ef":600, "numThreads":1},
         {"ef":800, "numThreads":1}
-      ]
+      ],
+      "search_result_file" : "result/glove-100-inner/hnswlib/M24"
     },
     {
-      "name": "hnswlib.M24",
-      "algo": "hnswlib",
-      "build_param": {"M":24, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M24",
-      "search_params": [
+      "name" : "hnswlib.M36",
+      "algo" : "hnswlib",
+      "build_param": {"M":36, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-100-inner/hnswlib/M36",
+      "search_params" : [
         {"ef":10, "numThreads":1},
         {"ef":20, "numThreads":1},
         {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
         {"ef":80, "numThreads":1},
         {"ef":120, "numThreads":1},
         {"ef":200, "numThreads":1},
         {"ef":400, "numThreads":1},
         {"ef":600, "numThreads":1},
         {"ef":800, "numThreads":1}
-      ]
-    },
-    {
-      "name": "hnswlib.M36",
-      "algo": "hnswlib",
-      "build_param": {"M":36, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M36",
-      "search_params": [
-        {"ef":10, "numThreads":1},
-        {"ef":20, "numThreads":1},
-        {"ef":40, "numThreads":1},
-        {"ef":80, "numThreads":1},
-        {"ef":120, "numThreads":1},
-        {"ef":200, "numThreads":1},
-        {"ef":400, "numThreads":1},
-        {"ef":600, "numThreads":1},
-        {"ef":800, "numThreads":1}
-      ]
-    },
-    {
-      "name": "hnswlib.M48",
-      "algo": "hnswlib",
-      "build_param": {"M":48, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M48",
-      "search_params": [
-        {"ef":10, "numThreads":1},
-        {"ef":20, "numThreads":1},
-        {"ef":40, "numThreads":1},
-        {"ef":80, "numThreads":1},
-        {"ef":120, "numThreads":1},
-        {"ef":200, "numThreads":1},
-        {"ef":400, "numThreads":1},
-        {"ef":600, "numThreads":1},
-        {"ef":800, "numThreads":1}
-      ]
-    },
-    {
-      "name": "hnswlib.M64",
-      "algo": "hnswlib",
-      "build_param": {"M":64, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M64",
-      "search_params": [
-        {"ef":10, "numThreads":1},
-        {"ef":20, "numThreads":1},
-        {"ef":40, "numThreads":1},
-        {"ef":80, "numThreads":1},
-        {"ef":120, "numThreads":1},
-        {"ef":200, "numThreads":1},
-        {"ef":400, "numThreads":1},
-        {"ef":600, "numThreads":1},
-        {"ef":800, "numThreads":1}
-      ]
+      ],
+      "search_result_file" : "result/glove-100-inner/hnswlib/M36"
     },
+
+
+
+
     {
-      "name": "hnswlib.M96",
-      "algo": "hnswlib",
-      "build_param": {"M":96, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M96",
+      "name": "raft_bfknn",
+      "algo": "raft_bfknn",
+      "build_param": {},
+      "file": "index/glove-100-inner/raft_bfknn/bfknn",
       "search_params": [
-        {"ef":10, "numThreads":1},
-        {"ef":20, "numThreads":1},
-        {"ef":40, "numThreads":1},
-        {"ef":80, "numThreads":1},
-        {"ef":120, "numThreads":1},
-        {"ef":200, "numThreads":1},
-        {"ef":400, "numThreads":1},
-        {"ef":600, "numThreads":1},
-        {"ef":800, "numThreads":1}
-      ]
+        {
+          "probe": 1
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_bfknn/bfknn"
     },
     {
       "name": "faiss_ivf_flat.nlist1024",
       "algo": "faiss_gpu_ivf_flat",
-      "build_param": {"nlist":1024},
-      "file": "glove-100-inner/faiss_ivf_flat/nlist1024",
+      "build_param": {
+        "nlist": 1024
+      },
+      "file": "index/glove-100-inner/faiss_ivf_flat/nlist1024",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_flat/nlist1024"
     },
     {
       "name": "faiss_ivf_flat.nlist2048",
       "algo": "faiss_gpu_ivf_flat",
-      "build_param": {"nlist":2048},
-      "file": "glove-100-inner/faiss_ivf_flat/nlist2048",
+      "build_param": {
+        "nlist": 2048
+      },
+      "file": "index/glove-100-inner/faiss_ivf_flat/nlist2048",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_flat/nlist2048"
     },
     {
       "name": "faiss_ivf_flat.nlist4096",
       "algo": "faiss_gpu_ivf_flat",
-      "build_param": {"nlist":4096},
-      "file": "glove-100-inner/faiss_ivf_flat/nlist4096",
+      "build_param": {
+        "nlist": 4096
+      },
+      "file": "index/glove-100-inner/faiss_ivf_flat/nlist4096",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_flat/nlist4096"
     },
     {
       "name": "faiss_ivf_flat.nlist8192",
       "algo": "faiss_gpu_ivf_flat",
-      "build_param": {"nlist":8192},
-      "file": "glove-100-inner/faiss_ivf_flat/nlist8192",
+      "build_param": {
+        "nlist": 8192
+      },
+      "file": "index/glove-100-inner/faiss_ivf_flat/nlist8192",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_flat/nlist8192"
     },
     {
       "name": "faiss_ivf_flat.nlist16384",
       "algo": "faiss_gpu_ivf_flat",
-      "build_param": {"nlist":16384},
-      "file": "glove-100-inner/faiss_ivf_flat/nlist16384",
+      "build_param": {
+        "nlist": 16384
+      },
+      "file": "index/glove-100-inner/faiss_ivf_flat/nlist16384",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_flat/nlist16384"
     },
-
-
-
     {
-      "name": "faiss_ivf_pq.M2-nlist1024",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":1024, "M":2},
-      "file": "glove-100-inner/faiss_ivf_pq/M2-nlist1024",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M2-nlist2048",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":2048, "M":2},
-      "file": "glove-100-inner/faiss_ivf_pq/M2-nlist2048",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M2-nlist4096",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":4096, "M":2},
-      "file": "glove-100-inner/faiss_ivf_pq/M2-nlist4096",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M2-nlist8192",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":8192, "M":2},
-      "file": "glove-100-inner/faiss_ivf_pq/M2-nlist8192",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M2-nlist16384",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":16384, "M":2},
-      "file": "glove-100-inner/faiss_ivf_pq/M2-nlist16384",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M4-nlist1024",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":1024, "M":4},
-      "file": "glove-100-inner/faiss_ivf_pq/M4-nlist1024",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M4-nlist2048",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":2048, "M":4},
-      "file": "glove-100-inner/faiss_ivf_pq/M4-nlist2048",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M4-nlist4096",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":4096, "M":4},
-      "file": "glove-100-inner/faiss_ivf_pq/M4-nlist4096",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M4-nlist8192",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":8192, "M":4},
-      "file": "glove-100-inner/faiss_ivf_pq/M4-nlist8192",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M4-nlist16384",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":16384, "M":4},
-      "file": "glove-100-inner/faiss_ivf_pq/M4-nlist16384",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M20-nlist1024",
+      "name": "faiss_ivf_pq.M64-nlist1024",
       "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":1024, "M":20},
-      "file": "glove-100-inner/faiss_ivf_pq/M20-nlist1024",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M20-nlist2048",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":2048, "M":20},
-      "file": "glove-100-inner/faiss_ivf_pq/M20-nlist2048",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M20-nlist4096",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":4096, "M":20},
-      "file": "glove-100-inner/faiss_ivf_pq/M20-nlist4096",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M20-nlist8192",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":8192, "M":20},
-      "file": "glove-100-inner/faiss_ivf_pq/M20-nlist8192",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": true
+      },
+      "file": "index/glove-100-inner/faiss_ivf_pq/M64-nlist1024",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M20-nlist16384",
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_pq/M64-nlist1024"
+    },
+    {
+      "name": "faiss_ivf_pq.M64-nlist1024.noprecomp",
       "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":16384, "M":20},
-      "file": "glove-100-inner/faiss_ivf_pq/M20-nlist16384",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": false
+      },
+      "file": "index/glove-100-inner/faiss_ivf_pq/M64-nlist1024.noprecomp",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_pq/M64-nlist1024.noprecomp"
     },
-
-
     {
       "name": "faiss_ivf_sq.nlist1024-fp16",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":1024, "quantizer_type":"fp16"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist1024-fp16",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist1024-fp16",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist1024-fp16"
     },
     {
       "name": "faiss_ivf_sq.nlist2048-fp16",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":2048, "quantizer_type":"fp16"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist2048-fp16",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist2048-fp16",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist2048-fp16"
     },
     {
       "name": "faiss_ivf_sq.nlist4096-fp16",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":4096, "quantizer_type":"fp16"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist4096-fp16",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist4096-fp16",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist4096-fp16"
     },
     {
       "name": "faiss_ivf_sq.nlist8192-fp16",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":8192, "quantizer_type":"fp16"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist8192-fp16",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist8192-fp16",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist8192-fp16"
     },
     {
       "name": "faiss_ivf_sq.nlist16384-fp16",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":16384, "quantizer_type":"fp16"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist16384-fp16",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist16384-fp16",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist16384-fp16"
     },
     {
       "name": "faiss_ivf_sq.nlist1024-int8",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":1024, "quantizer_type":"int8"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist1024-int8",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist1024-int8",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist1024-int8"
     },
     {
       "name": "faiss_ivf_sq.nlist2048-int8",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":2048, "quantizer_type":"int8"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist2048-int8",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist2048-int8",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist2048-int8"
     },
     {
       "name": "faiss_ivf_sq.nlist4096-int8",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":4096, "quantizer_type":"int8"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist4096-int8",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist4096-int8",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist4096-int8"
     },
     {
       "name": "faiss_ivf_sq.nlist8192-int8",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":8192, "quantizer_type":"int8"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist8192-int8",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist8192-int8",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist8192-int8"
     },
     {
       "name": "faiss_ivf_sq.nlist16384-int8",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":16384, "quantizer_type":"int8"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist16384-int8",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist16384-int8",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist16384-int8"
     },
     {
       "name": "faiss_flat",
       "algo": "faiss_gpu_flat",
       "build_param": {},
-      "file": "glove-100-inner/faiss_flat/flat",
-      "search_params": [{}]
+      "file": "index/glove-100-inner/faiss_flat/flat",
+      "search_params": [
+        {}
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_flat/flat"
+    },
+
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 5,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-half"
     },
     {
-      "name": "ggnn.kbuild96-segment64-refine2-k10",
-      "algo": "ggnn",
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
       "build_param": {
-        "k_build": 96,
-        "segment_size": 64,
-        "refine_iterations": 2,
-        "dataset_size": 1183514,
-        "k": 10
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
       },
-      "file": "glove-100-inner/ggnn/kbuild96-segment64-refine2-k10",
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8",
       "search_params": [
-        {"tau":0.001, "block_dim":64, "sorted_size":32},
-        {"tau":0.005, "block_dim":64, "sorted_size":32},
-        {"tau":0.01,  "block_dim":64, "sorted_size":32},
-        {"tau":0.02,  "block_dim":64, "sorted_size":32},
-        {"tau":0.03,  "block_dim":64, "sorted_size":32},
-        {"tau":0.04,  "block_dim":64, "sorted_size":32},
-        {"tau":0.05,  "block_dim":64, "sorted_size":32},
-        {"tau":0.06,  "block_dim":64, "sorted_size":32},
-        {"tau":0.09,  "block_dim":64, "sorted_size":32},
-        {"tau":0.12,  "block_dim":64, "sorted_size":32},
-        {"tau":0.18,  "block_dim":64, "sorted_size":32},
-        {"tau":0.21,  "block_dim":64, "sorted_size":32},
-        {"tau":0.24,  "block_dim":64, "sorted_size":32},
-        {"tau":0.27,  "block_dim":64, "sorted_size":32},
-        {"tau":0.3,   "block_dim":64, "sorted_size":32},
-        {"tau":0.4,   "block_dim":64, "sorted_size":32},
-        {"tau":0.01, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.02, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.03, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.04, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.05, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.06, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.09, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.12, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.18, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.21, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.24, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.27, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.3,  "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.4,  "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.5,  "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}
-      ]
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq64-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq64-cluster1024-float-half"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 32,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq16-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 16,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-half-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-half-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-half-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq512-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 512,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq512-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq512-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_flat.nlist1024",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 1024,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_flat/nlist1024",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_flat/nlist1024"
+    },
+    {
+      "name": "raft_ivf_flat.nlist16384",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 16384,
+        "ratio": 2,
+        "niter": 20
+      },
+      "file": "index/glove-100-inner/raft_ivf_flat/nlist16384",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_flat/nlist16384"
+    },
+
+    {
+      "name" : "raft_cagra.dim32",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 32
+      },
+      "file" : "index/glove-100-inner/raft_cagra/dim32",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/glove-100-inner/raft_cagra/dim32"
+    },
+
+    {
+      "name" : "raft_cagra.dim64",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 64
+      },
+      "file" : "index/glove-100-inner/raft_cagra/dim64",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/glove-100-inner/raft_cagra/dim64"
     }
   ]
 }
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-angular.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-angular.json
index 9b3f192c9f..0f02620cb2 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-angular.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-angular.json
@@ -735,37 +735,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         }
@@ -785,55 +785,55 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 1,
+          "nprobe": 1,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1,
+          "nprobe": 1,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 5,
+          "nprobe": 5,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         }
@@ -853,37 +853,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         }
@@ -903,37 +903,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -953,37 +953,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1003,37 +1003,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         }
@@ -1053,37 +1053,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1103,37 +1103,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1153,37 +1153,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         }
@@ -1203,37 +1203,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         }
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-inner.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-inner.json
new file mode 100644
index 0000000000..41dec5adb3
--- /dev/null
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-inner.json
@@ -0,0 +1,1351 @@
+{
+  "dataset": {
+    "name": "glove-50-inner",
+    "base_file": "glove-50-inner/base.fbin",
+    "query_file": "glove-50-inner/query.fbin",
+    "distance": "euclidean"
+  },
+  "search_basic_param": {
+    "batch_size": 5000,
+    "k": 10,
+    "run_count": 3
+  },
+  "index": [
+    {
+      "name" : "hnswlib.M12",
+      "algo" : "hnswlib",
+      "build_param": {"M":12, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-50-inner/hnswlib/M12",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/glove-50-inner/hnswlib/M12"
+    },
+    {
+      "name" : "hnswlib.M16",
+      "algo" : "hnswlib",
+      "build_param": {"M":16, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-50-inner/hnswlib/M16",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/glove-50-inner/hnswlib/M16"
+    },
+    {
+      "name" : "hnswlib.M24",
+      "algo" : "hnswlib",
+      "build_param": {"M":24, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-50-inner/hnswlib/M24",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/glove-50-inner/hnswlib/M24"
+    },
+    {
+      "name" : "hnswlib.M36",
+      "algo" : "hnswlib",
+      "build_param": {"M":36, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-50-inner/hnswlib/M36",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/glove-50-inner/hnswlib/M36"
+    },
+
+
+
+
+    {
+      "name": "raft_bfknn",
+      "algo": "raft_bfknn",
+      "build_param": {},
+      "file": "index/glove-50-inner/raft_bfknn/bfknn",
+      "search_params": [
+        {
+          "probe": 1
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_bfknn/bfknn"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist1024",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 1024
+      },
+      "file": "index/glove-50-inner/faiss_ivf_flat/nlist1024",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_flat/nlist1024"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist2048",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 2048
+      },
+      "file": "index/glove-50-inner/faiss_ivf_flat/nlist2048",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_flat/nlist2048"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist4096",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 4096
+      },
+      "file": "index/glove-50-inner/faiss_ivf_flat/nlist4096",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_flat/nlist4096"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist8192",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 8192
+      },
+      "file": "index/glove-50-inner/faiss_ivf_flat/nlist8192",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_flat/nlist8192"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist16384",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 16384
+      },
+      "file": "index/glove-50-inner/faiss_ivf_flat/nlist16384",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_flat/nlist16384"
+    },
+    {
+      "name": "faiss_ivf_pq.M64-nlist1024",
+      "algo": "faiss_gpu_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": true
+      },
+      "file": "index/glove-50-inner/faiss_ivf_pq/M64-nlist1024",
+      "search_params": [
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_pq/M64-nlist1024"
+    },
+    {
+      "name": "faiss_ivf_pq.M64-nlist1024.noprecomp",
+      "algo": "faiss_gpu_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": false
+      },
+      "file": "index/glove-50-inner/faiss_ivf_pq/M64-nlist1024.noprecomp",
+      "search_params": [
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_pq/M64-nlist1024.noprecomp"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist1024-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist1024-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist1024-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist2048-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist2048-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist2048-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist4096-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist4096-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist4096-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist8192-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist8192-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist8192-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist16384-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist16384-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist16384-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist1024-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist1024-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist1024-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist2048-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist2048-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist2048-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist4096-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist4096-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist4096-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist8192-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist8192-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist8192-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist16384-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist16384-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist16384-int8"
+    },
+    {
+      "name": "faiss_flat",
+      "algo": "faiss_gpu_flat",
+      "build_param": {},
+      "file": "index/glove-50-inner/faiss_flat/flat",
+      "search_params": [
+        {}
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_flat/flat"
+    },
+
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 5,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-half"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq64-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq64-cluster1024-float-half"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 32,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq16-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 16,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-half-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-half-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-half-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq512-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 512,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq512-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq512-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_flat.nlist1024",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 1024,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_flat/nlist1024",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_flat/nlist1024"
+    },
+    {
+      "name": "raft_ivf_flat.nlist16384",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 16384,
+        "ratio": 2,
+        "niter": 20
+      },
+      "file": "index/glove-50-inner/raft_ivf_flat/nlist16384",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_flat/nlist16384"
+    },
+
+    {
+      "name" : "raft_cagra.dim32",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 32
+      },
+      "file" : "index/glove-50-inner/raft_cagra/dim32",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/glove-50-inner/raft_cagra/dim32"
+    },
+
+    {
+      "name" : "raft_cagra.dim64",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 64
+      },
+      "file" : "index/glove-50-inner/raft_cagra/dim64",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/glove-50-inner/raft_cagra/dim64"
+    }
+  ]
+}
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json
index 2a493edeed..343deb8927 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json
@@ -3,6 +3,7 @@
     "name": "mnist-784-euclidean",
     "base_file": "mnist-784-euclidean/base.fbin",
     "query_file": "mnist-784-euclidean/query.fbin",
+    "groundtruth_neighbors_file": "mnist-784-euclidean/groundtruth.neighbors.ibin",
     "distance": "euclidean"
   },
   "search_basic_param": {
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-angular.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-angular.json
index 630b700ba5..e94a9969d9 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-angular.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-angular.json
@@ -3,6 +3,7 @@
     "name": "nytimes-256-angular",
     "base_file": "nytimes-256-angular/base.fbin",
     "query_file": "nytimes-256-angular/query.fbin",
+    "groundtruth_neighbors_file": "nytimes-256-angular/groundtruth.neighbors.ibin",
     "distance": "euclidean"
   },
   "search_basic_param": {
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-inner.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-inner.json
new file mode 100644
index 0000000000..f849abad35
--- /dev/null
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-inner.json
@@ -0,0 +1,1352 @@
+{
+  "dataset": {
+    "name": "nytimes-256-inner",
+    "base_file": "nytimes-256-inner/base.fbin",
+    "query_file": "nytimes-256-inner/query.fbin",
+    "groundtruth_neighbors_file": "nytimes-256-inner/groundtruth.neighbors.ibin",
+    "distance": "euclidean"
+  },
+  "search_basic_param": {
+    "batch_size": 5000,
+    "k": 10,
+    "run_count": 3
+  },
+  "index": [
+    {
+      "name" : "hnswlib.M12",
+      "algo" : "hnswlib",
+      "build_param": {"M":12, "efConstruction":500, "numThreads":32},
+      "file" : "index/nytimes-256-inner/hnswlib/M12",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/hnswlib/M12"
+    },
+    {
+      "name" : "hnswlib.M16",
+      "algo" : "hnswlib",
+      "build_param": {"M":16, "efConstruction":500, "numThreads":32},
+      "file" : "index/nytimes-256-inner/hnswlib/M16",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/hnswlib/M16"
+    },
+    {
+      "name" : "hnswlib.M24",
+      "algo" : "hnswlib",
+      "build_param": {"M":24, "efConstruction":500, "numThreads":32},
+      "file" : "index/nytimes-256-inner/hnswlib/M24",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/hnswlib/M24"
+    },
+    {
+      "name" : "hnswlib.M36",
+      "algo" : "hnswlib",
+      "build_param": {"M":36, "efConstruction":500, "numThreads":32},
+      "file" : "index/nytimes-256-inner/hnswlib/M36",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/hnswlib/M36"
+    },
+
+
+
+
+    {
+      "name": "raft_bfknn",
+      "algo": "raft_bfknn",
+      "build_param": {},
+      "file": "index/nytimes-256-inner/raft_bfknn/bfknn",
+      "search_params": [
+        {
+          "probe": 1
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_bfknn/bfknn"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist1024",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 1024
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_flat/nlist1024",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_flat/nlist1024"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist2048",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 2048
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_flat/nlist2048",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_flat/nlist2048"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist4096",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 4096
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_flat/nlist4096",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_flat/nlist4096"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist8192",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 8192
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_flat/nlist8192",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_flat/nlist8192"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist16384",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 16384
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_flat/nlist16384",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_flat/nlist16384"
+    },
+    {
+      "name": "faiss_ivf_pq.M64-nlist1024",
+      "algo": "faiss_gpu_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": true
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_pq/M64-nlist1024",
+      "search_params": [
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_pq/M64-nlist1024"
+    },
+    {
+      "name": "faiss_ivf_pq.M64-nlist1024.noprecomp",
+      "algo": "faiss_gpu_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": false
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_pq/M64-nlist1024.noprecomp",
+      "search_params": [
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_pq/M64-nlist1024.noprecomp"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist1024-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist1024-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist1024-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist2048-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist2048-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist2048-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist4096-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist4096-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist4096-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist8192-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist8192-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist8192-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist16384-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist16384-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist16384-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist1024-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "int8"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist1024-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist1024-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist2048-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "int8"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist2048-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist2048-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist4096-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "int8"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist4096-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist4096-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist8192-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "int8"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist8192-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist8192-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist16384-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "int8"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist16384-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist16384-int8"
+    },
+    {
+      "name": "faiss_flat",
+      "algo": "faiss_gpu_flat",
+      "build_param": {},
+      "file": "index/nytimes-256-inner/faiss_flat/flat",
+      "search_params": [
+        {}
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_flat/flat"
+    },
+
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 5,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-half"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq64-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq64-cluster1024-float-half"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 32,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq16-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 16,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-half-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-half-float",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-half-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq512-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 512,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq512-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq512-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_flat.nlist1024",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 1024,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_flat/nlist1024",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_flat/nlist1024"
+    },
+    {
+      "name": "raft_ivf_flat.nlist16384",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 16384,
+        "ratio": 2,
+        "niter": 20
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_flat/nlist16384",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_flat/nlist16384"
+    },
+
+    {
+      "name" : "raft_cagra.dim32",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 32
+      },
+      "file" : "index/nytimes-256-inner/raft_cagra/dim32",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/raft_cagra/dim32"
+    },
+
+    {
+      "name" : "raft_cagra.dim64",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 64
+      },
+      "file" : "index/nytimes-256-inner/raft_cagra/dim64",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/raft_cagra/dim64"
+    }
+  ]
+}
diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt
index f9a20b46bb..af98f31857 100644
--- a/python/raft-dask/CMakeLists.txt
+++ b/python/raft-dask/CMakeLists.txt
@@ -17,6 +17,8 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
 set(raft_dask_version 23.10.00)
 
 include(../../fetch_rapids.cmake)
+include(rapids-cuda)
+rapids_cuda_init_architectures(raft-dask-python)
 
 project(
   raft-dask-python
@@ -25,7 +27,7 @@ project(
             # language to be enabled here. The test project that is built in scikit-build to verify
             # various linking options for the python library is hardcoded to build with C, so until
             # that is fixed we need to keep C.
-            C CXX
+            C CXX CUDA
 )
 
 option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files"
@@ -42,14 +44,6 @@ else()
 endif()
 
 if(NOT raft_FOUND)
-  # TODO: This will not be necessary once we upgrade to CMake 3.22, which will pull in the required
-  # languages for the C++ project even if this project does not require those languages.
-  include(rapids-cuda)
-  rapids_cuda_init_architectures(raft-dask)
-  enable_language(CUDA)
-  # Since raft-dask only enables CUDA optionally we need to manually include the file that
-  # rapids_cuda_init_architectures relies on `project` including.
-  include("${CMAKE_PROJECT_raft-dask_INCLUDE}")
   find_package(ucx REQUIRED)
 
   # raft-dask doesn't actually use raft libraries, it just needs the headers, so we can turn off all
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index bdbcf61e0f..3e0ffc2848 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -35,8 +35,8 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "dask-cuda==23.10.*",
-    "dask>=2023.7.1",
-    "distributed>=2023.7.1",
+    "dask==2023.9.2",
+    "distributed==2023.9.2",
     "joblib>=0.11",
     "numba>=0.57",
     "numpy>=1.21",