diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000000..9d35e3f97f
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,30 @@
+# syntax=docker/dockerfile:1.5
+
+ARG BASE
+ARG PYTHON_PACKAGE_MANAGER=conda
+
+FROM ${BASE} as pip-base
+
+ENV DEFAULT_VIRTUAL_ENV=rapids
+
+FROM ${BASE} as conda-base
+
+ENV DEFAULT_CONDA_ENV=rapids
+
+FROM ${PYTHON_PACKAGE_MANAGER}-base
+
+ARG CUDA
+ENV CUDAARCHS="RAPIDS"
+ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}"
+
+ARG PYTHON_PACKAGE_MANAGER
+ENV PYTHON_PACKAGE_MANAGER="${PYTHON_PACKAGE_MANAGER}"
+
+ENV PYTHONSAFEPATH="1"
+ENV PYTHONUNBUFFERED="1"
+ENV PYTHONDONTWRITEBYTECODE="1"
+
+ENV SCCACHE_REGION="us-east-2"
+ENV SCCACHE_BUCKET="rapids-sccache-devs"
+ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai"
+ENV HISTFILE="/home/coder/.cache/._bash_history"
diff --git a/.devcontainer/README.md b/.devcontainer/README.md
new file mode 100644
index 0000000000..3c76b8963d
--- /dev/null
+++ b/.devcontainer/README.md
@@ -0,0 +1,64 @@
+# RAFT Development Containers
+
+This directory contains [devcontainer configurations](https://containers.dev/implementors/json_reference/) for using VSCode to [develop in a container](https://code.visualstudio.com/docs/devcontainers/containers) via the `Remote Containers` [extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) or [GitHub Codespaces](https://github.com/codespaces).
+
+This container is a turnkey development environment for building and testing the RAFT C++ and Python libraries.
+
+## Table of Contents
+
+* [Prerequisites](#prerequisites)
+* [Host bind mounts](#host-bind-mounts)
+* [Launch a Dev Container](#launch-a-dev-container)
+
+## Prerequisites
+
+* [VSCode](https://code.visualstudio.com/download)
+* [VSCode Remote Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
+
+## Host bind mounts
+
+By default, the following directories are bind-mounted into the devcontainer:
+
+* `${repo}:/home/coder/raft`
+* `${repo}/../.aws:/home/coder/.aws`
+* `${repo}/../.local:/home/coder/.local`
+* `${repo}/../.cache:/home/coder/.cache`
+* `${repo}/../.conda:/home/coder/.conda`
+* `${repo}/../.config:/home/coder/.config`
+
+This ensures caches, configurations, dependencies, and your commits are persisted on the host across container runs.
+
+## Launch a Dev Container
+
+To launch a devcontainer from VSCode, open the RAFT repo and select the "Reopen in Container" button in the bottom right.
+ +Alternatively, open the VSCode command palette (typically `cmd/ctrl + shift + P`) and run the "Rebuild and Reopen in Container" command. + +## Using the devcontainer + +On startup, the devcontainer creates or updates the conda/pip environment using `raft/dependencies.yaml`. + +The container includes convenience functions to clean, configure, and build the various RAFT components: + +```shell +$ clean-raft-cpp # only cleans the C++ build dir +$ clean-pylibraft-python # only cleans the Python build dir +$ clean-raft # cleans both C++ and Python build dirs + +$ configure-raft-cpp # only configures raft C++ lib + +$ build-raft-cpp # only builds raft C++ lib +$ build-pylibraft-python # only builds raft Python lib +$ build-raft # builds both C++ and Python libs +``` + +* The C++ build script is a small wrapper around `cmake -S ~/raft/cpp -B ~/raft/cpp/build` and `cmake --build ~/raft/cpp/build` +* The Python build script is a small wrapper around `pip install --editable ~/raft/cpp` + +Unlike `build.sh`, these convenience scripts *don't* install the libraries after building them. Instead, they automatically inject the correct arguments to build the C++ libraries from source and use their build dirs as package roots: + +```shell +$ cmake -S ~/raft/cpp -B ~/raft/cpp/build +$ CMAKE_ARGS="-Draft_ROOT=~/raft/cpp/build" \ # <-- this argument is automatic + pip install -e ~/raft/cpp +``` diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json new file mode 100644 index 0000000000..203f52f1a2 --- /dev/null +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "11.8", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda11.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json new file mode 100644 index 0000000000..080ece996e --- /dev/null +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -0,0 +1,38 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "11.8", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda11.8-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/ucx:23.12": {"version": "1.14.1"}, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/ucx", + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json new file mode 100644 index 0000000000..da8bfb4db9 --- /dev/null +++ b/.devcontainer/cuda12.0-conda/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "12.0", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:23.12-cpp-mambaforge-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.0-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json new file mode 100644 index 0000000000..e2bee94f8a --- /dev/null +++ b/.devcontainer/cuda12.0-pip/devcontainer.json @@ -0,0 +1,38 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "12.0", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda12.0-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/ucx:23.12": {"version": "1.14.1"}, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/ucx", + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c2b318df47..1b7fb8e1a5 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -22,6 +22,7 @@ jobs: - wheel-tests-pylibraft - wheel-build-raft-dask - wheel-tests-raft-dask + - devcontainer secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-120-arm checks: @@ -92,3 +93,11 @@ jobs: with: build_type: pull-request script: ci/test_wheel_raft_dask.sh + devcontainer: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.12 + with: + build_command: | + sccache -z; + build-all -DBUILD_PRIMS_BENCH=ON -DBUILD_ANN_BENCH=ON --verbose; + sccache -s; diff --git a/.gitignore b/.gitignore index 7939fc1622..11b7bc3eba 100644 --- a/.gitignore +++ b/.gitignore @@ -62,3 +62,7 @@ _xml # sphinx _html _text + +# clang tooling +compile_commands.json +.clangd/ diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 7a69b95da1..a867a71f68 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -91,3 +91,10 @@ sed_runner "/^PROJECT_NUMBER/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" cpp/doxygen/Doxy sed_runner "/^set(RAFT_VERSION/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" docs/source/build.md sed_runner "/GIT_TAG.*branch-/ s|branch-.*|branch-${NEXT_SHORT_TAG}|g" docs/source/build.md sed_runner "/rapidsai\/raft/ s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" docs/source/developer_guide.md + +# .devcontainer files +find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do + sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" +done diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh index a20e950313..a9ae5dcabb 100755 --- a/ci/test_wheel_raft_dask.sh +++ b/ci/test_wheel_raft_dask.sh @@ -12,7 +12,7 @@ RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl # Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.12 +python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 
git+https://github.com/rapidsai/dask-cuda.git@branch-23.12 # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/raft_dask*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index dc27a0aa32..fcbf1451a8 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -10,7 +10,7 @@ dependencies: - breathe - c-compiler - clang-tools=16.0.6 -- clang=16.0.6 +- clang==16.0.6 - cmake>=3.26.4 - cuda-profiler-api=11.8.86 - cuda-python>=11.7.1,<12.0a0 @@ -19,10 +19,10 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.7.1 +- dask-core==2023.9.2 - dask-cuda==23.12.* -- dask>=2023.7.1 -- distributed>=2023.7.1 +- dask==2023.9.2 +- distributed==2023.9.2 - doxygen>=1.8.20 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -43,6 +43,8 @@ dependencies: - numba>=0.57 - numpy>=1.21 - numpydoc +- nvcc_linux-64=11.8 +- pre-commit - pydata-sphinx-theme - pytest - pytest-cov diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 019679592f..a1e22f50a0 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -10,19 +10,20 @@ dependencies: - breathe - c-compiler - clang-tools=16.0.6 -- clang=16.0.6 +- clang==16.0.6 - cmake>=3.26.4 - cuda-cudart-dev +- cuda-nvcc - cuda-profiler-api - cuda-python>=12.0,<13.0a0 - cuda-version=12.0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.7.1 +- dask-core==2023.9.2 - dask-cuda==23.12.* -- dask>=2023.7.1 -- distributed>=2023.7.1 +- dask==2023.9.2 +- distributed==2023.9.2 - doxygen>=1.8.20 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -39,6 +40,7 @@ dependencies: - numba>=0.57 - numpy>=1.21 - numpydoc +- pre-commit - pydata-sphinx-theme - pytest - pytest-cov diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 5a9ef5bd32..4f1df12dfa 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -10,7 +10,7 @@ dependencies: - benchmark>=1.8.2 - c-compiler - clang-tools=16.0.6 -- clang=16.0.6 +- clang==16.0.6 - cmake>=3.26.4 - cuda-profiler-api=11.8.86 - cuda-version=11.8 @@ -34,6 +34,7 @@ dependencies: - nccl>=2.9.9 - ninja - nlohmann_json>=3.11.2 +- nvcc_linux-64=11.8 - scikit-build>=0.13.1 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index c9caa4dd9b..04dfef5063 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -60,10 +60,10 @@ requirements: - cudatoolkit {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - - dask >=2023.7.1 - - dask-core >=2023.7.1 + - dask ==2023.9.2 + - dask-core ==2023.9.2 - dask-cuda ={{ minor_version }} - - distributed >=2023.7.1 + - distributed ==2023.9.2 - joblib >=0.11 - nccl >=2.9.9 - pylibraft {{ version }} diff --git a/cpp/.clangd b/cpp/.clangd new file mode 100644 index 0000000000..7c4fe036dd --- /dev/null +++ b/cpp/.clangd @@ -0,0 +1,65 @@ +# https://clangd.llvm.org/config + +# Apply a config conditionally to all C files +If: + PathMatch: .*\.(c|h)$ + +--- + +# Apply a config conditionally to all C++ files +If: + PathMatch: .*\.(c|h)pp + +--- + +# Apply a config conditionally to all CUDA files +If: + PathMatch: 
.*\.cuh? +CompileFlags: + Add: + - "-x" + - "cuda" + # No error on unknown CUDA versions + - "-Wno-unknown-cuda-version" + # Allow variadic CUDA functions + - "-Xclang=-fcuda-allow-variadic-functions" +Diagnostics: + Suppress: + - "variadic_device_fn" + - "attributes_not_allowed" + +--- + +# Tweak the clangd parse settings for all files +CompileFlags: + Add: + # report all errors + - "-ferror-limit=0" + - "-fmacro-backtrace-limit=0" + - "-ftemplate-backtrace-limit=0" + # Skip the CUDA version check + - "--no-cuda-version-check" + Remove: + # remove gcc's -fcoroutines + - -fcoroutines + # remove nvc++ flags unknown to clang + - "-gpu=*" + - "-stdpar*" + # remove nvcc flags unknown to clang + - "-arch*" + - "-gencode*" + - "--generate-code*" + - "-ccbin*" + - "-t=*" + - "--threads*" + - "-Xptxas*" + - "-Xcudafe*" + - "-Xfatbin*" + - "-Xcompiler*" + - "--diag-suppress*" + - "--diag_suppress*" + - "--compiler-options*" + - "--expt-extended-lambda" + - "--expt-relaxed-constexpr" + - "-forward-unknown-to-host-compiler" + - "-Werror=cross-execution-space-call" diff --git a/cpp/bench/prims/neighbors/cagra_bench.cuh b/cpp/bench/prims/neighbors/cagra_bench.cuh index bb405088bb..63f6c14686 100644 --- a/cpp/bench/prims/neighbors/cagra_bench.cuh +++ b/cpp/bench/prims/neighbors/cagra_bench.cuh @@ -18,8 +18,10 @@ #include #include +#include #include #include +#include #include @@ -40,6 +42,8 @@ struct params { int block_size; int search_width; int max_iterations; + /** Ratio of removed indices. */ + double removed_ratio; }; template @@ -49,7 +53,8 @@ struct CagraBench : public fixture { params_(ps), queries_(make_device_matrix(handle, ps.n_queries, ps.n_dims)), dataset_(make_device_matrix(handle, ps.n_samples, ps.n_dims)), - knn_graph_(make_device_matrix(handle, ps.n_samples, ps.degree)) + knn_graph_(make_device_matrix(handle, ps.n_samples, ps.degree)), + removed_indices_bitset_(handle, ps.n_samples) { // Generate random dataset and queriees raft::random::RngState state{42}; @@ -74,6 +79,13 @@ struct CagraBench : public fixture { auto metric = raft::distance::DistanceType::L2Expanded; + auto removed_indices = + raft::make_device_vector(handle, ps.removed_ratio * ps.n_samples); + thrust::sequence( + resource::get_thrust_policy(handle), + thrust::device_pointer_cast(removed_indices.data_handle()), + thrust::device_pointer_cast(removed_indices.data_handle() + removed_indices.extent(0))); + removed_indices_bitset_.set(handle, removed_indices.view()); index_.emplace(raft::neighbors::cagra::index( handle, metric, make_const_mdspan(dataset_.view()), make_const_mdspan(knn_graph_.view()))); } @@ -95,10 +107,18 @@ struct CagraBench : public fixture { distances.data_handle(), params_.n_queries, params_.k); auto queries_v = make_const_mdspan(queries_.view()); - loop_on_state(state, [&]() { - raft::neighbors::cagra::search( - this->handle, search_params, *this->index_, queries_v, ind_v, dist_v); - }); + if (params_.removed_ratio > 0) { + auto filter = raft::neighbors::filtering::bitset_filter(removed_indices_bitset_.view()); + loop_on_state(state, [&]() { + raft::neighbors::cagra::search_with_filtering( + this->handle, search_params, *this->index_, queries_v, ind_v, dist_v, filter); + }); + } else { + loop_on_state(state, [&]() { + raft::neighbors::cagra::search( + this->handle, search_params, *this->index_, queries_v, ind_v, dist_v); + }); + } double data_size = params_.n_samples * params_.n_dims * sizeof(T); double graph_size = params_.n_samples * params_.degree * sizeof(IdxT); @@ -120,6 +140,7 @@ struct CagraBench 
: public fixture { state.counters["block_size"] = params_.block_size; state.counters["search_width"] = params_.search_width; state.counters["iterations"] = iterations; + state.counters["removed_ratio"] = params_.removed_ratio; } private: @@ -128,6 +149,7 @@ struct CagraBench : public fixture { raft::device_matrix queries_; raft::device_matrix dataset_; raft::device_matrix knn_graph_; + raft::core::bitset removed_indices_bitset_; }; inline const std::vector generate_inputs() @@ -141,7 +163,8 @@ inline const std::vector generate_inputs() {64}, // itopk_size {0}, // block_size {1}, // search_width - {0} // max_iterations + {0}, // max_iterations + {0.0} // removed_ratio ); auto inputs2 = raft::util::itertools::product({2000000ull, 10000000ull}, // n_samples {128}, // dataset dim @@ -151,7 +174,22 @@ inline const std::vector generate_inputs() {64}, // itopk_size {64, 128, 256, 512, 1024}, // block_size {1}, // search_width - {0} // max_iterations + {0}, // max_iterations + {0.0} // removed_ratio + ); + inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + + inputs2 = raft::util::itertools::product( + {2000000ull, 10000000ull}, // n_samples + {128}, // dataset dim + {1, 10, 10000}, // n_queries + {255}, // k + {64}, // knn graph degree + {300}, // itopk_size + {256}, // block_size + {2}, // search_width + {0}, // max_iterations + {0.0, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64} // removed_ratio ); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); return inputs; diff --git a/cpp/include/raft/neighbors/brute_force_types.hpp b/cpp/include/raft/neighbors/brute_force_types.hpp index cc934b7a98..19dd6b8350 100644 --- a/cpp/include/raft/neighbors/brute_force_types.hpp +++ b/cpp/include/raft/neighbors/brute_force_types.hpp @@ -66,11 +66,11 @@ struct index : ann::index { /** Dataset norms */ [[nodiscard]] inline auto norms() const -> device_vector_view { - return make_const_mdspan(norms_.value().view()); + return norms_view_.value(); } /** Whether ot not this index has dataset norms */ - [[nodiscard]] inline bool has_norms() const noexcept { return norms_.has_value(); } + [[nodiscard]] inline bool has_norms() const noexcept { return norms_view_.has_value(); } [[nodiscard]] inline T metric_arg() const noexcept { return metric_arg_; } @@ -102,10 +102,30 @@ struct index : ann::index { norms_(std::move(norms)), metric_arg_(metric_arg) { + if (norms_) { norms_view_ = make_const_mdspan(norms_.value().view()); } update_dataset(res, dataset); resource::sync_stream(res); } + /** Construct a brute force index from dataset + * + * This class stores a non-owning reference to the dataset and norms here. + * Having precomputed norms gives us a performance advantage at query time. + */ + index(raft::resources const& res, + raft::device_matrix_view dataset_view, + std::optional> norms_view, + raft::distance::DistanceType metric, + T metric_arg = 0.0) + : ann::index(), + metric_(metric), + dataset_(make_device_matrix(res, 0, 0)), + dataset_view_(dataset_view), + norms_view_(norms_view), + metric_arg_(metric_arg) + { + } + private: /** * Replace the dataset with a new dataset. 
@@ -135,6 +155,7 @@ struct index : ann::index { raft::distance::DistanceType metric_; raft::device_matrix dataset_; std::optional> norms_; + std::optional> norms_view_; raft::device_matrix_view dataset_view_; T metric_arg_; }; diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh index f96dd34e05..f9682a973f 100644 --- a/cpp/include/raft/neighbors/cagra.cuh +++ b/cpp/include/raft/neighbors/cagra.cuh @@ -391,7 +391,25 @@ void search(raft::resources const& res, /** * @brief Search ANN using the constructed index with the given sample filter. * - * See the [cagra::build](#cagra::build) documentation for a usage example. + * Usage example: + * @code{.cpp} + * using namespace raft::neighbors; + * // use default index parameters + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * // use default search parameters + * cagra::search_params search_params; + * // create a bitset to filter the search + * auto removed_indices = raft::make_device_vector(res, n_removed_indices); + * raft::core::bitset removed_indices_bitset( + * res, removed_indices.view(), dataset.extent(0)); + * // search K nearest neighbours according to a bitset + * auto neighbors = raft::make_device_matrix(res, n_queries, k); + * auto distances = raft::make_device_matrix(res, n_queries, k); + * cagra::search_with_filtering(res, search_params, index, queries, neighbors, distances, + * filtering::bitset_filter(removed_indices_bitset.view())); + * @endcode * * @tparam T data element type * @tparam IdxT type of the indices diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh index 5dcfcb3929..9392bde440 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh @@ -478,13 +478,15 @@ __global__ void apply_filter_kernel(INDEX_T* const result_indices_ptr, const INDEX_T query_id_offset, SAMPLE_FILTER_T sample_filter) { - const auto tid = threadIdx.x + blockIdx.x * blockDim.x; + constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; + const auto tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= result_buffer_size * num_queries) { return; } const auto i = tid % result_buffer_size; const auto j = tid / result_buffer_size; const auto index = i + j * lds; - if (!sample_filter(query_id_offset + j, result_indices_ptr[index])) { + if (result_indices_ptr[index] != ~index_msb_1_mask && + !sample_filter(query_id_offset + j, result_indices_ptr[index])) { result_indices_ptr[index] = utils::get_max_value(); result_distances_ptr[index] = utils::get_max_value(); } @@ -788,12 +790,15 @@ struct search : search_plan_impl { auto result_indices_ptr = result_indices.data() + (iter & 0x1) * result_buffer_size; auto result_distances_ptr = result_distances.data() + (iter & 0x1) * result_buffer_size; - // Remove parent bit in search results - remove_parent_bit( - num_queries, itopk_size, result_indices_ptr, result_buffer_allocation_size, stream); + if constexpr (!std::is_same::value) { + // Remove parent bit in search results + remove_parent_bit(num_queries, + result_buffer_size, + result_indices.data() + (iter & 0x1) * itopk_size, + result_buffer_allocation_size, + stream); - if (!std::is_same::value) { apply_filter( result_indices.data() + (iter & 0x1) * itopk_size, result_distances.data() + (iter & 0x1) * itopk_size, @@ -821,6 
+826,10 @@ struct search : search_plan_impl {
         true,
         topk_hint.data(),
         stream);
+    } else {
+      // Remove parent bit in search results
+      remove_parent_bit(
+        num_queries, itopk_size, result_indices_ptr, result_buffer_allocation_size, stream);
     }
 
     // Copy results from working buffer to final buffer
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
index a0f346ab51..147b8b753d 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
@@ -291,6 +291,14 @@ struct search_plan_impl : public search_plan_impl_base {
         "`hashmap_max_fill_rate` must be equal to or greater than 0.1 and smaller than 0.9. " +
         std::to_string(hashmap_max_fill_rate) + " has been given.";
     }
+    if constexpr (!std::is_same::value) {
+      if (hashmap_mode == hash_mode::SMALL) {
+        error_message += "`SMALL` hash is not available when filtering";
+      } else {
+        hashmap_mode = hash_mode::HASH;
+      }
+    }
     if (algo == search_algo::MULTI_CTA) {
       if (hashmap_mode == hash_mode::SMALL) {
         error_message += "`small_hash` is not available when 'search_mode' is \"multi-cta\"";
diff --git a/cpp/include/raft/neighbors/detail/nn_descent.cuh b/cpp/include/raft/neighbors/detail/nn_descent.cuh
index 3e4d0409bd..009ffd4684 100644
--- a/cpp/include/raft/neighbors/detail/nn_descent.cuh
+++ b/cpp/include/raft/neighbors/detail/nn_descent.cuh
@@ -1278,8 +1278,7 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out
     std::thread update_and_sample_thread(update_and_sample, it);
 
-    std::cout << "# GNND iteraton: " << it + 1 << "/" << build_config_.max_iterations << "\r";
-    std::fflush(stdout);
+    RAFT_LOG_DEBUG("# GNND iteration: %lu / %lu", it + 1, build_config_.max_iterations);
 
     // Reuse dists_buffer_ to save GPU memory. graph_buffer_ cannot be reused, because it
     // contains some information for local_join.
diff --git a/cpp/include/raft/neighbors/sample_filter.cuh b/cpp/include/raft/neighbors/sample_filter.cuh
new file mode 100644
index 0000000000..9182d72da9
--- /dev/null
+++ b/cpp/include/raft/neighbors/sample_filter.cuh
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include +#include + +#include + +namespace raft::neighbors::filtering { +/** + * @brief Filter an index with a bitset + * + * @tparam index_t Indexing type + */ +template +struct bitset_filter { + // View of the bitset to use as a filter + const raft::core::bitset_view bitset_view_; + + bitset_filter(const raft::core::bitset_view bitset_for_filtering) + : bitset_view_{bitset_for_filtering} + { + } + inline _RAFT_HOST_DEVICE bool operator()( + // query index + const uint32_t query_ix, + // the index of the current sample + const uint32_t sample_ix) const + { + return bitset_view_.test(sample_ix); + } +}; +} // namespace raft::neighbors::filtering diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu index 15c7b2b33a..8e3a9df01b 100644 --- a/cpp/test/core/math_device.cu +++ b/cpp/test/core/math_device.cu @@ -21,7 +21,9 @@ #include #include -#if _RAFT_HAS_CUDA +#include + +#ifdef _RAFT_HAS_CUDA #include #include #endif @@ -35,7 +37,7 @@ __global__ void math_eval_kernel(OutT* out, OpT op, Args... args) template auto math_eval(OpT op, Args&&... args) { - typedef decltype(op(args...)) OutT; + using OutT = cuda::std::invoke_result_t; auto stream = rmm::cuda_stream_default; rmm::device_scalar result(stream); math_eval_kernel<<<1, 1, 0, stream>>>(result.data(), op, std::forward(args)...); diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index b750372244..e6c3873063 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -525,6 +526,119 @@ class AnnCagraFilterTest : public ::testing::TestWithParam { } } + void testCagraRemoved() + { + size_t queries_size = ps.n_queries * ps.k; + std::vector indices_Cagra(queries_size); + std::vector indices_naive(queries_size); + std::vector distances_Cagra(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indices_naive_dev(queries_size, stream_); + auto* database_filtered_ptr = database.data() + test_cagra_sample_filter::offset * ps.dim; + naive_knn(handle_, + distances_naive_dev.data(), + indices_naive_dev.data(), + search_queries.data(), + database_filtered_ptr, + ps.n_queries, + ps.n_rows - test_cagra_sample_filter::offset, + ps.dim, + ps.k, + ps.metric); + raft::linalg::addScalar(indices_naive_dev.data(), + indices_naive_dev.data(), + IdxT(test_cagra_sample_filter::offset), + queries_size, + stream_); + update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); + update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + resource::sync_stream(handle_); + } + + { + rmm::device_uvector distances_dev(queries_size, stream_); + rmm::device_uvector indices_dev(queries_size, stream_); + + { + cagra::index_params index_params; + index_params.metric = ps.metric; // Note: currently ony the cagra::index_params metric is + // not used for knn_graph building. 
+ cagra::search_params search_params; + search_params.algo = ps.algo; + search_params.max_queries = ps.max_queries; + search_params.team_size = ps.team_size; + search_params.hashmap_mode = cagra::hash_mode::HASH; + + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.n_rows, ps.dim); + + cagra::index index(handle_); + if (ps.host_dataset) { + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + auto database_host_view = raft::make_host_matrix_view( + (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); + index = cagra::build(handle_, index_params, database_host_view); + } else { + index = cagra::build(handle_, index_params, database_view); + } + + if (!ps.include_serialized_dataset) { index.update_dataset(handle_, database_view); } + + auto search_queries_view = raft::make_device_matrix_view( + search_queries.data(), ps.n_queries, ps.dim); + auto indices_out_view = + raft::make_device_matrix_view(indices_dev.data(), ps.n_queries, ps.k); + auto dists_out_view = raft::make_device_matrix_view( + distances_dev.data(), ps.n_queries, ps.k); + auto removed_indices = + raft::make_device_vector(handle_, test_cagra_sample_filter::offset); + thrust::sequence( + resource::get_thrust_policy(handle_), + thrust::device_pointer_cast(removed_indices.data_handle()), + thrust::device_pointer_cast(removed_indices.data_handle() + removed_indices.extent(0))); + resource::sync_stream(handle_); + raft::core::bitset removed_indices_bitset( + handle_, removed_indices.view(), ps.n_rows); + cagra::search_with_filtering( + handle_, + search_params, + index, + search_queries_view, + indices_out_view, + dists_out_view, + raft::neighbors::filtering::bitset_filter(removed_indices_bitset.view())); + update_host(distances_Cagra.data(), distances_dev.data(), queries_size, stream_); + update_host(indices_Cagra.data(), indices_dev.data(), queries_size, stream_); + resource::sync_stream(handle_); + } + + double min_recall = ps.min_recall; + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_Cagra, + distances_naive, + distances_Cagra, + ps.n_queries, + ps.k, + 0.001, + min_recall)); + EXPECT_TRUE(eval_distances(handle_, + database.data(), + search_queries.data(), + indices_dev.data(), + distances_dev.data(), + ps.n_rows, + ps.dim, + ps.n_queries, + ps.k, + ps.metric, + 1.0e-4)); + } + } + void SetUp() override { database.resize(((size_t)ps.n_rows) * ps.dim, stream_); diff --git a/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu index 01d7e1e1ea..944c2cbc89 100644 --- a/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu +++ b/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu @@ -27,7 +27,11 @@ typedef AnnCagraSortTest AnnCagraSortTestF_U32; TEST_P(AnnCagraSortTestF_U32, AnnCagraSort) { this->testCagraSort(); } typedef AnnCagraFilterTest AnnCagraFilterTestF_U32; -TEST_P(AnnCagraFilterTestF_U32, AnnCagraFilter) { this->testCagraFilter(); } +TEST_P(AnnCagraFilterTestF_U32, AnnCagraFilter) +{ + this->testCagraFilter(); + this->testCagraRemoved(); +} INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestF_U32, ::testing::ValuesIn(inputs)); INSTANTIATE_TEST_CASE_P(AnnCagraSortTest, AnnCagraSortTestF_U32, ::testing::ValuesIn(inputs)); diff --git a/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu index ee06d369fa..3d9dc76953 100644 --- 
a/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu +++ b/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu @@ -25,7 +25,11 @@ TEST_P(AnnCagraTestI8_U32, AnnCagra) { this->testCagra(); } typedef AnnCagraSortTest AnnCagraSortTestI8_U32; TEST_P(AnnCagraSortTestI8_U32, AnnCagraSort) { this->testCagraSort(); } typedef AnnCagraFilterTest AnnCagraFilterTestI8_U32; -TEST_P(AnnCagraFilterTestI8_U32, AnnCagraFilter) { this->testCagraFilter(); } +TEST_P(AnnCagraFilterTestI8_U32, AnnCagraFilter) +{ + this->testCagraFilter(); + this->testCagraRemoved(); +} INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestI8_U32, ::testing::ValuesIn(inputs)); INSTANTIATE_TEST_CASE_P(AnnCagraSortTest, AnnCagraSortTestI8_U32, ::testing::ValuesIn(inputs)); diff --git a/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu index 3243e73ccd..c5b1b1704b 100644 --- a/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu +++ b/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu @@ -27,7 +27,11 @@ typedef AnnCagraSortTest AnnCagraSortTestU8_ TEST_P(AnnCagraSortTestU8_U32, AnnCagraSort) { this->testCagraSort(); } typedef AnnCagraFilterTest AnnCagraFilterTestU8_U32; -TEST_P(AnnCagraFilterTestU8_U32, AnnCagraSort) { this->testCagraFilter(); } +TEST_P(AnnCagraFilterTestU8_U32, AnnCagraSort) +{ + this->testCagraFilter(); + this->testCagraRemoved(); +} INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestU8_U32, ::testing::ValuesIn(inputs)); INSTANTIATE_TEST_CASE_P(AnnCagraSortTest, AnnCagraSortTestU8_U32, ::testing::ValuesIn(inputs)); diff --git a/dependencies.yaml b/dependencies.yaml index f1b74cfe49..fe4a4620e0 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -10,12 +10,15 @@ files: - build_pylibraft - cudatoolkit - develop + - checks + - build_wheels - test_libraft - docs - run_raft_dask - run_pylibraft - test_python_common - test_pylibraft + - cupy bench_ann: output: conda matrix: @@ -38,6 +41,7 @@ files: - py_version - test_python_common - test_pylibraft + - cupy checks: output: none includes: @@ -47,6 +51,7 @@ files: output: none includes: - test_pylibraft + - cupy - cudatoolkit - docs - py_version @@ -75,6 +80,7 @@ files: includes: - test_python_common - test_pylibraft + - cupy py_build_raft_dask: output: pyproject pyproject_dir: python/raft-dask @@ -145,11 +151,37 @@ dependencies: packages: - gcc_linux-aarch64=11.* - sysroot_linux-aarch64==2.17 + - output_types: conda + matrices: + - matrix: {cuda: "12.0"} + packages: [cuda-version=12.0, cuda-nvcc] + - matrix: {cuda: "11.8", arch: x86_64} + packages: [nvcc_linux-64=11.8] + - matrix: {cuda: "11.8", arch: aarch64} + packages: [nvcc_linux-aarch64=11.8] + - matrix: {cuda: "11.5", arch: x86_64} + packages: [nvcc_linux-64=11.5] + - matrix: {cuda: "11.5", arch: aarch64} + packages: [nvcc_linux-aarch64=11.5] + - matrix: {cuda: "11.4", arch: x86_64} + packages: [nvcc_linux-64=11.4] + - matrix: {cuda: "11.4", arch: aarch64} + packages: [nvcc_linux-aarch64=11.4] + - matrix: {cuda: "11.2", arch: x86_64} + packages: [nvcc_linux-64=11.2] + - matrix: {cuda: "11.2", arch: aarch64} + packages: [nvcc_linux-aarch64=11.2] + build_pylibraft: common: - - output_types: [conda, requirements, pyproject] + - output_types: [conda] packages: - - &rmm rmm==23.12.* + - &rmm_conda rmm==23.12.* + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for rmm-cu{11,12}. 
+ - --extra-index-url=https://pypi.nvidia.com specific: - output_types: [conda, requirements, pyproject] matrices: @@ -160,6 +192,20 @@ dependencies: - matrix: # All CUDA 11 versions packages: - &cuda_python11 cuda-python>=11.7.1,<12.0a0 + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &build_pylibraft_packages_cu12 + - &rmm_cu12 rmm-cu12==23.12.* + - {matrix: {cuda: "12.1"}, packages: *build_pylibraft_packages_cu12} + - {matrix: {cuda: "12.0"}, packages: *build_pylibraft_packages_cu12} + - matrix: {cuda: "11.8"} + packages: &build_pylibraft_packages_cu11 + - &rmm_cu11 rmm-cu11==23.12.* + - {matrix: {cuda: "11.5"}, packages: *build_pylibraft_packages_cu11} + - {matrix: {cuda: "11.4"}, packages: *build_pylibraft_packages_cu11} + - {matrix: {cuda: "11.2"}, packages: *build_pylibraft_packages_cu11} + - {matrix: null, packages: [*rmm_conda] } checks: common: - output_types: [conda, requirements] @@ -167,11 +213,9 @@ dependencies: - pre-commit develop: common: - - output_types: [conda, requirements] - packages: - - clang=16.0.6 - - output_types: [conda] + - output_types: conda packages: + - clang==16.0.6 - clang-tools=16.0.6 nn_bench: common: @@ -265,6 +309,45 @@ dependencies: - *libcusolver114 - *libcusparse_dev114 - *libcusparse114 + + cupy: + common: + - output_types: conda + packages: + - cupy>=12.0.0 + specific: + - output_types: [requirements, pyproject] + matrices: + # All CUDA 12 + x86_64 versions + - matrix: {cuda: "12.2", arch: x86_64} + packages: &cupy_packages_cu12_x86_64 + - &cupy_cu12_x86_64 cupy-cuda12x>=12.0.0 + - {matrix: {cuda: "12.1", arch: x86_64}, packages: *cupy_packages_cu12_x86_64} + - {matrix: {cuda: "12.0", arch: x86_64}, packages: *cupy_packages_cu12_x86_64} + # All CUDA 12 + aarch64 versions + - matrix: {cuda: "12.2", arch: aarch64} + packages: &cupy_packages_cu12_aarch64 + - &cupy_cu12_aarch64 cupy-cuda12x -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works. + - {matrix: {cuda: "12.1", arch: aarch64}, packages: *cupy_packages_cu12_aarch64} + - {matrix: {cuda: "12.0", arch: aarch64}, packages: *cupy_packages_cu12_aarch64} + + # All CUDA 11 + x86_64 versions + - matrix: {cuda: "11.8", arch: x86_64} + packages: &cupy_packages_cu11_x86_64 + - cupy-cuda11x>=12.0.0 + - {matrix: {cuda: "11.5", arch: x86_64}, packages: *cupy_packages_cu11_x86_64} + - {matrix: {cuda: "11.4", arch: x86_64}, packages: *cupy_packages_cu11_x86_64} + - {matrix: {cuda: "11.2", arch: x86_64}, packages: *cupy_packages_cu11_x86_64} + + # All CUDA 11 + aarch64 versions + - matrix: {cuda: "11.8", arch: aarch64} + packages: &cupy_packages_cu11_aarch64 + - cupy-cuda11x -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works. 
+ - {matrix: {cuda: "11.5", arch: aarch64}, packages: *cupy_packages_cu11_aarch64} + - {matrix: {cuda: "11.4", arch: aarch64}, packages: *cupy_packages_cu11_aarch64} + - {matrix: {cuda: "11.2", arch: aarch64}, packages: *cupy_packages_cu11_aarch64} + - {matrix: null, packages: [cupy-cuda11x>=12.0.0]} + test_libraft: common: - output_types: [conda] @@ -287,7 +370,7 @@ dependencies: - sphinx-markdown-tables build_wheels: common: - - output_types: pyproject + - output_types: [requirements, pyproject] packages: - wheel - setuptools @@ -311,7 +394,14 @@ dependencies: - output_types: [conda, pyproject] packages: - &numpy numpy>=1.21 - - *rmm + - output_types: [conda] + packages: + - *rmm_conda + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for cudf and rmm. + - --extra-index-url=https://pypi.nvidia.com specific: - output_types: [conda, requirements, pyproject] matrices: @@ -322,25 +412,62 @@ dependencies: - matrix: # All CUDA 11 versions packages: - *cuda_python11 + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &run_pylibraft_packages_cu12 + - *rmm_cu12 + - {matrix: {cuda: "12.1"}, packages: *run_pylibraft_packages_cu12} + - {matrix: {cuda: "12.0"}, packages: *run_pylibraft_packages_cu12} + - matrix: {cuda: "11.8"} + packages: &run_pylibraft_packages_cu11 + - *rmm_cu11 + - {matrix: {cuda: "11.5"}, packages: *run_pylibraft_packages_cu11} + - {matrix: {cuda: "11.4"}, packages: *run_pylibraft_packages_cu11} + - {matrix: {cuda: "11.2"}, packages: *run_pylibraft_packages_cu11} + - {matrix: null, packages: [*rmm_conda]} run_raft_dask: common: - output_types: [conda, pyproject] packages: - - dask>=2023.7.1 + - dask==2023.9.2 - dask-cuda==23.12.* - - distributed>=2023.7.1 + - distributed==2023.9.2 - joblib>=0.11 - numba>=0.57 - *numpy - ucx-py==0.35.* - output_types: conda packages: - - dask-core>=2023.7.1 + - dask-core==2023.9.2 - ucx>=1.13.0 - ucx-proc=*=gpu + - &ucx_py_conda ucx-py==0.35.* - output_types: pyproject packages: - - pylibraft==23.12.* + - &pylibraft_conda pylibraft==23.12.* + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for cudf and rmm. 
+ - --extra-index-url=https://pypi.nvidia.com + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &run_raft_dask_packages_cu12 + - &pylibraft_cu12 pylibraft-cu12==23.12.* + - &ucx_py_cu12 ucx-py-cu12==0.35.* + - {matrix: {cuda: "12.1"}, packages: *run_raft_dask_packages_cu12} + - {matrix: {cuda: "12.0"}, packages: *run_raft_dask_packages_cu12} + - matrix: {cuda: "11.8"} + packages: &run_raft_dask_packages_cu11 + - &pylibraft_cu11 pylibraft-cu11==23.12.* + - &ucx_py_cu11 ucx-py-cu11==0.35.* + - {matrix: {cuda: "11.5"}, packages: *run_raft_dask_packages_cu11} + - {matrix: {cuda: "11.4"}, packages: *run_raft_dask_packages_cu11} + - {matrix: {cuda: "11.2"}, packages: *run_raft_dask_packages_cu11} + - {matrix: null, packages: [*pylibraft_conda, *ucx_py_conda]} test_python_common: common: - output_types: [conda, requirements, pyproject] @@ -353,9 +480,3 @@ dependencies: packages: - scikit-learn - scipy - - output_types: conda - packages: - - cupy>=12.0.0 - - output_types: pyproject - packages: - - cupy-cuda11x>=12.0.0 diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index 53bb12c81c..d29997b4a3 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -17,6 +17,8 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) set(raft_dask_version 23.12.00) include(../../fetch_rapids.cmake) +include(rapids-cuda) +rapids_cuda_init_architectures(raft-dask-python) project( raft-dask-python @@ -25,7 +27,7 @@ project( # language to be enabled here. The test project that is built in scikit-build to verify # various linking options for the python library is hardcoded to build with C, so until # that is fixed we need to keep C. - C CXX + C CXX CUDA ) option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files" @@ -42,14 +44,6 @@ else() endif() if(NOT raft_FOUND) - # TODO: This will not be necessary once we upgrade to CMake 3.22, which will pull in the required - # languages for the C++ project even if this project does not require those languages. - include(rapids-cuda) - rapids_cuda_init_architectures(raft-dask) - enable_language(CUDA) - # Since raft-dask only enables CUDA optionally we need to manually include the file that - # rapids_cuda_init_architectures relies on `project` including. - include("${CMAKE_PROJECT_raft-dask_INCLUDE}") find_package(ucx REQUIRED) # raft-dask doesn't actually use raft libraries, it just needs the headers, so we can turn off all diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 5c616806a2..1619edbbbf 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -35,8 +35,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "dask-cuda==23.12.*", - "dask>=2023.7.1", - "distributed>=2023.7.1", + "dask==2023.9.2", + "distributed==2023.9.2", "joblib>=0.11", "numba>=0.57", "numpy>=1.21",