diff --git a/.clang-format b/.clang-format index 21fd8c447ad..3cd3f6da331 100644 --- a/.clang-format +++ b/.clang-format @@ -117,12 +117,15 @@ IncludeCategories: - Regex: '^$' - Priority: 5 - SortPriority: 4 + Priority: 6 + SortPriority: 5 - Regex: '^ ### Prerequisites +- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) - [Docker](https://docs.docker.com/desktop/install/linux-install/) ### Steps diff --git a/.devcontainer/cuda11.1-gcc6/devcontainer.json b/.devcontainer/cuda11.1-gcc6/devcontainer.json index 2114e5fd8fb..ed345016ec1 100644 --- a/.devcontainer/cuda11.1-gcc6/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc6/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc6-cuda11.1", + "image": "rapidsai/devcontainers:24.10-cpp-gcc6-cuda11.1", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda11.1-gcc7/devcontainer.json b/.devcontainer/cuda11.1-gcc7/devcontainer.json index 7a9a07355fc..b1ff078547b 100644 --- a/.devcontainer/cuda11.1-gcc7/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc7/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc7-cuda11.1", + "image": "rapidsai/devcontainers:24.10-cpp-gcc7-cuda11.1", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda11.1-gcc8/devcontainer.json b/.devcontainer/cuda11.1-gcc8/devcontainer.json index 50c1bdca6b2..f480d0003a3 100644 --- a/.devcontainer/cuda11.1-gcc8/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc8/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc8-cuda11.1", + "image": "rapidsai/devcontainers:24.10-cpp-gcc8-cuda11.1", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda11.1-gcc9/devcontainer.json b/.devcontainer/cuda11.1-gcc9/devcontainer.json index f069ed0a116..a622e145191 100644 --- a/.devcontainer/cuda11.1-gcc9/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc9-cuda11.1", + "image": "rapidsai/devcontainers:24.10-cpp-gcc9-cuda11.1", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda11.1-llvm9/devcontainer.json b/.devcontainer/cuda11.1-llvm9/devcontainer.json index 0b95a93677f..3eaa29a8b88 100644 --- a/.devcontainer/cuda11.1-llvm9/devcontainer.json +++ b/.devcontainer/cuda11.1-llvm9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm9-cuda11.1", + "image": "rapidsai/devcontainers:24.10-cpp-llvm9-cuda11.1", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json index 20b430c4c16..4d03dc2de06 100644 --- a/.devcontainer/cuda11.8-gcc11/devcontainer.json +++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc11-cuda11.8", + "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda11.8", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-gcc10/devcontainer.json b/.devcontainer/cuda12.0-gcc10/devcontainer.json index 659f5a0320c..1371a181a9d 100644 --- a/.devcontainer/cuda12.0-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc10/devcontainer.json @@ -1,6 +1,6 @@ { 
"shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc10-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-gcc10-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-gcc11/devcontainer.json b/.devcontainer/cuda12.0-gcc11/devcontainer.json index 62a89b837dc..2096821c111 100644 --- a/.devcontainer/cuda12.0-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc11-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-gcc12/devcontainer.json b/.devcontainer/cuda12.0-gcc12/devcontainer.json index 1eb084299de..e99c8debae8 100644 --- a/.devcontainer/cuda12.0-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc12-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-gcc9/devcontainer.json b/.devcontainer/cuda12.0-gcc9/devcontainer.json index daa1ba6a92f..31548082329 100644 --- a/.devcontainer/cuda12.0-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc9-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-gcc9-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-llvm10/devcontainer.json b/.devcontainer/cuda12.0-llvm10/devcontainer.json index 8bb371e0137..b4bf89b341a 100644 --- a/.devcontainer/cuda12.0-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm10/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm10-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm10-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-llvm11/devcontainer.json b/.devcontainer/cuda12.0-llvm11/devcontainer.json index ff1f07c59b2..b87d457cb73 100644 --- a/.devcontainer/cuda12.0-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm11-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm11-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-llvm12/devcontainer.json b/.devcontainer/cuda12.0-llvm12/devcontainer.json index 3053ac9c8b9..829ec1cb2e7 100644 --- a/.devcontainer/cuda12.0-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm12-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm12-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-llvm13/devcontainer.json b/.devcontainer/cuda12.0-llvm13/devcontainer.json index 0e736940583..60abc033bef 100644 --- a/.devcontainer/cuda12.0-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm13/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm13-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm13-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git 
a/.devcontainer/cuda12.0-llvm14/devcontainer.json b/.devcontainer/cuda12.0-llvm14/devcontainer.json index 63a6eff1708..a48b0bcd0cc 100644 --- a/.devcontainer/cuda12.0-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm14/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm14-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm14-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-llvm9/devcontainer.json b/.devcontainer/cuda12.0-llvm9/devcontainer.json index f4eb0a86f58..465478e431d 100644 --- a/.devcontainer/cuda12.0-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm9-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm9-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.2-rapids-conda b/.devcontainer/cuda12.2-rapids-conda deleted file mode 120000 index 74d19d8f3fc..00000000000 --- a/.devcontainer/cuda12.2-rapids-conda +++ /dev/null @@ -1 +0,0 @@ -../ci/rapids/cuda12.2-conda \ No newline at end of file diff --git a/.devcontainer/cuda12.5-gcc10/devcontainer.json b/.devcontainer/cuda12.5-gcc10/devcontainer.json index 70a20f9cb90..5a59153bf39 100644 --- a/.devcontainer/cuda12.5-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc10/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc10-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc10-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc11/devcontainer.json b/.devcontainer/cuda12.5-gcc11/devcontainer.json index 43f561a41c0..42b668abf18 100644 --- a/.devcontainer/cuda12.5-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc11-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc12/devcontainer.json b/.devcontainer/cuda12.5-gcc12/devcontainer.json index 85ff2f26c67..d807d4cd30e 100644 --- a/.devcontainer/cuda12.5-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc12-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc13/devcontainer.json b/.devcontainer/cuda12.5-gcc13/devcontainer.json index c1b2aac5185..01364fdbc23 100644 --- a/.devcontainer/cuda12.5-gcc13/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc13/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc7/devcontainer.json b/.devcontainer/cuda12.5-gcc7/devcontainer.json index ed1c7c6ea90..a6327695055 100644 --- a/.devcontainer/cuda12.5-gcc7/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc7/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc7-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc7-cuda12.5", "hostRequirements": { 
"gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc8/devcontainer.json b/.devcontainer/cuda12.5-gcc8/devcontainer.json index 5496286eb64..f0aff7ba7b1 100644 --- a/.devcontainer/cuda12.5-gcc8/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc8/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc8-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc8-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc9/devcontainer.json b/.devcontainer/cuda12.5-gcc9/devcontainer.json index 84c77a2fc9b..e050d233038 100644 --- a/.devcontainer/cuda12.5-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc9-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc9-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm10/devcontainer.json b/.devcontainer/cuda12.5-llvm10/devcontainer.json index c1f68f9c1c7..0cda7b0a667 100644 --- a/.devcontainer/cuda12.5-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm10/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm10-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm10-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm11/devcontainer.json b/.devcontainer/cuda12.5-llvm11/devcontainer.json index f7a9e773152..1a513873f1d 100644 --- a/.devcontainer/cuda12.5-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm11-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm11-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm12/devcontainer.json b/.devcontainer/cuda12.5-llvm12/devcontainer.json index 4f1d2043747..a11a351e30f 100644 --- a/.devcontainer/cuda12.5-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm12-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm12-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm13/devcontainer.json b/.devcontainer/cuda12.5-llvm13/devcontainer.json index 8904b179715..0136655f0c0 100644 --- a/.devcontainer/cuda12.5-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm13/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm13-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm13-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm14/devcontainer.json b/.devcontainer/cuda12.5-llvm14/devcontainer.json index 0a01a2261c7..dd9d6a62f04 100644 --- a/.devcontainer/cuda12.5-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm14/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm14-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm14-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm15/devcontainer.json b/.devcontainer/cuda12.5-llvm15/devcontainer.json index bb7378358cc..51fd6a14660 100644 --- a/.devcontainer/cuda12.5-llvm15/devcontainer.json 
+++ b/.devcontainer/cuda12.5-llvm15/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm15-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm15-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm16/devcontainer.json b/.devcontainer/cuda12.5-llvm16/devcontainer.json index 12bf9b07d72..882025ddaf2 100644 --- a/.devcontainer/cuda12.5-llvm16/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm16/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm16-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm16-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm17/devcontainer.json b/.devcontainer/cuda12.5-llvm17/devcontainer.json index 6abd7c8292d..55fa86ff532 100644 --- a/.devcontainer/cuda12.5-llvm17/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm17/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm17-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm17-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm9/devcontainer.json b/.devcontainer/cuda12.5-llvm9/devcontainer.json index e8164f36eab..3b2a328c2ea 100644 --- a/.devcontainer/cuda12.5-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm9-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm9-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json index 18b8a5c317d..5e4b04e19b4 100644 --- a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json +++ b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-oneapi2023.2.0-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-oneapi2023.2.0-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-rapids-conda b/.devcontainer/cuda12.5-rapids-conda new file mode 120000 index 00000000000..f187cbd1366 --- /dev/null +++ b/.devcontainer/cuda12.5-rapids-conda @@ -0,0 +1 @@ +../ci/rapids/cuda12.5-conda \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index c1b2aac5185..01364fdbc23 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.github/actions/workflow-build/action.yml b/.github/actions/workflow-build/action.yml index 57a94989523..3842886589a 100644 --- a/.github/actions/workflow-build/action.yml +++ b/.github/actions/workflow-build/action.yml @@ -25,6 +25,15 @@ inputs: description: "Path to the matrix parser script (default if blank: build-workflow.py from action dir)" default: "" required: false + slack_token: + description: "The Slack token to use for notifications. No notifications will be sent if not provided." + required: false + slack_log: + description: "Slack channel ID for verbose notifications." 
+ required: false + slack_alert: + description: "Slack channel ID for alert notifications." + required: false outputs: workflow: @@ -35,6 +44,20 @@ runs: using: "composite" steps: + - name: Send Slack log notification + if: ${{inputs.slack_token != '' && inputs.slack_log != '' }} + uses: slackapi/slack-github-action@v1.26.0 + env: + SLACK_BOT_TOKEN: ${{ inputs.slack_token }} + WORKFLOW_TYPE: ${{ github.workflow }} # nightly, weekly, pr, etc. + SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + with: + channel-id: ${{ inputs.slack_log }} + slack-message: | + Workflow '${{ env.WORKFLOW_TYPE }}' starting... + + Details: ${{ env.SUMMARY_URL }} + - name: Inspect changes if: ${{ inputs.inspect_changes_script != '' && inputs.inspect_changes_base_sha != '' }} id: inspect-changes @@ -99,3 +122,18 @@ runs: name: workflow path: workflow/ compression-level: 0 + + - name: Send Slack error notification + if: ${{ failure() && inputs.slack_token != '' && (inputs.slack_alert != '' || inputs.slack_log != '') }} + uses: slackapi/slack-github-action@v1.26.0 + env: + SLACK_BOT_TOKEN: ${{ inputs.slack_token }} + WORKFLOW_TYPE: ${{ github.workflow }} # nightly, weekly, pr, etc. + SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + CHANNEL_SEP: ${{ (inputs.slack_log != '' && inputs.slack_alert != '') && ',' || ''}} + with: + channel-id: '${{ inputs.slack_log }}${{env.CHANNEL_SEP}}${{ inputs.slack_alert }}' + slack-message: | + Workflow '${{ env.WORKFLOW_TYPE }}' encountered an error while preparing to run. + + Details: ${{ env.SUMMARY_URL }} diff --git a/.github/actions/workflow-results/action.yml b/.github/actions/workflow-results/action.yml index 3f5bc92afad..f14d6d496e8 100644 --- a/.github/actions/workflow-results/action.yml +++ b/.github/actions/workflow-results/action.yml @@ -8,11 +8,20 @@ inputs: pr_number: description: "The PR number to comment on, if applicable. No comment will be made if not provided." required: false + slack_token: + description: "The Slack token to use for notifications. No notifications will be sent if not provided." + required: false + slack_log: + description: "Slack channel ID for verbose notifications." + required: false + slack_alert: + description: "Slack channel ID for alert notifications." + required: false outputs: success: description: "Whether any jobs failed." - value: ${{ steps.check-dispatch.outputs.success }} + value: ${{ steps.check-success.outputs.success }} runs: using: "composite" @@ -112,6 +121,10 @@ runs: printf "SUMMARY=%s\n" "$(cat final_summary.md | url_encode_string)" | tee -a "${GITHUB_OUTPUT}" echo "::endgroup::" + echo "::group::GHA Output: EXEC_SUMMARY" + printf "EXEC_SUMMARY=%s\n" "$(cat execution/heading.txt)" | tee -a "${GITHUB_OUTPUT}" + echo "::endgroup::" + cp final_summary.md ${GITHUB_STEP_SUMMARY} - name: Comment on PR @@ -140,7 +153,7 @@ runs: }); - name: Check for job success - id: check-dispatch + id: check-success shell: bash --noprofile --norc -euo pipefail {0} run: | echo "::group::Checking for success artifacts" @@ -162,3 +175,38 @@ runs: fi echo "success=true" >> "${GITHUB_OUTPUT}" + + - name: Send Slack log notification + if: ${{ always() && inputs.slack_token != '' && inputs.slack_log != '' }} + uses: slackapi/slack-github-action@v1.26.0 + env: + SLACK_BOT_TOKEN: ${{ inputs.slack_token }} + WORKFLOW_TYPE: ${{ github.workflow }} # nightly, weekly, pr, etc. 
+ STATUS: ${{ steps.check-success.outcome }} + EXEC_SUMMARY: ${{ steps.final-summary.outputs.EXEC_SUMMARY }} + SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + with: + channel-id: ${{ inputs.slack_log }} + slack-message: | + Workflow '${{ env.WORKFLOW_TYPE }}' has finished with status `${{ env.STATUS }}`: + + ${{ env.EXEC_SUMMARY }} + + Details: ${{ env.SUMMARY_URL }} + + - name: Send Slack alert notification + if: ${{ failure() && inputs.slack_token != '' && inputs.slack_alert != '' }} + uses: slackapi/slack-github-action@v1.26.0 + env: + SLACK_BOT_TOKEN: ${{ inputs.slack_token }} + WORKFLOW_TYPE: ${{ github.workflow }} # nightly, weekly, pr, etc. + EXEC_SUMMARY: ${{ steps.final-summary.outputs.EXEC_SUMMARY }} + SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + with: + channel-id: ${{ inputs.slack_alert }} + slack-message: | + Workflow '${{ env.WORKFLOW_TYPE }}' has failed: + + ${{ env.EXEC_SUMMARY }} + + Details: ${{ env.SUMMARY_URL }} diff --git a/.github/actions/workflow-results/parse-job-times.py b/.github/actions/workflow-results/parse-job-times.py index b90bd227a30..b30d585a0a6 100755 --- a/.github/actions/workflow-results/parse-job-times.py +++ b/.github/actions/workflow-results/parse-job-times.py @@ -120,7 +120,7 @@ def main(): for id, stats in result.items(): job_seconds = stats['job_seconds'] command_seconds = stats['command_seconds'] - overhead = (job_seconds - command_seconds) * 100 / command_seconds + overhead = (job_seconds - command_seconds) * 100 / command_seconds if command_seconds > 0 else 100 print(f"{stats['job_duration']:10} {stats['command_duration']:10} {overhead:10.0f} {stats['name']}") print("::endgroup::") diff --git a/.github/workflows/build-rapids.yml b/.github/workflows/build-rapids.yml index fe4a9697be8..2d0cfa6f761 100644 --- a/.github/workflows/build-rapids.yml +++ b/.github/workflows/build-rapids.yml @@ -2,6 +2,12 @@ name: Build all RAPIDS repositories on: workflow_call: + inputs: + enable_slack_alerts: + description: "If true, a message will be posted to the CCCL GHA CI Alert channel if the workflow fails." 
+ required: false + default: false + type: boolean jobs: check-event: @@ -16,6 +22,7 @@ jobs: run: | [[ '${{ github.event_name }}' == 'push' && '${{ github.repository }}' == 'NVIDIA/cccl' ]] || \ [[ '${{ github.event_name }}' == 'schedule' && '${{ github.repository }}' == 'NVIDIA/cccl' ]] || \ + [[ '${{ github.event_name }}' == 'workflow_dispatch' && '${{ github.repository }}' == 'NVIDIA/cccl' ]] || \ [[ '${{ github.event_name }}' == 'pull_request' && '${{ github.repository }}' != 'NVIDIA/cccl' ]] \ && echo "ok=true" | tee -a $GITHUB_OUTPUT \ || echo "ok=false" | tee -a $GITHUB_OUTPUT; @@ -29,10 +36,10 @@ jobs: fail-fast: false matrix: include: - - { cuda: '12.2', libs: 'rmm KvikIO cudf cudf_kafka cuspatial', } - - { cuda: '12.2', libs: 'rmm ucxx raft cuvs', } - - { cuda: '12.2', libs: 'rmm ucxx raft cumlprims_mg cuml', } - - { cuda: '12.2', libs: 'rmm ucxx raft cugraph-ops wholegraph cugraph' } + - { cuda: '12.5', libs: 'rmm KvikIO cudf cudf_kafka cuspatial', } + - { cuda: '12.5', libs: 'rmm ucxx raft cuvs', } + - { cuda: '12.5', libs: 'rmm ucxx raft cumlprims_mg cuml', } + - { cuda: '12.5', libs: 'rmm ucxx raft cugraph-ops wholegraph cugraph' } permissions: id-token: write contents: read @@ -54,20 +61,20 @@ jobs: CI: true RAPIDS_LIBS: ${{ matrix.libs }} # Uncomment any of these to customize the git repo and branch for a RAPIDS lib: - # RAPIDS_cmake_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cudf_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cudf_kafka_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cugraph_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cugraph_ops_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cuml_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cumlprims_mg_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cuspatial_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cuvs_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_KvikIO_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_raft_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_rmm_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_ucxx_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-0.39"}' - # RAPIDS_wholegraph_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' + # RAPIDS_cmake_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cudf_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cudf_kafka_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cugraph_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cugraph_ops_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cuml_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cumlprims_mg_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cuspatial_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cuvs_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_KvikIO_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_raft_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_rmm_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_ucxx_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-0.40"}' + # RAPIDS_wholegraph_GIT_REPO: '{"upstream": "rapidsai", "tag": 
"branch-24.10"}' run: | cat <<"EOF" > "$RUNNER_TEMP/ci-entrypoint.sh" #! /usr/bin/env bash @@ -156,3 +163,22 @@ jobs: --volume "$RUNNER_TEMP/ci.sh:/ci.sh" \ --volume "$RUNNER_TEMP/ci-entrypoint.sh:/ci-entrypoint.sh" \ -- /ci-entrypoint.sh ./ci/rapids/rapids-entrypoint.sh /ci.sh + + notify-failure: + name: Notify Slack of RAPIDS failure + if: ${{ failure() && inputs.enable_slack_alerts }} + needs: build-rapids + runs-on: ubuntu-latest + steps: + - name: Notify + uses: slackapi/slack-github-action@v1.26.0 + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_NOTIFIER_BOT_TOKEN }} + WORKFLOW_TYPE: ${{ github.workflow }} + SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + with: + channel-id: ${{ secrets.SLACK_CHANNEL_CI_ALERT }} + slack-message: | + RAPIDS build in workflow '${{ env.WORKFLOW_TYPE }}' failed. + + Details: ${{ env.SUMMARY_URL }} diff --git a/.github/workflows/ci-workflow-nightly.yml b/.github/workflows/ci-workflow-nightly.yml index c7406ef817e..fdf281b8063 100644 --- a/.github/workflows/ci-workflow-nightly.yml +++ b/.github/workflows/ci-workflow-nightly.yml @@ -21,6 +21,7 @@ defaults: shell: bash --noprofile --norc -euo pipefail {0} on: + workflow_dispatch: schedule: - cron: '0 7 * * *' # 7AM UTC, 12AM PST, 3AM EST @@ -46,6 +47,9 @@ jobs: uses: ./.github/actions/workflow-build with: workflows: nightly + slack_token: ${{ secrets.SLACK_NOTIFIER_BOT_TOKEN }} + slack_log: ${{ secrets.SLACK_CHANNEL_CI_LOG }} + slack_alert: ${{ secrets.SLACK_CHANNEL_CI_ALERT }} dispatch-groups-linux-two-stage: name: ${{ matrix.name }} @@ -128,6 +132,12 @@ jobs: - name: Check workflow success id: check-workflow uses: ./.github/actions/workflow-results + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + slack_token: ${{ secrets.SLACK_NOTIFIER_BOT_TOKEN }} + slack_log: ${{ secrets.SLACK_CHANNEL_CI_LOG }} + slack_alert: ${{ secrets.SLACK_CHANNEL_CI_ALERT }} + build-rapids: name: Build RAPIDS @@ -139,38 +149,5 @@ jobs: contents: read pull-requests: read uses: ./.github/workflows/build-rapids.yml - - # Check all other job statuses. This job gates branch protection checks. - ci: - name: CI - # !! Important: This job is used for branch protection checks. - # !! Need to use always() instead of !cancelled() because skipped jobs count as success - # !! for Github branch protection checks. Yes, really: by default, branch protections - # !! can be bypassed by cancelling CI. See NVIDIA/cccl#605. - if: ${{ always() }} - needs: - - verify-workflow - runs-on: ubuntu-latest - steps: - - name: Check results - run: | - status="success" - - check_result() { - name=$1 - expected=$2 - result=$3 - - echo "Checking if $name job result ('$result') is '$expected'..." 
- if [[ "$result" != "$expected" ]]; then - echo "$name job failed" - - status="failed" - fi - } - - check_result "verify-workflow" "success" "${{needs.verify-workflow.result}}" - - if [[ "$status" != "success" ]]; then - exit 1 - fi + with: + enable_slack_alerts: true diff --git a/benchmarks/scripts/analyze.py b/benchmarks/scripts/analyze.py index 248388b6a0b..8006637c462 100755 --- a/benchmarks/scripts/analyze.py +++ b/benchmarks/scripts/analyze.py @@ -273,7 +273,8 @@ def case_top(alpha, N, algname, ct_point_name, case_dfs): for subbench in case_dfs: case_dfs[subbench] = extract_complete_variants(case_dfs[subbench]) - print(extract_scores(case_dfs).head(N)) + with pd.option_context('display.max_rows', None): + print(extract_scores(case_dfs).head(N)) def top(args): diff --git a/ci/matrix.yaml b/ci/matrix.yaml index a7031beff97..fa2f5d92183 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -66,13 +66,13 @@ workflows: - {jobs: ['build'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14], project: ['libcudacxx']} - {jobs: ['build'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all', project: ['libcudacxx']} - {jobs: ['build'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11], project: ['libcudacxx']} - - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20], project: ['libcudacxx']} - - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang16', std: [17], project: ['libcudacxx']} + # H100 runners are currently flakey, only build since those use CPU-only runners: + - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} + - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang16', std: [17]} + - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14], project: ['cub', 'thrust']} - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all', project: ['cub', 'thrust']} - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11], project: ['cub', 'thrust']} - - {jobs: ['test'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20], project: ['cub', 'thrust']} - - {jobs: ['test'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang16', std: [17], project: ['cub', 'thrust']} # - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14] } # - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all' } # - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11] } @@ -99,7 +99,7 @@ workflows: # The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers -devcontainer_version: '24.08' +devcontainer_version: '24.10' # All supported C++ standards: all_stds: [11, 14, 17, 20] diff --git a/ci/rapids/cuda12.2-conda/devcontainer.json b/ci/rapids/cuda12.5-conda/devcontainer.json similarity index 97% rename from ci/rapids/cuda12.2-conda/devcontainer.json rename to ci/rapids/cuda12.5-conda/devcontainer.json index 9fb4d3a9086..8ec1a35d43a 100644 --- a/ci/rapids/cuda12.2-conda/devcontainer.json +++ b/ci/rapids/cuda12.5-conda/devcontainer.json @@ -1,13 +1,13 @@ { - "image": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04", + "image": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04", "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-rapids-24.08-cuda12.2-conda" + "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-rapids-24.10-cuda12.5-conda" ], 
"hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" @@ -15,7 +15,7 @@ "containerEnv": { "CI": "${localEnv:CI}", "CUDAARCHS": "70-real", - "CUDA_VERSION": "12.2", + "CUDA_VERSION": "12.5", "DEFAULT_CONDA_ENV": "rapids", "PYTHONSAFEPATH": "1", "PYTHONUNBUFFERED": "1", diff --git a/ci/update_rapids_version.sh b/ci/update_rapids_version.sh new file mode 100755 index 00000000000..d1300d9e411 --- /dev/null +++ b/ci/update_rapids_version.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. +########################## +# RAPIDS Version Updater # +########################## + +## Usage +# bash update_rapids_version.sh + +# Format is YY.MM.PP - no leading 'v' or trailing 'a' +NEXT_FULL_TAG=$1 + +#Get . for next version +NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') +NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') +NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}') +NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} +NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" + +# Need to distutils-normalize the versions for some use cases +NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") + +echo "Updating RAPIDS and devcontainers to $NEXT_FULL_TAG" + +# Inplace sed replace; workaround for Linux and Mac +function sed_runner() { + sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak +} + +# Update CI files +sed_runner "/devcontainer_version/ s/'[0-9.]*'/'${NEXT_SHORT_TAG}'/g" ci/matrix.yaml +for FILE in .github/workflows/*.yml; do + sed_runner "/rapidsai/ s/\"branch-.*\"/\"branch-${NEXT_SHORT_TAG}\"/g" "${FILE}" + sed_runner "/ucxx/ s/\"branch-.*\"/\"branch-${NEXT_UCXX_SHORT_TAG}\"/g" "${FILE}" +done + +function update_devcontainer() { + sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${1}" + sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${1}" + sed_runner "s@\${localWorkspaceFolderBasename}-rapids-[0-9.]*@\${localWorkspaceFolderBasename}-rapids-${NEXT_SHORT_TAG}@g" "${1}" +} + +# Update .devcontainer files +find .devcontainer/ ci/rapids/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do + update_devcontainer "${filename}" +done diff --git a/cub/CMakeLists.txt b/cub/CMakeLists.txt index 6e66813b7bb..91068802d63 100644 --- a/cub/CMakeLists.txt +++ b/cub/CMakeLists.txt @@ -30,7 +30,7 @@ endif() # Support adding CUB to a parent project via add_subdirectory. # See examples/cmake/add_subdir/CMakeLists.txt for details. -if (NOT CUB_TOPLEVEL_PROJECT AND NOT CUB_IN_THRUST) +if (NOT CUB_TOPLEVEL_PROJECT) include(cmake/CubAddSubdir.cmake) return() endif() @@ -51,12 +51,10 @@ mark_as_advanced(CUB_ENABLE_CPP_DIALECT_IN_NAMES) # This option is only used when CUB is built stand-alone; otherwise the Thrust # option has the same effect. -if (NOT CUB_IN_THRUST) - option(CUB_IGNORE_DEPRECATED_API - "Suppress warnings about deprecated Thrust/CUB API." - OFF - ) -endif() +option(CUB_IGNORE_DEPRECATED_API + "Suppress warnings about deprecated Thrust/CUB API." 
+ OFF +) # Check if we're actually building anything before continuing. If not, no need # to search for deps, etc. This is a common approach for packagers that just diff --git a/cub/CONTRIBUTING.md b/cub/CONTRIBUTING.md index 0b6813ea78f..4002779dcdc 100644 --- a/cub/CONTRIBUTING.md +++ b/cub/CONTRIBUTING.md @@ -17,7 +17,7 @@ changes. CUB's tests and examples can be built by configuring Thrust with the CMake option `THRUST_INCLUDE_CUB_CMAKE=ON`. This process is described in more detail in Thrust's -[CONTRIBUTING.md](https://nvidia.github.io/thrust/contributing.html). +[CONTRIBUTING.md](https://nvidia.github.io/cccl/thrust/contributing.html). The CMake options in the following section may be used to customize CUB's build process. Note that some of these are controlled by Thrust for compatibility and @@ -63,8 +63,3 @@ The configuration options for CUB are: - Enable separable compilation on all targets that are agnostic of RDC. - Targets that explicitly require RDC to be enabled or disabled will ignore this setting. - Default is `OFF`. - -# Development Model - -CUB follows the same development model as Thrust, described -[here](https://nvidia.github.io/thrust/releases/versioning.html). diff --git a/cub/cmake/CubBuildTargetList.cmake b/cub/cmake/CubBuildTargetList.cmake index 426eee5e269..f01c6244f4e 100644 --- a/cub/cmake/CubBuildTargetList.cmake +++ b/cub/cmake/CubBuildTargetList.cmake @@ -132,30 +132,15 @@ function(cub_build_target_list) # Handle dialect options: set(num_dialects_enabled 0) foreach (dialect IN LISTS CUB_CPP_DIALECT_OPTIONS) - if (CUB_IN_THRUST) - # Just use Thrust's settings: - if (THRUST_ENABLE_MULTICONFIG) - set(CUB_ENABLE_DIALECT_CPP${dialect} - ${THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect}} - ) - else() - set(val OFF) - if (dialect EQUAL ${THRUST_CPP_DIALECT}) - set(val ON) - endif() - set(CUB_ENABLE_DIALECT_CPP${dialect} ${val}) - endif() - else() - # Create CMake options: - set(default_value OFF) - if (dialect EQUAL 14) # Default to just 14 on: - set(default_value ON) - endif() - option(CUB_ENABLE_DIALECT_CPP${dialect} - "Generate C++${dialect} build configurations." - ${default_value} - ) + # Create CMake options: + set(default_value OFF) + if (dialect EQUAL 14) # Default to just 14 on: + set(default_value ON) endif() + option(CUB_ENABLE_DIALECT_CPP${dialect} + "Generate C++${dialect} build configurations." + ${default_value} + ) if (CUB_ENABLE_DIALECT_CPP${dialect}) math(EXPR num_dialects_enabled "${num_dialects_enabled} + 1") @@ -188,14 +173,8 @@ function(cub_build_target_list) # Generic config flags: macro(add_flag_option flag docstring default) set(cub_opt "CUB_${flag}") - if (CUB_IN_THRUST) - set(thrust_opt "THRUST_${flag}") - # Use thrust's settings: - set(${cub_opt} ${${thrust_opt}}) - else() - option(${cub_opt} "${docstring}" "${default}") - mark_as_advanced(${cub_opt}) - endif() + option(${cub_opt} "${docstring}" "${default}") + mark_as_advanced(${cub_opt}) endmacro() add_flag_option(IGNORE_DEPRECATED_CPP_DIALECT "Don't warn about any deprecated C++ standards and compilers." OFF) add_flag_option(IGNORE_DEPRECATED_CPP_11 "Don't warn about deprecated C++11." 
OFF) diff --git a/cub/cmake/CubHeaderTesting.cmake b/cub/cmake/CubHeaderTesting.cmake index 7cead875c08..f0ca17186ce 100644 --- a/cub/cmake/CubHeaderTesting.cmake +++ b/cub/cmake/CubHeaderTesting.cmake @@ -31,10 +31,6 @@ function(cub_add_header_test label definitions) cub_clone_target_properties(${headertest_target} ${cub_target}) cub_configure_cuda_target(${headertest_target} RDC ${CUB_FORCE_RDC}) - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${headertest_target}) - endif() - add_dependencies(cub.all.headers ${headertest_target}) add_dependencies(${config_prefix}.all ${headertest_target}) endforeach() diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh new file mode 100644 index 00000000000..adf75535172 --- /dev/null +++ b/cub/cub/agent/agent_merge.cuh @@ -0,0 +1,230 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include <cub/config.cuh> + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include <cub/agent/agent_merge_sort.cuh> +#include <cub/block/block_load.cuh> +#include <cub/block/block_merge_sort.cuh> +#include <cub/block/block_store.cuh> +#include <cub/util_namespace.cuh> +#include <cub/util_type.cuh> + +#include <thrust/system/cuda/detail/core/util.h> + +#include <cuda/std/iterator> + +CUB_NAMESPACE_BEGIN +namespace detail +{ +namespace merge +{ +template <int ThreadsPerBlock, int ItemsPerThread, BlockLoadAlgorithm LoadAlgorithm, CacheLoadModifier LoadCacheModifier, BlockStoreAlgorithm StoreAlgorithm> +struct agent_policy_t +{ + // do not change the data member names, policy_wrapper_t depends on them + static constexpr int BLOCK_THREADS = ThreadsPerBlock; + static constexpr int ITEMS_PER_THREAD = ItemsPerThread; + static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm; + static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier; + static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm; +}; + +// TODO(bgruber): can we unify this one with AgentMerge in agent_merge_sort.cuh?
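As a quick illustration of how a policy like agent_policy_t is meant to be instantiated (the numbers and the alias name below are hypothetical, chosen for illustration; they are not a shipped CCCL tuning):

#include <cub/agent/agent_merge.cuh>

// Hypothetical tuning, for illustration only: 256 threads per block,
// 11 items per thread, warp-transposed loads/stores, default cache modifier.
using example_merge_policy =
  cub::detail::merge::agent_policy_t<256,
                                     11,
                                     cub::BLOCK_LOAD_WARP_TRANSPOSE,
                                     cub::LOAD_DEFAULT,
                                     cub::BLOCK_STORE_WARP_TRANSPOSE>;

static_assert(example_merge_policy::ITEMS_PER_TILE == 256 * 11,
              "a tile is one block's worth of items: BLOCK_THREADS * ITEMS_PER_THREAD");

The dispatch machinery only ever reads the five static constants back out of the type, which is why the comment above insists the data member names must not change.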
+template <typename Policy, typename KeysIt1, typename ItemsIt1, typename KeysIt2, typename ItemsIt2, typename KeysOutputIt, typename ItemsOutputIt, typename Offset, typename CompareOp> +struct agent_t +{ + using policy = Policy; + + // key and value type are taken from the first input sequence (consistent with old Thrust behavior) + using key_type = typename ::cuda::std::iterator_traits<KeysIt1>::value_type; + using item_type = typename ::cuda::std::iterator_traits<ItemsIt1>::value_type; + + using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt1>::type; + using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt2>::type; + using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt1>::type; + using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt2>::type; + + using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type; + using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type; + using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type; + using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type; + + using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type; + using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type; + + union temp_storages + { + typename block_load_keys1::TempStorage load_keys1; + typename block_load_keys2::TempStorage load_keys2; + typename block_load_items1::TempStorage load_items1; + typename block_load_items2::TempStorage load_items2; + typename block_store_keys::TempStorage store_keys; + typename block_store_items::TempStorage store_items; + + key_type keys_shared[Policy::ITEMS_PER_TILE + 1]; + item_type items_shared[Policy::ITEMS_PER_TILE + 1]; + }; + + struct TempStorage : Uninitialized<temp_storages> + {}; + + static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD; + static constexpr int threads_per_block = Policy::BLOCK_THREADS; + static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE; + + // Per thread data + temp_storages& storage; + keys_load_it1 keys1_in; + items_load_it1 items1_in; + Offset keys1_count; + keys_load_it2 keys2_in; + items_load_it2 items2_in; + Offset keys2_count; + KeysOutputIt keys_out; + ItemsOutputIt items_out; + CompareOp compare_op; + Offset* merge_partitions; + + template <bool IsFullTile> + _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(Offset tile_idx, Offset tile_base, int num_remaining) + { + const Offset partition_beg = merge_partitions[tile_idx + 0]; + const Offset partition_end = merge_partitions[tile_idx + 1]; + + const Offset diag0 = items_per_tile * tile_idx; + const Offset diag1 = (cub::min)(keys1_count + keys2_count, diag0 + items_per_tile); + + // compute bounding box for keys1 & keys2 + const Offset keys1_beg = partition_beg; + const Offset keys1_end = partition_end; + const Offset keys2_beg = diag0 - keys1_beg; + const Offset keys2_end = diag1 - keys1_end; + + // number of keys per tile + const int num_keys1 = static_cast<int>(keys1_end - keys1_beg); + const int num_keys2 = static_cast<int>(keys2_end - keys2_beg); + + key_type keys_loc[items_per_thread]; + gmem_to_reg<IsFullTile, threads_per_block>( + keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2); + reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc); + CTA_SYNC(); + + // use binary search in shared memory to find the merge path for each thread.
+ // we can use int type here, because the number of items in shared memory is limited + const int diag0_loc = min(num_keys1 + num_keys2, items_per_thread * threadIdx.x); + + const int keys1_beg_loc = + MergePath<key_type>(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op); + const int keys1_end_loc = num_keys1; + const int keys2_beg_loc = diag0_loc - keys1_beg_loc; + const int keys2_end_loc = num_keys2; + + const int num_keys1_loc = keys1_end_loc - keys1_beg_loc; + const int num_keys2_loc = keys2_end_loc - keys2_beg_loc; + + // perform serial merge + int indices[items_per_thread]; + cub::SerialMerge( + &storage.keys_shared[0], + keys1_beg_loc, + keys2_beg_loc + num_keys1, + num_keys1_loc, + num_keys2_loc, + keys_loc, + indices, + compare_op); + CTA_SYNC(); + + // write keys + if (IsFullTile) + { + block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc); + } + else + { + block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc, num_remaining); + } + + // if items are provided, merge them + static constexpr bool have_items = !std::is_same<item_type, NullType>::value; +#ifdef _CCCL_CUDACC_BELOW_11_8 + if (have_items) // nvcc 11.1 cannot handle #pragma unroll inside if constexpr but 11.8 can. + // nvcc versions in between may work +#else + _CCCL_IF_CONSTEXPR (have_items) +#endif + { + item_type items_loc[items_per_thread]; + gmem_to_reg<IsFullTile, threads_per_block>( + items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2); + CTA_SYNC(); // block_store_keys above uses shared memory, so make sure all threads are done before we write to it + reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc); + CTA_SYNC(); + + // gather items from shared mem +#pragma unroll + for (int i = 0; i < items_per_thread; ++i) + { + items_loc[i] = storage.items_shared[indices[i]]; + } + CTA_SYNC(); + + // write from reg to gmem + if (IsFullTile) + { + block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc); + } + else + { + block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc, num_remaining); + } + } + } + + _CCCL_DEVICE _CCCL_FORCEINLINE void operator()() + { + // XXX with 8.5 changing type to Offset (or long long) results in error! + // TODO(bgruber): is the above still true?
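The indices array recorded by cub::SerialMerge above is what keeps the key/value path cheap: the keys are merged once, and the items are then gathered through the recorded source positions rather than re-running any comparisons. A minimal host-side sketch of that idea (illustrative only, with invented names; the device code above is the real implementation):

#include <vector>

// Merge two sorted runs stored back to back in `keys` (first run of length
// count1, second run after it, as in keys_shared above), recording for each
// output slot the index it was read from. keys_out and indices must be
// presized to keys.size(); items can then be moved with
// items_out[k] = items[indices[k]].
template <typename Key, typename Compare>
void merge_keys_recording_indices(const std::vector<Key>& keys,
                                  int count1,
                                  std::vector<Key>& keys_out,
                                  std::vector<int>& indices,
                                  Compare comp)
{
  const int total = static_cast<int>(keys.size());
  int i = 0;      // cursor into the first run
  int j = count1; // cursor into the second run
  for (int k = 0; k < total; ++k)
  {
    // Take from the first run unless it is exhausted or the second run's
    // head is strictly smaller; not taking on ties keeps the merge stable.
    const bool take_first = (j >= total) || (i < count1 && !comp(keys[j], keys[i]));
    indices[k] = take_first ? i++ : j++;
    keys_out[k] = keys[indices[k]];
  }
}

On the GPU the same gather runs through shared memory (items_loc[i] = storage.items_shared[indices[i]] above), so the data-dependent movement of potentially large value types happens exactly once.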
+ const int tile_idx = static_cast<int>(blockIdx.x); + const Offset tile_base = tile_idx * items_per_tile; + // TODO(bgruber): random mixing of int and Offset + const int items_in_tile = + static_cast<int>(cub::min(static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base)); + if (items_in_tile == items_per_tile) + { + consume_tile<true>(tile_idx, tile_base, items_per_tile); // full tile + } + else + { + consume_tile<false>(tile_idx, tile_base, items_in_tile); // partial tile + } + } +}; +} // namespace merge +} // namespace detail +CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index d7c0df7a302..123abb2b986 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -172,9 +172,9 @@ struct AgentBlockSort _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(OffsetT tile_base, int num_remaining) { ValueT items_local[ITEMS_PER_THREAD]; - if (!KEYS_ONLY) + _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockLoadItems(storage.load_items) .Load(items_in + tile_base, items_local, num_remaining, *(items_in + tile_base)); @@ -188,7 +188,7 @@ } KeyT keys_local[ITEMS_PER_THREAD]; - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockLoadKeys(storage.load_keys).Load(keys_in + tile_base, keys_local, num_remaining, *(keys_in + tile_base)); } @@ -199,7 +199,7 @@ CTA_SYNC(); - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockMergeSortT(storage.block_merge).Sort(keys_local, items_local, compare_op, num_remaining, keys_local[0]); } @@ -212,7 +212,7 @@ if (ping) { - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockStoreKeysIt(storage.store_keys_it).Store(keys_out_it + tile_base, keys_local, num_remaining); } @@ -221,11 +221,11 @@ BlockStoreKeysIt(storage.store_keys_it).Store(keys_out_it + tile_base, keys_local); } - if (!KEYS_ONLY) + _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { CTA_SYNC(); - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockStoreItemsIt(storage.store_items_it).Store(items_out_it + tile_base, items_local, num_remaining); } @@ -237,7 +237,7 @@ } else { - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockStoreKeysRaw(storage.store_keys_raw).Store(keys_out_raw + tile_base, keys_local, num_remaining); } @@ -246,11 +246,11 @@ BlockStoreKeysRaw(storage.store_keys_raw).Store(keys_out_raw + tile_base, keys_local); } - if (!KEYS_ONLY) + _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { CTA_SYNC(); - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockStoreItemsRaw(storage.store_items_raw).Store(items_out_raw + tile_base, items_local, num_remaining); } @@ -316,25 +316,25 @@ struct AgentPartition _CCCL_DEVICE _CCCL_FORCEINLINE void Process() { - OffsetT merged_tiles_number = target_merged_tiles_number / 2; + const OffsetT merged_tiles_number = target_merged_tiles_number / 2; // target_merged_tiles_number is a power of two. 
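Both the per-tile bounding boxes in consume_tile above and the AgentPartition::Process logic being const-ified here revolve around the merge-path search: given a diagonal of the two-sequence merge grid, binary-search how many elements of the first sequence precede that point in the merged output. A self-contained host-side sketch under simplifying assumptions (int counts, raw pointers, invented names; cub::MergePath is the real device-side version):

#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>

// For a diagonal `diag` of the merge grid, return how many elements of keys1
// lie on the merge path before that diagonal; diag minus the result is the
// keys2 contribution.
template <typename Key, typename Compare>
int merge_path(const Key* keys1, const Key* keys2, int count1, int count2, int diag, Compare comp)
{
  int lo = std::max(0, diag - count2); // at least diag - count2 must come from keys1
  int hi = std::min(diag, count1);     // at most all of keys1, and at most diag
  while (lo < hi)
  {
    const int mid = (lo + hi) / 2; // candidate split: mid from keys1, diag - mid from keys2
    if (comp(keys1[mid], keys2[diag - 1 - mid]))
    {
      lo = mid + 1; // keys1[mid] merges before keys2[diag - 1 - mid]: move right
    }
    else
    {
      hi = mid;
    }
  }
  return lo;
}

int main()
{
  const std::vector<int> a{1, 3, 5, 7};
  const std::vector<int> b{2, 4, 6, 8};
  // Diagonal 4 splits the 8-element merge evenly: two elements (1, 3) come
  // from a and two (2, 4) come from b.
  assert(merge_path(a.data(), b.data(), 4, 4, 4, std::less<int>{}) == 2);
  return 0;
}

Because each diagonal can be searched independently, one thread (or one thread block) per diagonal suffices, which is exactly how the partition kernel parallelizes the otherwise sequential merge.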
- OffsetT mask = target_merged_tiles_number - 1; + const OffsetT mask = target_merged_tiles_number - 1; // The first tile number in the tiles group being merged, equal to: // target_merged_tiles_number * (partition_idx / target_merged_tiles_number) - OffsetT list = ~mask & partition_idx; - OffsetT start = items_per_tile * list; - OffsetT size = items_per_tile * merged_tiles_number; + const OffsetT list = ~mask & partition_idx; + const OffsetT start = items_per_tile * list; + const OffsetT size = items_per_tile * merged_tiles_number; // Tile number within the tile group being merged, equal to: // partition_idx / target_merged_tiles_number - OffsetT local_tile_idx = mask & partition_idx; + const OffsetT local_tile_idx = mask & partition_idx; - OffsetT keys1_beg = (cub::min)(keys_count, start); - OffsetT keys1_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(start, size)); - OffsetT keys2_beg = keys1_end; - OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); + const OffsetT keys1_beg = (cub::min)(keys_count, start); + const OffsetT keys1_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(start, size)); + const OffsetT keys2_beg = keys1_end; + const OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); // The last partition (which is one-past-the-last-tile) is only to mark the end of keys1_end for the merge stage if (partition_idx + 1 == num_partitions) @@ -343,30 +343,77 @@ } else { - OffsetT partition_at = (cub::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); + const OffsetT partition_at = (cub::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); OffsetT partition_diag = ping - ? MergePath<KeyT>( - keys_ping + keys1_beg, - keys_ping + keys2_beg, - keys1_end - keys1_beg, - keys2_end - keys2_beg, - partition_at, - compare_op) - : MergePath<KeyT>( - keys_pong + keys1_beg, - keys_pong + keys2_beg, - keys1_end - keys1_beg, - keys2_end - keys2_beg, - partition_at, - compare_op); + ? MergePath<KeyT>(keys_ping + keys1_beg, + keys_ping + keys2_beg, + keys1_end - keys1_beg, + keys2_end - keys2_beg, + partition_at, + compare_op) + : MergePath<KeyT>(keys_pong + keys1_beg, + keys_pong + keys2_beg, + keys1_end - keys1_beg, + keys2_end - keys2_beg, + partition_at, + compare_op); merge_partitions[partition_idx] = keys1_beg + partition_diag; } } }; +namespace detail +{ +/** + * \brief Concatenates up to ITEMS_PER_THREAD elements from input{1,2} into output array + * + * Reads data in a coalesced fashion [BLOCK_THREADS * item + tid] and + * stores the result in output[item]. + */ +template <bool IS_FULL_TILE, int BLOCK_THREADS, int ITEMS_PER_THREAD, typename T, typename It1, typename It2> +_CCCL_DEVICE _CCCL_FORCEINLINE void +gmem_to_reg(T (&output)[ITEMS_PER_THREAD], It1 input1, It2 input2, int count1, int count2) +{ + _CCCL_IF_CONSTEXPR (IS_FULL_TILE) + { +#pragma unroll + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + const int idx = BLOCK_THREADS * item + threadIdx.x; + // It1 and It2 could have different value types. Convert after load. + output[item] = (idx < count1) ? static_cast<T>(input1[idx]) : static_cast<T>(input2[idx - count1]); + } + } + else + { +#pragma unroll + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + const int idx = BLOCK_THREADS * item + threadIdx.x; + if (idx < count1 + count2) + { + output[item] = (idx < count1) ? static_cast<T>(input1[idx]) : static_cast<T>(input2[idx - count1]); + } + } + } +} + +/// \brief Stores data in a coalesced fashion in[item] -> out[BLOCK_THREADS * item + tid] +template <int BLOCK_THREADS, typename It, typename T, int ITEMS_PER_THREAD> +_CCCL_DEVICE _CCCL_FORCEINLINE void reg_to_shared(It output, T (&input)[ITEMS_PER_THREAD]) +{ +#pragma unroll + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + const int idx = BLOCK_THREADS * item + threadIdx.x; + output[idx] = input[item]; + } +} +} // namespace detail + /// \brief The agent is responsible for merging N consecutive sorted arrays into N/2 sorted arrays. template - _CCCL_DEVICE _CCCL_FORCEINLINE void - gmem_to_reg(T (&output)[ITEMS_PER_THREAD], It1 input1, It2 input2, int count1, int count2) - { - if (IS_FULL_TILE) - { -#pragma unroll - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - int idx = BLOCK_THREADS * item + threadIdx.x; - output[item] = (idx < count1) ? input1[idx] : input2[idx - count1]; - } - } - else - { -#pragma unroll - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - int idx = BLOCK_THREADS * item + threadIdx.x; - if (idx < count1 + count2) - { - output[item] = (idx < count1) ? input1[idx] : input2[idx - count1]; - } - } - } - } - - /// \brief Stores data in a coalesced fashion in[item] -> out[BLOCK_THREADS * item + tid] - template <class It, class T> - _CCCL_DEVICE _CCCL_FORCEINLINE void reg_to_shared(It output, T (&input)[ITEMS_PER_THREAD]) - { -#pragma unroll - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - int idx = BLOCK_THREADS * item + threadIdx.x; - output[idx] = input[item]; - } - } - template <bool IS_FULL_TILE> _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(int tid, OffsetT tile_idx, OffsetT tile_base, int count) { - OffsetT partition_beg = merge_partitions[tile_idx + 0]; - OffsetT partition_end = merge_partitions[tile_idx + 1]; + const OffsetT partition_beg = merge_partitions[tile_idx + 0]; + const OffsetT partition_end = merge_partitions[tile_idx + 1]; // target_merged_tiles_number is a power of two. - OffsetT merged_tiles_number = target_merged_tiles_number / 2; + const OffsetT merged_tiles_number = target_merged_tiles_number / 2; - OffsetT mask = target_merged_tiles_number - 1; + const OffsetT mask = target_merged_tiles_number - 1; // The first tile number in the tiles group being merged, equal to: // target_merged_tiles_number * (tile_idx / target_merged_tiles_number) - OffsetT list = ~mask & tile_idx; - OffsetT start = ITEMS_PER_TILE * list; - OffsetT size = ITEMS_PER_TILE * merged_tiles_number; + const OffsetT list = ~mask & tile_idx; + const OffsetT start = ITEMS_PER_TILE * list; + const OffsetT size = ITEMS_PER_TILE * merged_tiles_number; - OffsetT diag = ITEMS_PER_TILE * tile_idx - start; + const OffsetT diag = ITEMS_PER_TILE * tile_idx - start; - OffsetT keys1_beg = partition_beg - start; - OffsetT keys1_end = partition_end - start; + const OffsetT keys1_beg = partition_beg - start; + OffsetT keys1_end = partition_end - start; - OffsetT keys_end_dist_from_start = keys_count - start; - OffsetT max_keys2 = (keys_end_dist_from_start > size) ? (keys_end_dist_from_start - size) : 0; + const OffsetT keys_end_dist_from_start = keys_count - start; + const OffsetT max_keys2 = (keys_end_dist_from_start > size) ? 
(keys_end_dist_from_start - size) : 0; // We have the following invariants: // diag >= keys1_beg, because diag is the distance of the total merge path so far (keys1 + keys2) // diag+ITEMS_PER_TILE >= keys1_end, because diag+ITEMS_PER_TILE is the distance of the merge path for the next tile // and keys1_end is key1's component of that path - OffsetT keys2_beg = (cub::min)(max_keys2, diag - keys1_beg); + const OffsetT keys2_beg = (cub::min)(max_keys2, diag - keys1_beg); OffsetT keys2_end = (cub::min)(max_keys2, detail::safe_add_bound_to_max(diag, static_cast<OffsetT>(ITEMS_PER_TILE)) - keys1_end); @@ -530,32 +532,32 @@ struct AgentMerge } // number of keys per tile - // - int num_keys1 = static_cast<int>(keys1_end - keys1_beg); - int num_keys2 = static_cast<int>(keys2_end - keys2_beg); + const int num_keys1 = static_cast<int>(keys1_end - keys1_beg); + const int num_keys2 = static_cast<int>(keys2_end - keys2_beg); // load keys1 & keys2 KeyT keys_local[ITEMS_PER_THREAD]; if (ping) { - gmem_to_reg<IS_FULL_TILE>( + detail::gmem_to_reg<IS_FULL_TILE, BLOCK_THREADS>( keys_local, keys_in_ping + start + keys1_beg, keys_in_ping + start + size + keys2_beg, num_keys1, num_keys2); } else { - gmem_to_reg<IS_FULL_TILE>( + detail::gmem_to_reg<IS_FULL_TILE, BLOCK_THREADS>( keys_local, keys_in_pong + start + keys1_beg, keys_in_pong + start + size + keys2_beg, num_keys1, num_keys2); } - reg_to_shared(&storage.keys_shared[0], keys_local); + detail::reg_to_shared<BLOCK_THREADS>(&storage.keys_shared[0], keys_local); // preload items into registers already // ValueT items_local[ITEMS_PER_THREAD]; - if (!KEYS_ONLY) + (void) items_local; // TODO(bgruber): replace by [[maybe_unused]] in C++17 + _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { if (ping) { - gmem_to_reg<IS_FULL_TILE>( + detail::gmem_to_reg<IS_FULL_TILE, BLOCK_THREADS>( items_local, items_in_ping + start + keys1_beg, items_in_ping + start + size + keys2_beg, @@ -564,7 +566,7 @@ } else { - gmem_to_reg<IS_FULL_TILE>( + detail::gmem_to_reg<IS_FULL_TILE, BLOCK_THREADS>( items_local, items_in_pong + start + keys1_beg, items_in_pong + start + size + keys2_beg, @@ -580,16 +582,16 @@ // we can use int type here, because the number of // items in shared memory is limited // - int diag0_local = (cub::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); + const int diag0_local = (cub::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); - int keys1_beg_local = MergePath<KeyT>( + const int keys1_beg_local = MergePath<KeyT>( &storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_local, compare_op); - int keys1_end_local = num_keys1; - int keys2_beg_local = diag0_local - keys1_beg_local; - int keys2_end_local = num_keys2; + const int keys1_end_local = num_keys1; + const int keys2_beg_local = diag0_local - keys1_beg_local; + const int keys2_end_local = num_keys2; - int num_keys1_local = keys1_end_local - keys1_beg_local; - int num_keys2_local = keys2_end_local - keys2_beg_local; + const int num_keys1_local = keys1_end_local - keys1_beg_local; + const int num_keys2_local = keys2_end_local - keys2_beg_local; // perform serial merge // @@ -608,10 +610,9 @@ CTA_SYNC(); // write keys - // if (ping) { - if (IS_FULL_TILE) + _CCCL_IF_CONSTEXPR (IS_FULL_TILE) { BlockStoreKeysPing(storage.store_keys_ping).Store(keys_out_ping + tile_base, keys_local); } @@ -622,7 +623,7 @@ } else { - if (IS_FULL_TILE) + _CCCL_IF_CONSTEXPR (IS_FULL_TILE) { BlockStoreKeysPong(storage.store_keys_pong).Store(keys_out_pong + tile_base, keys_local); } @@ -633,11 +634,16 @@ } // if items are provided, merge them - if (!KEYS_ONLY) +#ifdef _CCCL_CUDACC_BELOW_11_8 + if (!KEYS_ONLY) // nvcc 11.1 cannot handle #pragma unroll 
inside if constexpr but 11.8 can. + // nvcc versions between may work +#else + _CCCL_IF_CONSTEXPR (!KEYS_ONLY) +#endif { CTA_SYNC(); - reg_to_shared(&storage.items_shared[0], items_local); + detail::reg_to_shared(&storage.items_shared[0], items_local); CTA_SYNC(); @@ -655,7 +661,7 @@ struct AgentMerge // if (ping) { - if (IS_FULL_TILE) + _CCCL_IF_CONSTEXPR (IS_FULL_TILE) { BlockStoreItemsPing(storage.store_items_ping).Store(items_out_ping + tile_base, items_local); } @@ -666,7 +672,7 @@ struct AgentMerge } else { - if (IS_FULL_TILE) + _CCCL_IF_CONSTEXPR (IS_FULL_TILE) { BlockStoreItemsPong(storage.store_items_pong).Store(items_out_pong + tile_base, items_local); } @@ -711,11 +717,12 @@ struct AgentMerge _CCCL_DEVICE _CCCL_FORCEINLINE void Process() { - int tile_idx = static_cast(blockIdx.x); - int num_tiles = static_cast(gridDim.x); - OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE; - int tid = static_cast(threadIdx.x); - int items_in_tile = static_cast((cub::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); + const int tile_idx = static_cast(blockIdx.x); + const int num_tiles = static_cast(gridDim.x); + const OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE; + const int tid = static_cast(threadIdx.x); + const int items_in_tile = + static_cast((cub::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); if (tile_idx < num_tiles - 1) { diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index b75016ac7fa..2fb15e9059b 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -28,7 +28,7 @@ /** * @file - * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for + * The cub::BlockDiscontinuity class provides [collective](../index.html#sec0) methods for * flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. */ diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index 256c7fb4888..a781d68e68b 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file The cub::BlockExchange class provides :ref:`collective ` methods for -//! rearranging data partitioned across a CUDA thread block. +//! @file +//! The cub::BlockExchange class provides :ref:`collective ` methods for +//! rearranging data partitioned across a CUDA thread block. #pragma once @@ -55,11 +56,10 @@ CUB_NAMESPACE_BEGIN //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! - It is commonplace for blocks of threads to rearrange data items between -//! threads. For example, the device-accessible memory subsystem prefers access patterns -//! where data items are "striped" across threads (where consecutive threads access consecutive items), -//! yet most block-wide operations prefer a "blocked" partitioning of items across threads -//! (where consecutive items belong to a single thread). +//! 
- It is commonplace for blocks of threads to rearrange data items between threads. For example, the +//! device-accessible memory subsystem prefers access patterns where data items are "striped" across threads (where +//! consecutive threads access consecutive items), yet most block-wide operations prefer a "blocked" partitioning of +//! items across threads (where consecutive items belong to a single thread). //! - BlockExchange supports the following types of data exchanges: //! //! - Transposing between :ref:`blocked ` and :ref:`striped ` @@ -76,8 +76,8 @@ CUB_NAMESPACE_BEGIN //! //! @blockcollective{BlockExchange} //! -//! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement -//! of 512 integer items partitioned across 128 threads where each thread owns 4 items. +//! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement of 512 integer items +//! partitioned across 128 threads where each thread owns 4 items. //! //! .. code-block:: c++ //! @@ -98,9 +98,8 @@ CUB_NAMESPACE_BEGIN //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).StripedToBlocked(thread_data); //! -//! Suppose the set of striped input ``thread_data`` across the block of threads is -//! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }``. -//! The corresponding output ``thread_data`` in those threads will be +//! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384], +//! [1,129,257,385], ..., [127,255,383,511] }``. The corresponding output ``thread_data`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! //! Performance Considerations @@ -112,33 +111,33 @@ CUB_NAMESPACE_BEGIN //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with -//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to -//! the storage required by BlockExchange. +//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage required +//! by BlockExchange. //! @endrst //! //! @tparam T -//! The data type to be exchanged +//! The data type to be exchanged //! //! @tparam BLOCK_DIM_X -//! The thread block length in threads along the X dimension +//! The thread block length in threads along the X dimension //! //! @tparam ITEMS_PER_THREAD -//! The number of items partitioned onto each thread. +//! The number of items partitioned onto each thread. //! //! @tparam WARP_TIME_SLICING -//! **[optional]** When `true`, only use enough shared memory for a single warp's worth of tile data, -//! time-slicing the block-wide exchange over multiple synchronized rounds. -//! Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) +//! **[optional]** When `true`, only use enough shared memory for a single warp's worth of +//! tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint +//! at the expense of decreased parallelism. (Default: false) //! //! @tparam BLOCK_DIM_Y -//! **[optional]** The thread block length in threads along the Y dimension (default: 1) +//! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z -//! 
**[optional]** The thread block length in threads along the Z dimension (default: 1) +//! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH -//! [optional] Unused. -template [optional] Unused. +template class BlockExchange { -private: - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0), - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, - - TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, - TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), - WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise - // we can typically use 128b loads) - INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), - PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, - }; + static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; ///< The thread block size in threads + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static constexpr int WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS; // TODO(bgruber): use ceil_div in + // C++14 + static constexpr int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0); + + static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; + static constexpr int TIME_SLICES = WARP_TIME_SLICING ? WARPS : 1; + static constexpr int TIME_SLICED_THREADS = WARP_TIME_SLICING ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS; + static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD; + static constexpr int WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS); + static constexpr int WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD; + + // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise + // we can typically use 128b loads) + static constexpr bool INSERT_PADDING = ITEMS_PER_THREAD > 4 && PowerOfTwo::VALUE; + static constexpr int PADDING_ITEMS = INSERT_PADDING ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0; /// Shared memory storage layout type - struct __align__(16) _TempStorage + struct alignas(16) _TempStorage { - InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; + T buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; }; public: /// @smemstorage{BlockExchange} - struct TempStorage : Uninitialized<_TempStorage> - {}; + using TempStorage = Uninitialized<_TempStorage>; private: - /// Shared storage reference _TempStorage& temp_storage; - /// Linear thread-id - unsigned int linear_tid; - unsigned int lane_id; - unsigned int warp_id; - unsigned int warp_offset; + // TODO(bgruber): can we use signed int here? Only these variables are unsigned: + unsigned int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); + unsigned int lane_id = LaneId(); + unsigned int warp_id = WARPS == 1 ? 
0 : linear_tid / WARP_THREADS; + unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() @@ -205,8 +190,8 @@ private: return private_storage; } - //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. - //! Specialized for no timeslicing. + //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. Specialized for no + //! timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -215,35 +200,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Transposes data items from **blocked** arrangement to **striped** - //! arrangement. Specialized for warp-timeslicing. + //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. Specialized for + //! warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -252,51 +239,51 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
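The no-timeslicing BlockedToStriped above is a two-phase staging pattern: each thread writes its items to shared memory at `linear_tid * ITEMS_PER_THREAD + i`, the block synchronizes, and each thread reads back at `i * BLOCK_THREADS + linear_tid`. A minimal host-side sketch of that index mapping, using hypothetical sizes (4 threads, 2 items per thread) rather than anything from this diff:

```cpp
#include <cassert>
#include <vector>

int main()
{
  // Hypothetical configuration, not taken from the diff.
  constexpr int BLOCK_THREADS = 4, ITEMS_PER_THREAD = 2;
  std::vector<int> buff(BLOCK_THREADS * ITEMS_PER_THREAD);

  // Phase 1: each "thread" writes its blocked items; item i of thread tid
  // lands at buff[tid * ITEMS_PER_THREAD + i].
  for (int tid = 0; tid < BLOCK_THREADS; ++tid)
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
      buff[tid * ITEMS_PER_THREAD + i] = tid * ITEMS_PER_THREAD + i; // values 0..7, blocked

  // (CTA_SYNC() happens here on the device.)

  // Phase 2: each "thread" reads a striped arrangement from buff[i * BLOCK_THREADS + tid].
  // Thread 0 ends up with {0, 4}, thread 1 with {1, 5}, and so on.
  for (int tid = 0; tid < BLOCK_THREADS; ++tid)
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
      assert(buff[i * BLOCK_THREADS + tid] == i * BLOCK_THREADS + tid);
  return 0;
}
```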
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } @@ -304,14 +291,14 @@ private: // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. - //! Specialized for no timeslicing + //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. Specialized for no + //! timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -320,35 +307,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
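The INSERT_PADDING branches threaded through these loops skew each shared-memory offset by `offset >> LOG_SMEM_BANKS`, so that power-of-two strides stop landing in the same bank. A standalone sketch of the arithmetic, assuming 32 banks, one bank word per item, and 8 items per thread (hypothetical values; the real constants come from CUB_LOG_SMEM_BANKS):

```cpp
#include <cstdio>

int main()
{
  // Hypothetical: 32 banks (LOG_SMEM_BANKS = 5), 8 items per thread, i.e. a
  // power of two > 4, so INSERT_PADDING would be true for the class above.
  constexpr int LOG_SMEM_BANKS = 5, SMEM_BANKS = 1 << LOG_SMEM_BANKS, ITEMS_PER_THREAD = 8;

  for (int tid = 0; tid < 8; ++tid)
  {
    const int raw    = tid * ITEMS_PER_THREAD;           // item 0 of each thread, unpadded
    const int padded = raw + (raw >> LOG_SMEM_BANKS);    // the skew applied under INSERT_PADDING
    std::printf("tid %d: bank %2d (unpadded) -> bank %2d (padded)\n",
                tid, raw % SMEM_BANKS, padded % SMEM_BANKS);
  }
  // Unpadded, tids 0 and 4 both hit bank 0 (likewise 1/5, 2/6, 3/7);
  // padded, tid 4 moves to bank 1 and the conflicts disappear.
  return 0;
}
```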
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = warp_offset + i + (lane_id * ITEMS_PER_THREAD); + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. - //! Specialized for warp-timeslicing + //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. Specialized for + //! warp-timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -357,71 +346,71 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
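For the warp-striped exchanges above, each warp stays entirely within its own `warp_offset` slice of the buffer, which is why a WARP_SYNC suffices instead of a block-wide barrier. A host-side sketch of the write/read index pair, shrunk to a hypothetical 4-lane warp for readability:

```cpp
#include <cassert>
#include <vector>

int main()
{
  // Hypothetical single-warp view: 4 lanes, 2 items per lane (real warps have 32 lanes).
  constexpr int WARP_THREADS = 4, ITEMS_PER_THREAD = 2;
  const int warp_offset = 0; // first warp's slice of the buffer
  std::vector<int> buff(WARP_THREADS * ITEMS_PER_THREAD);

  // Write blocked items into the warp's slice: buff[warp_offset + i + lane * ITEMS_PER_THREAD].
  for (int lane = 0; lane < WARP_THREADS; ++lane)
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
      buff[warp_offset + i + lane * ITEMS_PER_THREAD] = lane * ITEMS_PER_THREAD + i;

  // Only WARP_SYNC(0xffffffff) is needed on the device: no other warp touches this slice.

  // Read back warp-striped: buff[warp_offset + i * WARP_THREADS + lane].
  for (int lane = 0; lane < WARP_THREADS; ++lane)
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
      assert(buff[warp_offset + i * WARP_THREADS + lane] == i * WARP_THREADS + lane);
  return 0;
}
```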
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { if (warp_id == 0) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } #pragma unroll - for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) + for (int slice = 1; slice < TIME_SLICES; ++slice) { CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } } } - //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. - //! Specialized for no timeslicing. + //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. Specialized for no + //! timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -430,36 +419,38 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
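The cost model behind the WARP_TIME_SLICING specializations: the staging buffer shrinks from a full tile to one warp's worth of items, paid for with TIME_SLICES serialized rounds separated by barriers. A small sketch of that footprint trade-off, using hypothetical launch parameters:

```cpp
#include <algorithm>
#include <cstdio>

// Footprint sketch for hypothetical parameters (128 threads, 8 items, 32-lane
// warps); mirrors the constants above but is not taken verbatim from the diff.
int main()
{
  constexpr int BLOCK_THREADS = 128, ITEMS_PER_THREAD = 8, WARP_THREADS = 32;
  constexpr int WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS;

  // WARP_TIME_SLICING == false: buffer one full tile at once.
  constexpr int full_items = BLOCK_THREADS * ITEMS_PER_THREAD;

  // WARP_TIME_SLICING == true: buffer one warp's worth, processed in WARPS rounds.
  constexpr int sliced_items = std::min(BLOCK_THREADS, WARP_THREADS) * ITEMS_PER_THREAD;

  std::printf("no slicing: %d staged items; slicing: %d staged items over %d rounds\n",
              full_items, sliced_items, WARPS); // 1024 vs. 256 over 4 rounds here
  return 0;
}
```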
template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); // No timeslicing #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. Specialized for + //! warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -468,67 +459,67 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
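Putting the striped-to-blocked path to work: a minimal kernel sketch, assuming only the public API already shown in this header's documentation snippets, that loads a tile with coalesced striped accesses and exchanges into blocked order for per-thread processing. The 128-thread, 4-items-per-thread shape matches those snippets; any other shape works the same way:

```cpp
#include <cub/block/block_exchange.cuh>

// Launch with 128 threads and 512-element buffers, e.g. <<<1, 128>>>.
__global__ void striped_load_kernel(const int* d_in, int* d_out)
{
  using BlockExchangeT = cub::BlockExchange<int, 128, 4>;
  __shared__ typename BlockExchangeT::TempStorage temp_storage;

  // Coalesced striped load: thread t reads elements t, t+128, t+256, t+384.
  int thread_data[4];
#pragma unroll
  for (int i = 0; i < 4; ++i)
    thread_data[i] = d_in[i * 128 + threadIdx.x];

  // Exchange to a blocked arrangement: thread t now owns elements 4*t .. 4*t+3.
  BlockExchangeT(temp_storage).StripedToBlocked(thread_data, thread_data);

  // Blocked per-thread work would go here; store back in blocked order.
#pragma unroll
  for (int i = 0; i < 4; ++i)
    d_out[threadIdx.x * 4 + i] = thread_data[i];
}
```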
template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { // Warp time-slicing - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Write a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } } CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. - //! Specialized for no timeslicing + //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. Specialized for no + //! timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -537,35 +528,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
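The warp-timeslicing variants repeatedly evaluate `slice_offset < strip_oob && slice_oob > strip_offset`, a half-open interval intersection that decides whether a given strip of items touches the slice currently staged in shared memory. A quick host-side enumeration of that test, with hypothetical sizes chosen so each strip hits exactly one slice:

```cpp
#include <cstdio>

int main()
{
  // Hypothetical: 8-item slices, 4 "threads", 4 items per thread (not from the diff).
  constexpr int TIME_SLICED_ITEMS = 8, BLOCK_THREADS = 4, ITEMS_PER_THREAD = 4;
  for (int slice = 0; slice < 2; ++slice)
  {
    const int slice_offset = slice * TIME_SLICED_ITEMS;
    const int slice_oob    = slice_offset + TIME_SLICED_ITEMS;
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
      const int strip_offset = i * BLOCK_THREADS;
      const int strip_oob    = strip_offset + BLOCK_THREADS;
      // Same half-open window test as in the timeslicing loops above.
      const bool hit = slice_offset < strip_oob && slice_oob > strip_offset;
      std::printf("slice %d, strip %d: [%2d,%2d) vs [%2d,%2d) -> %s\n",
                  slice, i, strip_offset, strip_oob, slice_offset, slice_oob,
                  hit ? "intersects" : "skip");
    }
  }
  // Slice 0 ([0,8)) intersects strips 0 and 1; slice 1 ([8,16)) intersects strips 2 and 3.
  return 0;
}
```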
template _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = warp_offset + i + (lane_id * ITEMS_PER_THREAD); + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(output_items + ITEM, temp_storage.buff[item_offset]); + detail::uninitialized_copy_single(output_items + i, temp_storage.buff[item_offset]); } } - //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. - //! Specialized for warp-timeslicing + //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. Specialized for + //! warp-timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -574,44 +567,43 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + for (int slice = 0; slice < TIME_SLICES; ++slice) { CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } } } - //! @brief Exchanges data items annotated by rank into **blocked** arrangement. - //! Specialized for no timeslicing. + //! 
@brief Exchanges data items annotated by rank into **blocked** arrangement. Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -623,38 +615,37 @@ private: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Exchanges data items annotated by rank into **blocked** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Exchanges data items annotated by rank into **blocked** arrangement. Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -666,61 +657,60 @@ private: //! 
Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { CTA_SYNC(); - const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; + const int slice_offset = TIME_SLICED_ITEMS * slice; #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + int item_offset = ranks[i] - slice_offset; + if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Exchanges data items annotated by rank into **striped** arrangement. - //! Specialized for no timeslicing. + //! @brief Exchanges data items annotated by rank into **striped** arrangement. Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -732,38 +722,37 @@ private: //! 
Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Exchanges data items annotated by rank into **striped** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Exchanges data items annotated by rank into **striped** arrangement. Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -775,54 +764,54 @@ private: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + int item_offset = ranks[i] - slice_offset; + if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if 
((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } @@ -830,9 +819,9 @@ private: // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } @@ -840,29 +829,15 @@ public: //! @name Collective constructors //! @{ - /** - * @brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ + //! @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockExchange() : temp_storage(PrivateStorage()) - , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - , lane_id(LaneId()) - , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} - /** - * @brief Collective constructor using the specified memory allocation as temporary storage. - * - * @param[in] temp_storage - * Reference to memory allocation having layout type TempStorage - */ + //! @brief Collective constructor using the specified memory allocation as temporary storage. + //! @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE BlockExchange(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - , lane_id(LaneId()) - , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} //! @} end member group @@ -899,10 +874,9 @@ public: //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); //! - //! Suppose the set of striped input ``thread_data`` across the block of threads is - //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` after loading from - //! device-accessible memory. The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. + //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384], + //! [1,129,257,385], ..., [127,255,383,511] }`` after loading from device-accessible memory. The corresponding output + //! ``thread_data`` in those threads will be ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! @endrst //! //! @param[in] input_items @@ -912,7 +886,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + StripedToBlocked(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { StripedToBlocked(input_items, output_items, Int2Type()); } @@ -950,11 +924,10 @@ public: //! // Store data striped across block threads into an ordered tile //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); //! - //! Suppose the set of blocked input ``thread_data`` across the block of threads is - //! 
``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` in - //! preparation for storing to device-accessible memory. + //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7], + //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be + //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` in preparation for storing to device-accessible + //! memory. //! @endrst //! //! @param[in] input_items @@ -964,7 +937,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + BlockedToStriped(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToStriped(input_items, output_items, Int2Type()); } @@ -1001,12 +974,11 @@ public: //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); //! - //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is - //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` - //! after loading from device-accessible memory. (The first 128 items are striped across - //! the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. + //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is ``{ [0,32,64,96], + //! [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` after loading from device-accessible memory. (The first 128 + //! items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, + //! etc.) The corresponding output ``thread_data`` in those threads will be ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], + //! ..., [508,509,510,511] }``. //! @endrst //! //! @param[in] input_items @@ -1016,7 +988,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + WarpStripedToBlocked(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(input_items, output_items, Int2Type()); } @@ -1056,12 +1028,11 @@ public: //! // Store data striped across warp threads into an ordered tile //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); //! - //! Suppose the set of blocked input ``thread_data`` across the block of threads is - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` - //! in preparation for storing to device-accessible memory. (The first 128 items are striped - //! across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + //! 
Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7], + //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be + //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` in preparation for storing to + //! device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 + //! items are striped across the second warp, etc.) //! @endrst //! //! @param[in] input_items @@ -1071,7 +1042,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + BlockedToWarpStriped(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(input_items, output_items, Int2Type()); } @@ -1099,7 +1070,7 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { @@ -1126,7 +1097,7 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { @@ -1153,35 +1124,35 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedGuarded( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - if (ranks[ITEM] >= 0) + if (ranks[i] >= 0) { - temp_storage.buff[item_offset] = input_items[ITEM]; + temp_storage.buff[item_offset] = input_items[i]; } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } @@ -1211,36 +1182,36 @@ public: //! 
Corresponding flag denoting item validity template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - if (is_valid[ITEM]) + if (is_valid[i]) { - temp_storage.buff[item_offset] = input_items[ITEM]; + temp_storage.buff[item_offset] = input_items[i]; } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } @@ -1248,97 +1219,75 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(T (&items)[ITEMS_PER_THREAD]) { StripedToBlocked(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(T (&items)[ITEMS_PER_THREAD]) { BlockedToStriped(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(T (&items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(T (&items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. 
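ScatterToStripedGuarded and ScatterToStripedFlagged differ from the plain rank-based scatter only in skipping the shared-memory write for invalid items (a negative rank, or a false flag). A host-side sketch of the flagged write phase, with hypothetical data; note that the skipped item leaves its destination slot untouched:

```cpp
#include <cstdio>
#include <vector>

int main()
{
  // Hypothetical: 4 items, one per "thread"; not taken from the diff.
  constexpr int N = 4;
  const int  items[N]    = {10, 20, 30, 40};
  const int  ranks[N]    = {1, 0, 2, 2};                  // item 3 would collide with item 2
  const bool is_valid[N] = {true, true, true, false};     // so it is flagged invalid

  std::vector<int> buff(N, -1);
  for (int i = 0; i < N; ++i)
    if (is_valid[i])             // the guarded variant instead tests ranks[i] >= 0
      buff[ranks[i]] = items[i]; // scatter phase (CTA_SYNC follows on the device)

  for (int v : buff)
    std::printf("%d ", v); // prints: 20 10 30 -1
  std::printf("\n");
  return 0;
}
```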
+ /// + /// @param[in] ranks + /// Corresponding scatter ranks template - _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToBlocked(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToBlocked(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks template - _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToStripedGuarded(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + ScatterToStripedGuarded(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStripedGuarded(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - * - * @param[in] is_valid - * Corresponding flag denoting item validity - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks + /// @param[in] is_valid + /// Corresponding flag denoting item validity template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged( - InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) + T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks, is_valid); } diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh index 3553ec79da6..d5726f240f6 100644 --- a/cub/cub/block/block_histogram.cuh +++ b/cub/cub/block/block_histogram.cuh @@ -28,7 +28,7 @@ /** * @file - * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for + * The cub::BlockHistogram class provides [collective](../index.html#sec0) methods for * constructing block-wide histograms from data samples partitioned across a CUDA thread block. */ diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 87adeb54515..76c073f1b54 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -26,7 +26,7 @@ * ******************************************************************************/ -//! @file Operations for reading linear tiles of data into the CUDA thread block. +//! @file block_load.cuh Operations for reading linear tiles of data into the CUDA thread block. #pragma once @@ -54,7 +54,6 @@ CUB_NAMESPACE_BEGIN //! 
Load a linear segment of items into a blocked arrangement across the thread block. //! //! @blocked -//! //! @endrst //! //! @tparam T @@ -63,27 +62,27 @@ CUB_NAMESPACE_BEGIN //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr +//! @param[in] block_src_it //! The thread block's base input iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { // Load directly in thread-blocked order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; + dst_items[i] = block_src_it[linear_tid * ITEMS_PER_THREAD + i]; } } @@ -100,31 +99,32 @@ LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items -//! Number of valid items to load -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +//! @param[in] block_items_end +//! First out-of-bounds index when loading from block_src_it +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) + const auto src_pos = linear_tid * ITEMS_PER_THREAD + i; + if (src_pos < block_items_end) { - items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; + dst_items[i] = block_src_it[src_pos]; } } } @@ -143,35 +143,39 @@ LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! 
@tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr +//! @param[in] block_src_it //! The thread block's base input iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items -//! Number of valid items to load +//! @param[in] block_items_end +//! First out-of-bounds index when loading from block_src_it //! //! @param[in] oob_default //! Default value to assign out-of-bound items -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document @@ -179,58 +183,44 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( //! @brief Internal implementation for load vectorization //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_ptr +//! @param[in] block_src_ptr //! Input pointer for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into template _CCCL_DEVICE _CCCL_FORCEINLINE void -InternalLoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD]) +InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - // Biggest memory access word that T is a whole multiple of - using DeviceWord = typename UnitWord::DeviceWord; - + // Find biggest memory access word that T is a whole multiple of + using device_word_t = typename UnitWord::DeviceWord; _CCCL_DIAG_PUSH # if defined(CUB_CLANG_VERSION) && CUB_CLANG_VERSION >= 100000 _CCCL_DIAG_SUPPRESS_CLANG("-Wsizeof-array-div") # endif // defined(CUB_CLANG_VERSION) && CUB_CLANG_VERSION >= 100000 - enum - { - TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), - - VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? 4 - : (TOTAL_WORDS % 2 == 0) ? 2 - : 1, - - VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, - }; + constexpr int total_words = static_cast(sizeof(dst_items) / sizeof(device_word_t)); _CCCL_DIAG_POP + constexpr int vector_size = (total_words % 4 == 0) ? 4 : (total_words % 2 == 0) ? 
2 : 1; + constexpr int vectors_per_thread = total_words / vector_size; + using vector_t = typename CubVector::Type; - // Vector type - using Vector = typename CubVector::Type; - - // Vector items - Vector vec_items[VECTORS_PER_THREAD]; - - // Aliased input ptr - Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); - -// Load directly in thread-blocked order + // Load into an array of vectors in thread-blocked order + vector_t vec_items[vectors_per_thread]; + const vector_t* vec_ptr = reinterpret_cast(block_src_ptr) + linear_tid * vectors_per_thread; # pragma unroll - for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) + for (int i = 0; i < vectors_per_thread; i++) { - vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); + vec_items[i] = ThreadLoad(vec_ptr + i); } -// Copy +// Copy to destination # pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); + dst_items[i] = *(reinterpret_cast(vec_items) + i); } } @@ -258,19 +248,19 @@ InternalLoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITE //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_ptr -//! Input pointer for loading from +//! @param[in] block_src_ptr +//! The thread block's base pointer for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! destination to load data into template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD]) +LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_src_ptr, dst_items); } //! @} end member group @@ -293,43 +283,41 @@ LoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_T //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! 
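The constants above fully determine the generated load width. Working the arithmetic for `T = int` and `ITEMS_PER_THREAD = 4` (illustrative values, matching the `ld.global.v4.s32` example later in this file):

```cpp
// Sketch: the vector-width arithmetic for int with 4 items per thread.
constexpr int items_per_thread = 4;
constexpr int bytes_per_word   = 4; // sizeof(DeviceWord) for int
constexpr int total_words      = static_cast<int>(items_per_thread * sizeof(int)) / bytes_per_word; // 4
constexpr int vector_size      = (total_words % 4 == 0) ? 4 : (total_words % 2 == 0) ? 2 : 1;       // 4
constexpr int vectors_per_thread = total_words / vector_size;                                       // 1
static_assert(vector_size == 4 && vectors_per_thread == 1, "one 16-byte vector load per thread");
```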
Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; + dst_items[i] = block_src_it[linear_tid + i * BLOCK_THREADS]; } } namespace detail { - -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], TransformOpT transform_op) + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], TransformOpT transform_op) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = transform_op(block_itr[linear_tid + ITEM * BLOCK_THREADS]); + dst_items[i] = transform_op(block_src_it[linear_tid + i * BLOCK_THREADS]); } } - } // namespace detail //! @rst @@ -348,32 +336,32 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped( //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT -//! **inferred** The random-access iterator type for input \iterator. +//! @tparam RandomAccessIterator +//! **inferred** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + +//! linear_tid for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load -//! -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) + const auto src_pos = linear_tid + i * BLOCK_THREADS; + if (src_pos < block_items_end) { - items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; + dst_items[i] = block_src_it[src_pos]; } } } @@ -395,35 +383,39 @@ LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! 
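Where blocked loads give each thread a contiguous run, `LoadDirectStriped` strides by `BLOCK_THREADS`, so adjacent threads touch adjacent addresses on every unrolled iteration and each step is a fully coalesced access. A minimal sketch (block size chosen for illustration):

```cpp
// Sketch: striped load, lane t reads d_in[t], d_in[t + 128], d_in[t + 256], ...
#include <cub/block/block_load.cuh>

__global__ void StripedLoadExample(const int* d_in)
{
  constexpr int BLOCK_THREADS    = 128;
  constexpr int ITEMS_PER_THREAD = 4;
  int thread_items[ITEMS_PER_THREAD];

  cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, thread_items);
}
```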
A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! Default value to assign out-of-bound items -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectStriped(linear_tid, block_itr, items, valid_items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); } //! @} end member group @@ -448,31 +440,31 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + const int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + const int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + const int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); + new (&dst_items[i]) T(block_src_it[warp_offset + tid + (i * CUB_PTX_WARP_THREADS)]); } } @@ -494,36 +486,37 @@ LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! 
A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + const int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + const int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + const int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + const auto src_pos = warp_offset + tid + (i * CUB_PTX_WARP_THREADS); + if (src_pos < block_items_end) { - new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); + new (&dst_items[i]) T(block_src_it[src_pos]); } } } @@ -547,42 +540,46 @@ LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! 
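The `tid`/`wid`/`warp_offset` decomposition above assigns each warp a contiguous span of `CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD` items, which the warp then reads at a warp-wide stride. Tracing the arithmetic for one hypothetical thread:

```cpp
// Sketch: the warp-striped index decomposition, assuming 32-thread warps
// and 4 items per thread (illustrative values).
constexpr int warp_threads     = 32;
constexpr int items_per_thread = 4;

// linear_tid 70 is lane 6 of warp 2; its warp owns items [256, 384).
constexpr int linear_tid  = 70;
constexpr int lane        = linear_tid & (warp_threads - 1);        // 6
constexpr int warp        = linear_tid / warp_threads;              // 2
constexpr int warp_offset = warp * warp_threads * items_per_thread; // 256

static_assert(warp_offset + lane + 0 * warp_threads == 262, "item 0 comes from index 262");
static_assert(warp_offset + lane + 3 * warp_threads == 358, "item 3 comes from index 358");
```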
Default value to assign out-of-bound items -template <typename InputT, int ITEMS_PER_THREAD, typename InputIteratorT, typename DefaultT> +template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator, typename DefaultT> _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); } //! @} end member group -//! @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a -//! linear segment of data from memory into a blocked arrangement across a CUDA thread block. +//! @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data +//! from memory into a blocked arrangement across a CUDA thread block. enum BlockLoadAlgorithm { //! @rst @@ -594,8 +591,8 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! The utilization of memory transactions (coalescing) decreases as the - //! access stride between threads increases (i.e., the number items per thread). + //! The utilization of memory transactions (coalescing) decreases as the access stride between threads increases + //! (i.e., the number of items per thread). //! @endrst BLOCK_LOAD_DIRECT, @@ -608,8 +605,7 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! The utilization of memory transactions (coalescing) doesn't depend on - //! the number of items per thread. + //! The utilization of memory transactions (coalescing) doesn't depend on the number of items per thread. //! //! @endrst BLOCK_LOAD_STRIPED, @@ -618,22 +614,20 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! A :ref:`blocked arrangement ` of data is read - //! from memory using CUDA's built-in vectorized loads as a coalescing optimization. - //! For example, ``ld.global.v4.s32`` instructions will be generated + //! A :ref:`blocked arrangement ` of data is read from memory using CUDA's built-in + //! vectorized loads as a coalescing optimization. For example, ``ld.global.v4.s32`` instructions will be generated //! when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high until the the - //! access stride between threads (i.e., the number items per thread) exceeds the - //! maximum vector load width (typically 4 items or 64B, whichever is lower). - //! - The following conditions will prevent vectorization and loading will fall - //! back to cub::BLOCK_LOAD_DIRECT: + //! - The utilization of memory transactions (coalescing) remains high until the access stride between threads + //! (i.e., the number of items per thread) exceeds the maximum vector load width (typically 4 items or 64B, whichever + //! is lower). + //! - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: //! //! - ``ITEMS_PER_THREAD`` is odd - //! - The ``InputIteratorT`` is not a simple pointer type + //! - The ``RandomAccessIterator`` is not a simple pointer type //!
- The block input offset is not quadword-aligned //! - The data type ``T`` is not a built-in primitive or CUDA vector type //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.) @@ -645,16 +639,15 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! A :ref:`striped arrangement ` of data is read efficiently from memory and then - //! locally transposed into a :ref:`blocked arrangement `. + //! A :ref:`striped arrangement ` of data is read efficiently from memory and then locally + //! transposed into a :ref:`blocked arrangement `. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high regardless - //! of items loaded per thread. - //! - The local reordering incurs slightly longer latencies and throughput than the - //! direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. + //! - The local reordering incurs slightly longer latencies and lower throughput than the direct cub::BLOCK_LOAD_DIRECT + //! and cub::BLOCK_LOAD_VECTORIZE alternatives. //! //! @endrst BLOCK_LOAD_TRANSPOSE, @@ -675,8 +668,8 @@ enum BlockLoadAlgorithm //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. - //! - The local reordering incurs slightly larger latencies than the - //! direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + //! - The local reordering incurs slightly larger latencies than the direct cub::BLOCK_LOAD_DIRECT and + //! cub::BLOCK_LOAD_VECTORIZE alternatives. //! - Provisions more shared storage, but incurs smaller latencies than the //! BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. //! @@ -687,10 +680,10 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! Like ``BLOCK_LOAD_WARP_TRANSPOSE``, a :ref:`warp-striped arrangement ` - //! of data is read directly from memory and then is locally transposed into a - //! :ref:`blocked arrangement `. To reduce the shared memory requirement, only one - //! warp's worth of shared memory is provisioned and is subsequently time-sliced among warps. + //! Like ``BLOCK_LOAD_WARP_TRANSPOSE``, a :ref:`warp-striped arrangement ` of data is read + //! directly from memory and then is locally transposed into a :ref:`blocked arrangement `. + //! To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently + //! time-sliced among warps. //! //! Usage Considerations //! ++++++++++++++++++++++++++ @@ -700,10 +693,9 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high regardless - //! of items loaded per thread. - //! - Provisions less shared memory temporary storage, but incurs larger - //! latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. + //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. + //! - Provisions less shared memory temporary storage, but incurs larger latencies than the BLOCK_LOAD_WARP_TRANSPOSE + //! alternative. //! //! @endrst BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, @@ -711,15 +703,15 @@ enum BlockLoadAlgorithm //! @rst //! The BlockLoad class provides :ref:`collective ` data movement methods for loading a linear -//!
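``BLOCK_LOAD_TRANSPOSE`` is the composition of two primitives documented earlier in this file: a coalesced striped read followed by an in-shared-memory transpose. A sketch of that two-step pattern written out by hand (sizes are illustrative):

```cpp
// Sketch: coalesced striped read, then local reorder to a blocked arrangement.
#include <cub/block/block_exchange.cuh>
#include <cub/block/block_load.cuh>

__global__ void TransposeLoadExample(const int* d_in)
{
  constexpr int BLOCK_THREADS    = 128;
  constexpr int ITEMS_PER_THREAD = 4;
  using BlockExchange = cub::BlockExchange<int, BLOCK_THREADS, ITEMS_PER_THREAD>;

  __shared__ typename BlockExchange::TempStorage temp_storage;

  int items[ITEMS_PER_THREAD];
  cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, items); // coalesced read
  BlockExchange(temp_storage).StripedToBlocked(items, items);      // local transpose
}
```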
segment of items from memory into a :ref:`blocked arrangement ` across a -//! CUDA thread block. +//! segment of items from memory into a :ref:`blocked arrangement ` across a CUDA thread +//! block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! - The BlockLoad class provides a single data movement abstraction that can be specialized -//! to implement different cub::BlockLoadAlgorithm strategies. This facilitates different -//! performance policies for different architectures, data types, granularity sizes, etc. +//! - The BlockLoad class provides a single data movement abstraction that can be specialized to implement different +//! cub::BlockLoadAlgorithm strategies. This facilitates different performance policies for different architectures, +//! data types, granularity sizes, etc. //! - BlockLoad can be optionally specialized by different data movement strategies: //! //! #. :cpp:enumerator:`cub::BLOCK_LOAD_DIRECT`: @@ -746,10 +738,9 @@ enum BlockLoadAlgorithm //! //! @blockcollective{BlockLoad} //! -//! The code snippet below illustrates the loading of a linear -//! segment of 512 integers into a "blocked" arrangement across 128 threads where each -//! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, -//! meaning memory references are efficiently coalesced using a warp-striped access +//! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement +//! across 128 threads where each thread owns 4 consecutive items. The load is specialized for +//! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ @@ -768,21 +759,20 @@ enum BlockLoadAlgorithm //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data); //! -//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. -//! The set of ``thread_data`` across the block of threads in those threads will be -//! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. +//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads +//! will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! Re-using dynamically allocated shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of -//! dynamically shared memory with BlockReduce and how to re-purpose the same memory region. -//! This example can be easily adapted to the storage required by BlockLoad. +//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically allocated shared memory +//! with BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage +//! required by BlockLoad. //! //! @endrst //! -//! @tparam InputT -//! The data type to read into (which must be convertible from the input iterator's value type). +//! @tparam T +//! The data type to read into (which must be convertible from the input iterator's value type). //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension @@ -793,20 +783,15 @@ enum BlockLoadAlgorithm //! @tparam ALGORITHM //! **[optional]** cub::BlockLoadAlgorithm tuning policy. default: ``cub::BLOCK_LOAD_DIRECT``. //! -//! @tparam WARP_TIME_SLICING -//!
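Adapting that example to BlockLoad amounts to pointing its `TempStorage` at the dynamic allocation. A sketch of the idea (hypothetical kernel; assumes the launch provides enough dynamic shared memory and that its alignment suffices for `TempStorage`):

```cpp
// Sketch: backing BlockLoad's TempStorage with dynamically allocated shared memory.
#include <cub/block/block_load.cuh>

__global__ void DynSmemLoadExample(const int* d_in)
{
  using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;

  // Launch as: DynSmemLoadExample<<<grid, 128, sizeof(typename BlockLoad::TempStorage)>>>(d_in);
  extern __shared__ char smem[];
  auto& temp_storage = reinterpret_cast<typename BlockLoad::TempStorage&>(smem[0]);

  int thread_data[4];
  BlockLoad(temp_storage).Load(d_in, thread_data);
}
```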
**[optional]** Whether or not only one warp's worth of shared memory should be -//! allocated and time-sliced among block-warps during any load-related data transpositions -//! (versus each warp having its own storage). (default: false) -//! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z -//! **[optional]** The thread block length in threads along the Z dimension (default: 1) +//! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. -template class BlockLoad { -private: - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; + static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; // total threads in the block - /// Load helper template - struct LoadInternal; + struct LoadInternal; // helper to dispatch the load algorithm - /** - * BLOCK_LOAD_DIRECT specialization of load helper - */ template struct LoadInternal { - /// Shared memory storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectBlocked(linear_tid, block_itr, items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_STRIPED specialization of load helper - */ template struct LoadInternal { - /// Shared memory 
storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectStriped(linear_tid, block_itr, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_VECTORIZE specialization of load helper - */ template struct LoadInternal { - /// Shared memory storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_ptr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) + // attempts vectorization (pointer) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const T* block_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_ptr - * The thread block's base input iterator for loading from - * - * 
@param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) + // any other iterator, no vectorization + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ + // attempts vectorization (cache modified iterator) template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(CacheModifiedInputIterator block_itr, InputT (&items)[ITEMS_PER_THREAD]) + Load(CacheModifiedInputIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_src_it.ptr, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for opaque input iterators - * (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(_InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) - { - LoadDirectBlocked(linear_tid, block_itr, items); - } - - /** - * @brief Load a linear segment of items from memory, guarded by range (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + // skips vectorization + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + // skips vectorization + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_TRANSPOSE specialization of load helper - */ template struct LoadInternal { - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; + using BlockExchange = BlockExchange; + using _TempStorage = typename 
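The overload set above is what lets ``BLOCK_LOAD_VECTORIZE`` degrade gracefully: only arguments that expose a raw pointer reach `InternalLoadDirectBlockedVectorized`, while everything else falls through to the scalar `LoadDirectBlocked`. A dispatch sketch (iterator choices are illustrative):

```cpp
// Sketch: which Load overload each argument type selects under BLOCK_LOAD_VECTORIZE.
#include <cub/block/block_load.cuh>
#include <thrust/iterator/counting_iterator.h>

__global__ void VectorizeDispatchExample(const int* d_in)
{
  using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_VECTORIZE>;
  __shared__ typename BlockLoad::TempStorage temp_storage;
  int items[4];

  // const T* argument: the pointer overload, which attempts vectorization.
  BlockLoad(temp_storage).Load(d_in, items);

  // Fancy iterator: the generic overload, i.e. the scalar LoadDirectBlocked path.
  thrust::counting_iterator<int> first(0);
  BlockLoad(temp_storage).Load(first, items);
}
```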
BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } }; - /** - * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper - */ template struct LoadInternal { - enum - { - WARP_THREADS = CUB_WARP_THREADS(0) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - static_assert(int(BLOCK_THREADS) % int(WARP_THREADS) == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; + using BlockExchange = 
BlockExchange; + using _TempStorage = typename BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } }; - /** - * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper - */ template struct LoadInternal { - enum - { - WARP_THREADS = CUB_WARP_THREADS(0) - }; + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - 
static_assert(int(BLOCK_THREADS) % int(WARP_THREADS) == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); + using BlockExchange = BlockExchange; + using _TempStorage = typename BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } }; - /// Internal load implementation to use - using InternalLoad = LoadInternal; - - /// Shared memory storage layout type + using InternalLoad = LoadInternal; // load implementation to use using 
_TempStorage = typename InternalLoad::TempStorage; - /// Internal storage allocator + // Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; public: /// @smemstorage{BlockLoad} - struct TempStorage : Uninitialized<_TempStorage> - {}; + using TempStorage = Uninitialized<_TempStorage>; //! @name Collective constructors //! @{ - /** - * @brief Collective constructor using a private static allocation of shared memory as temporary - * storage. - */ + /// @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * @brief Collective constructor using the specified memory allocation as temporary storage. - * - * @param[in] temp_storage - * Reference to memory allocation having layout type TempStorage - */ + /// @brief Collective constructor using the specified memory allocation as temporary storage. + /// @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) @@ -1448,10 +1085,9 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement + //! across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ @@ -1470,21 +1106,20 @@ public: //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads + //! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + //! @param[out] dst_items + //! Destination to load data into + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items); } //! 
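Because `TempStorage` is a trivially constructible shared-memory layout (which is what the `Uninitialized` wrapper guarantees), one allocation can be unioned across collectives that run in distinct phases, the same trick the BlockReduce example alludes to. A sketch, assuming a load phase followed by a store phase:

```cpp
// Sketch: one shared-memory allocation re-used by two collectives.
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>

__global__ void LoadThenStoreExample(const int* d_in, int* d_out)
{
  using BlockLoad  = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
  using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE>;

  __shared__ union
  {
    typename BlockLoad::TempStorage load;
    typename BlockStore::TempStorage store;
  } temp_storage;

  int items[4];
  BlockLoad(temp_storage.load).Load(d_in, items);
  __syncthreads(); // the load phase must finish before its storage is re-purposed
  BlockStore(temp_storage.store).Store(d_out, items);
}
```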
@rst @@ -1497,17 +1132,16 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the guarded loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" + //! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! - //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) + //! __global__ void ExampleKernel(int *d_data, int block_items_end, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! using BlockLoad = cub::BlockLoad; @@ -1517,32 +1151,32 @@ public: //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; - //! BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); + //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``valid_items`` is ``5``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``, with only the first two threads - //! being unmasked to load portions of valid data (and other items remaining unassigned). + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of + //! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``, + //! with only the first two threads being unmasked to load portions of valid data (and other items remaining + //! unassigned). //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load + //! @param[out] dst_items + //! Destination to load data into //! - //! @param[in] valid_items + //! @param[in] block_items_end //! Number of valid items to load - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end); } //! @rst - //! Load a linear segment of items from memory, guarded by range, with a fall-back - //! assignment of out-of-bound elements + //! Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements //! //! - @blocked //! - @smemreuse @@ -1550,17 +1184,16 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the guarded loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! 
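In grid-level code, `block_items_end` is usually derived from the global item count rather than passed in directly: each block subtracts its tile offset, and only the last block sees a value smaller than the tile size. A sketch of that pattern with the out-of-bounds default (names and sizes illustrative):

```cpp
// Sketch: computing block_items_end for a possibly partial last tile.
#include <cub/block/block_load.cuh>

__global__ void TiledLoadExample(const int* d_in, int num_items)
{
  constexpr int TILE_ITEMS = 128 * 4;
  using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
  __shared__ typename BlockLoad::TempStorage temp_storage;

  const int tile_offset     = blockIdx.x * TILE_ITEMS;
  const int block_items_end = num_items - tile_offset; // may exceed TILE_ITEMS; the guard tolerates that

  int items[4];
  BlockLoad(temp_storage).Load(d_in + tile_offset, items, block_items_end, -1);
}
```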
thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" + //! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! - //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) + //! __global__ void ExampleKernel(int *d_data, int block_items_end, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! using BlockLoad = cub::BlockLoad; @@ -1570,35 +1203,34 @@ public: //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; - //! BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); + //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` - //! ``valid_items`` is ``5``, and the out-of-bounds default is ``-1``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }``, with only the first two threads - //! being unmasked to load portions of valid data (and other items are assigned ``-1``) + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds + //! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be + //! ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }``, with only the first two threads being unmasked to load + //! portions of valid data (and other items are assigned ``-1``) //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load + //! @param[out] dst_items + //! Destination to load data into //! - //! @param[in] valid_items + //! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! Default value to assign out-of-bound items - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end, oob_default); } - //@} end member group + //! @} end member group }; template > diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index 86a81652461..29510db5e97 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -47,26 +47,26 @@ CUB_NAMESPACE_BEGIN -// Additional details of the Merge-Path Algorithm can be found in: -// S. Odeh, O. Green, Z. Mwassi, O. Shmueli, Y. 
Birk, " Merge Path - Parallel -// Merging Made Simple", Multithreaded Architectures and Applications (MTAAP) -// Workshop, IEEE 26th International Parallel & Distributed Processing -// Symposium (IPDPS), 2012 -template -_CCCL_DEVICE _CCCL_FORCEINLINE OffsetT MergePath( - KeyIteratorT keys1, KeyIteratorT keys2, OffsetT keys1_count, OffsetT keys2_count, OffsetT diag, BinaryPred binary_pred) +// This implements the DiagonalIntersection algorithm from Merge-Path. Additional details can be found in: +// * S. Odeh, O. Green, Z. Mwassi, O. Shmueli, Y. Birk, "Merge Path - Parallel Merging Made Simple", Multithreaded +// Architectures and Applications (MTAAP) Workshop, IEEE 26th International Parallel & Distributed Processing +// Symposium (IPDPS), 2012 +// * S. Odeh, O. Green, Y. Birk, "Merge Path - A Visually Intuitive Approach to Parallel Merging", 2014, URL: +// https://arxiv.org/abs/1406.2628 +template +_CCCL_DEVICE _CCCL_FORCEINLINE OffsetT +MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count, OffsetT diag, BinaryPred binary_pred) { OffsetT keys1_begin = diag < keys2_count ? 0 : diag - keys2_count; OffsetT keys1_end = (cub::min)(diag, keys1_count); while (keys1_begin < keys1_end) { - OffsetT mid = cub::MidPoint(keys1_begin, keys1_end); - KeyT key1 = keys1[mid]; - KeyT key2 = keys2[diag - 1 - mid]; - bool pred = binary_pred(key2, key1); - - if (pred) + const OffsetT mid = cub::MidPoint(keys1_begin, keys1_end); + // pull copies of the keys before calling binary_pred so proxy references are unwrapped + const detail::value_t key1 = keys1[mid]; + const detail::value_t key2 = keys2[diag - 1 - mid]; + if (binary_pred(key2, key1)) { keys1_end = mid; } @@ -78,9 +78,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT MergePath( return keys1_begin; } -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( - KeyT* keys_shared, + KeyIt keys_shared, int keys1_beg, int keys2_beg, int keys1_count, @@ -89,8 +89,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( int (&indices)[ITEMS_PER_THREAD], CompareOp compare_op) { - int keys1_end = keys1_beg + keys1_count; - int keys2_end = keys2_beg + keys2_count; + const int keys1_end = keys1_beg + keys1_count; + const int keys2_end = keys2_beg + keys2_count; KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; @@ -98,11 +98,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); - + const bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); output[item] = p ? key2 : key1; indices[item] = p ? keys2_beg++ : keys1_beg++; - if (p) { key2 = keys_shared[keys2_beg]; @@ -437,7 +435,7 @@ public: int keys1_count = keys1_end - keys1_beg; int keys2_count = keys2_end - keys2_beg; - int partition_diag = MergePath( + int partition_diag = MergePath( &temp_storage.keys_shared[keys1_beg], &temp_storage.keys_shared[keys2_beg], keys1_count, @@ -723,10 +721,9 @@ private: * `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`. * * @par Re-using dynamically allocating shared memory - * The following example under the examples/block folder illustrates usage of + * The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose - * the same memory region: - * example_block_reduce_dyn_smem.cu + * the same memory region. 
* * This example can be easily adapted to the storage required by BlockMergeSort. */ diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 964f4fbe0e7..48650992918 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -28,7 +28,7 @@ /** * @file - * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix + * The cub::BlockRadixSort class provides [collective](../index.html#sec0) methods for radix * sorting of items partitioned across a CUDA thread block. */ @@ -142,7 +142,7 @@ CUB_NAMESPACE_BEGIN //! @blockcollective{BlockRadixSort} //! //! The code snippet below illustrates a sort of 512 integer keys that -//! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads +//! are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! //! .. tab-set-code:: @@ -199,10 +199,8 @@ CUB_NAMESPACE_BEGIN //! Re-using dynamically allocating shared memory //! -------------------------------------------------- //! -//! The following example under the examples/block folder illustrates usage of -//! dynamically shared memory with BlockReduce and how to re-purpose -//! the same memory region: -//! example_block_reduce_dyn_smem.cu +//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with +//! BlockReduce and how to re-purpose the same memory region. //! //! This example can be easily adapted to the storage required by BlockRadixSort. //! @endrst @@ -986,7 +984,7 @@ public: //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys that - //! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + //! are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive keys. //! //! .. code-block:: c++ @@ -1590,7 +1588,7 @@ public: //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys and values that - //! are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 + //! are initially partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped. //! //! .. code-block:: c++ diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index a06b7c185fb..df7ab6e8143 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -1011,7 +1011,7 @@ public: //! +++++++ //! //! The code snippet below illustrates an exclusive prefix max scan of 512 integer - //! items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + //! items that are partitioned in a [blocked arrangement](../index.html#sec5sec3) //! across 128 threads where each thread owns 4 consecutive items. //! //! .. code-block:: c++ @@ -2180,7 +2180,7 @@ public: //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - //! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + //! are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ @@ -2314,7 +2314,7 @@ public: //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - //! 
are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + //! are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh index ea199e76850..f02ae6c0024 100644 --- a/cub/cub/cub.cuh +++ b/cub/cub/cub.cuh @@ -64,6 +64,7 @@ #include #include #include +#include #include #include #include diff --git a/cub/cub/detail/detect_cuda_runtime.cuh b/cub/cub/detail/detect_cuda_runtime.cuh index 44ee811192d..211e31345da 100644 --- a/cub/cub/detail/detect_cuda_runtime.cuh +++ b/cub/cub/detail/detect_cuda_runtime.cuh @@ -44,7 +44,10 @@ # pragma system_header #endif // no system header -#include +// CUDA headers might not be present when using NVRTC, see NVIDIA/cccl#2095 for details +#if !defined(_CCCL_COMPILER_NVRTC) +# include +#endif // !_CCCL_COMPILER_NVRTC #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: diff --git a/cub/cub/detail/uninitialized_copy.cuh b/cub/cub/detail/uninitialized_copy.cuh index 9a3f01e2c0a..326826c0d1a 100644 --- a/cub/cub/detail/uninitialized_copy.cuh +++ b/cub/cub/detail/uninitialized_copy.cuh @@ -58,6 +58,7 @@ template ::value, int>::type = 0> _CCCL_HOST_DEVICE void uninitialized_copy_single(T* ptr, U&& val) { + // gevtushenko: placement new should work here as well, but the code generated for copy assignment is sometimes better *ptr = ::cuda::std::forward(val); } diff --git a/cub/cub/device/device_merge.cuh b/cub/cub/device/device_merge.cuh new file mode 100644 index 00000000000..bf110f2f40f --- /dev/null +++ b/cub/cub/device/device_merge.cuh @@ -0,0 +1,197 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +CUB_NAMESPACE_BEGIN + +//! DeviceMerge provides device-wide, parallel operations for merging two sorted sequences of values (called keys) or +//! key-value pairs in device-accessible memory. The sorting order is determined by a comparison functor (default: +//! less-than), which has to establish a [strict weak ordering]. +//! +//! [strict weak ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order +struct DeviceMerge +{ + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Merges two sorted sequences of values (called keys) into a sorted output sequence. Merging is unstable, + //! which means any two equivalent values (neither value is ordered before the other) may be written to the output + //! sequence in any order. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! The code snippet below illustrates the merging of two device vectors of `int` keys. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin merge-keys + //! :end-before: example-end merge-keys + //! + //! @endrst + //! + //! @tparam KeyIteratorIn1 **[deduced]** Random access iterator to the first sorted input sequence. Must have the same + //! value type as KeyIteratorIn2. + //! 
@tparam KeyIteratorIn2 **[deduced]** Random access iterator to the second sorted input sequence. Must have the + //! same value type as KeyIteratorIn1. + //! @tparam KeyIteratorOut **[deduced]** Random access iterator to the output sequence. + //! @tparam CompareOp **[deduced]** Binary predicate to compare the input iterator's value types. Must have a + //! signature equivalent to `bool operator()(Key lhs, Key rhs)` and establish a [strict weak ordering]. + //! + //! @param[in] d_temp_storage Device-accessible allocation of temporary storage. When `nullptr`, the required + //! allocation size is written to `temp_storage_bytes` and no work is done. + //! @param[in,out] temp_storage_bytes Reference to size in bytes of `d_temp_storage` allocation. + //! @param[in] keys_in1 Iterator to the beginning of the first sorted input sequence. + //! @param[in] num_keys1 Number of keys in the first input sequence. + //! @param[in] keys_in2 Iterator to the beginning of the second sorted input sequence. + //! @param[in] num_keys2 Number of keys in the second input sequence. + //! @param[out] keys_out Iterator to the beginning of the output sequence. + //! @param[in] compare_op Comparison function object, returning true if the first argument is ordered before the + //! second. Must establish a [strict weak ordering]. + //! @param[in] stream **[optional]** CUDA stream to launch kernels into. Default is stream 0. + //! + //! [strict weak ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order + template > + CUB_RUNTIME_FUNCTION static cudaError_t MergeKeys( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorIn1 keys_in1, + int num_keys1, + KeyIteratorIn2 keys_in2, + int num_keys2, + KeyIteratorOut keys_out, + CompareOp compare_op = {}, + cudaStream_t stream = nullptr) + { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys"); + return detail::merge:: + dispatch_t:: + dispatch( + d_temp_storage, + temp_storage_bytes, + keys_in1, + nullptr, + num_keys1, + keys_in2, + nullptr, + num_keys2, + keys_out, + nullptr, + compare_op, + stream); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Merges two sorted sequences of key-value pairs into a sorted output sequence. Merging is unstable, + //! which means any two equivalent values (neither value is ordered before the other) may be written to the output + //! sequence in any order. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! The code snippet below illustrates the merging of two device vectors of `int` keys and their associated values. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin merge-pairs + //! :end-before: example-end merge-pairs + //! + //! @endrst + //! + //! @tparam KeyIteratorIn1 **[deduced]** Random access iterator to the keys of the first sorted input sequence. Must + //! have the same value type as KeyIteratorIn2. + //! @tparam ValueIteratorIn1 **[deduced]** Random access iterator to the values of the first sorted input sequence. + //! Must have the same value type as ValueIteratorIn2. + //! @tparam KeyIteratorIn2 **[deduced]** Random access iterator to the second sorted input sequence. Must have the + //! same value type as KeyIteratorIn1. + //! @tparam ValueIteratorIn2 **[deduced]** Random access iterator to the values of the second sorted input sequence. + //! Must have the same value type as ValueIteratorIn1. + //! 
@tparam KeyIteratorOut **[deduced]** Random access iterator to the keys of the output sequence. + //! @tparam ValueIteratorOut **[deduced]** Random access iterator to the values of the output sequence. + //! @tparam CompareOp **[deduced]** Binary predicate to compare the key input iterator's value types. Must have a + //! signature equivalent to `bool operator()(Key lhs, Key rhs)` and establish a [strict weak ordering]. + //! + //! @param[in] d_temp_storage Device-accessible allocation of temporary storage. When `nullptr`, the required + //! allocation size is written to `temp_storage_bytes` and no work is done. + //! @param[in,out] temp_storage_bytes Reference to size in bytes of `d_temp_storage` allocation. + //! @param[in] keys_in1 Iterator to the beginning of the keys of the first sorted input sequence. + //! @param[in] values_in1 Iterator to the beginning of the values of the first sorted input sequence. + //! @param[in] num_pairs1 Number of key-value pairs in the first input sequence. + //! @param[in] keys_in2 Iterator to the beginning of the keys of the second sorted input sequence. + //! @param[in] values_in2 Iterator to the beginning of the values of the second sorted input sequence. + //! @param[in] num_pairs2 Number of key-value pairs in the second input sequence. + //! @param[out] keys_out Iterator to the beginning of the keys of the output sequence. + //! @param[out] values_out Iterator to the beginning of the values of the output sequence. + //! @param[in] compare_op Comparison function object, returning true if the first argument is ordered before the + //! second. Must establish a [strict weak ordering]. + //! @param[in] stream **[optional]** CUDA stream to launch kernels into. Default is stream 0. + //! + //! [strict weak ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order + template > + CUB_RUNTIME_FUNCTION static cudaError_t MergePairs( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorIn1 keys_in1, + ValueIteratorIn1 values_in1, + int num_pairs1, + KeyIteratorIn2 keys_in2, + ValueIteratorIn2 values_in2, + int num_pairs2, + KeyIteratorOut keys_out, + ValueIteratorOut values_out, + CompareOp compare_op = {}, + cudaStream_t stream = nullptr) + { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs"); + return detail::merge::dispatch_t< + KeyIteratorIn1, + ValueIteratorIn1, + KeyIteratorIn2, + ValueIteratorIn2, + KeyIteratorOut, + ValueIteratorOut, + int, + CompareOp>::dispatch(d_temp_storage, + temp_storage_bytes, + keys_in1, + values_in1, + num_pairs1, + keys_in2, + values_in2, + num_pairs2, + keys_out, + values_out, + compare_op, + stream); + } +}; + +CUB_NAMESPACE_END diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index ec3a739abac..54062bd9ea3 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -1035,7 +1035,7 @@ struct DeviceReduce //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index d72dabaab2d..c8a36f0255e 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -326,7 +326,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! 
return (b < a) ? b : a; //! } @@ -471,7 +471,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -595,7 +595,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -751,7 +751,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -1078,7 +1078,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -1307,7 +1307,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -1584,7 +1584,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -1594,7 +1594,7 @@ struct DeviceScan //! struct CustomEqual //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return a == b; //! } @@ -1950,7 +1950,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -1960,7 +1960,7 @@ struct DeviceScan //! struct CustomEqual //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return a == b; //! } diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 715cfbcea0c..3113d6ca828 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -380,10 +380,10 @@ struct DeviceSelect //! { //! int compare; //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! LessThan(int compare) : compare(compare) {} //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! bool operator()(const int &a) const { //! return (a < compare); //! } @@ -534,10 +534,10 @@ struct DeviceSelect //! { //! int compare; //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! LessThan(int compare) : compare(compare) {} //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! bool operator()(const int &a) const { //! return (a < compare); //! } diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh index 93e51f2293f..6d6d1264828 100644 --- a/cub/cub/device/device_spmv.cuh +++ b/cub/cub/device/device_spmv.cuh @@ -67,7 +67,8 @@ CUB_NAMESPACE_BEGIN //! //! - ``A`` is an ``m * n`` sparse matrix whose non-zero structure is specified in //! `compressed-storage-row (CSR) format -//! 
`_ (i.e., three arrays: +//! `_ (i.e., three +//! arrays: +//! ``values``, ``row_offsets``, and ``column_indices``) //! - ``x`` and ``y`` are dense vectors //! diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh new file mode 100644 index 00000000000..2c16d851448 --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -0,0 +1,355 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +CUB_NAMESPACE_BEGIN +namespace detail +{ +namespace merge +{ +_LIBCUDACXX_INLINE_VAR constexpr int fallback_BLOCK_THREADS = 64; +_LIBCUDACXX_INLINE_VAR constexpr int fallback_ITEMS_PER_THREAD = 1; + +template +class choose_merge_agent +{ + using default_agent_t = agent_t; + using fallback_agent_t = + agent_t, Args...>; + + // Use fallback if merge agent exceeds maximum shared memory, but the fallback agent still fits + static constexpr bool use_fallback = sizeof(typename default_agent_t::TempStorage) > max_smem_per_block + && sizeof(typename fallback_agent_t::TempStorage) <= max_smem_per_block; + +public: + using type = ::cuda::std::__conditional_t; +}; + +// Computes the merge path intersections at equally wide intervals. The approach is outlined in the paper: +// Odeh et al., "Merge Path - Parallel Merging Made Simple", doi: 10.1109/IPDPSW.2012.202 +// The algorithm is the same as AgentPartition for merge sort, but that agent handles a lot more. 
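Editorial aside, not part of the patch: the comment above summarizes the diagonal search of the Merge Path algorithm. The following is a hedged, sequential host-side sketch of the same split-point computation for `int` keys under `operator<`; `merge_path` and `partition` are illustrative names, not CUB API.

```cpp
#include <algorithm>
#include <vector>

// For a diagonal `diag` (0 <= diag <= |keys1| + |keys2|), binary-search how many
// elements of keys1 lie on the merge path before that diagonal; the remaining
// diag - result elements come from keys2. Mirrors the loop in cub::MergePath.
int merge_path(const std::vector<int>& keys1, const std::vector<int>& keys2, int diag)
{
  int lo = std::max(0, diag - static_cast<int>(keys2.size()));
  int hi = std::min(diag, static_cast<int>(keys1.size()));
  while (lo < hi)
  {
    const int mid = lo + (hi - lo) / 2;
    if (keys2[diag - 1 - mid] < keys1[mid]) // binary_pred(key2, key1)
    {
      hi = mid; // taking mid elements from keys1 is already too many
    }
    else
    {
      lo = mid + 1;
    }
  }
  return lo; // number of keys1 elements preceding the diagonal
}

// Split points at equally wide intervals, one per tile boundary, mirroring
// device_partition_merge_path_kernel below (which evaluates one diagonal per thread).
std::vector<int> partition(const std::vector<int>& keys1, const std::vector<int>& keys2, int items_per_tile)
{
  const int total     = static_cast<int>(keys1.size() + keys2.size());
  const int num_tiles = (total + items_per_tile - 1) / items_per_tile;
  std::vector<int> merge_partitions(num_tiles + 1);
  for (int i = 0; i <= num_tiles; ++i)
  {
    merge_partitions[i] = merge_path(keys1, keys2, std::min(i * items_per_tile, total));
  }
  return merge_partitions;
}
```

Each consecutive pair of split points then bounds an independent merge tile of equal size, which is what lets the merge kernel below run without inter-block communication.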
+template +CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel( + KeyIt1 keys1, + Offset keys1_count, + KeyIt2 keys2, + Offset keys2_count, + Offset num_partitions, + Offset* merge_partitions, + CompareOp compare_op) +{ + // items_per_tile must be the same as in the merge kernel launched later, so we have to consider whether a fallback + // agent will be selected for the merge agent, which changes the tile size + constexpr int items_per_tile = + choose_merge_agent::type::policy::ITEMS_PER_TILE; + const Offset partition_idx = blockDim.x * blockIdx.x + threadIdx.x; + if (partition_idx < num_partitions) + { + const Offset partition_at = (cub::min)(partition_idx * items_per_tile, keys1_count + keys2_count); + merge_partitions[partition_idx] = cub::MergePath(keys1, keys2, keys1_count, keys2_count, partition_at, compare_op); + } +} + +template +__launch_bounds__( + choose_merge_agent::type::policy::BLOCK_THREADS) + CUB_DETAIL_KERNEL_ATTRIBUTES void device_merge_kernel( + KeyIt1 keys1, + ValueIt1 items1, + Offset num_keys1, + KeyIt2 keys2, + ValueIt2 items2, + Offset num_keys2, + KeyIt3 keys_result, + ValueIt3 items_result, + CompareOp compare_op, + Offset* merge_partitions, + vsmem_t global_temp_storage) +{ + // the merge agent loads keys into a local array of KeyIt1::value_type, on which the comparisons are performed + using key_t = value_t; + static_assert(::cuda::std::__invokable::value, + "Comparison operator cannot compare two keys"); + static_assert( + ::cuda::std::is_convertible::type, bool>::value, + "Comparison operator must be convertible to bool"); + + using MergeAgent = typename choose_merge_agent< + typename MaxPolicy::ActivePolicy::merge_policy, + KeyIt1, + ValueIt1, + KeyIt2, + ValueIt2, + KeyIt3, + ValueIt3, + Offset, + CompareOp>::type; + using MergePolicy = typename MergeAgent::policy; + + using THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator; + using vsmem_helper_t = vsmem_helper_impl; + __shared__ typename vsmem_helper_t::static_temp_storage_t shared_temp_storage; + auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage); + MergeAgent{ + temp_storage.Alias(), + make_load_iterator(MergePolicy{}, keys1), + make_load_iterator(MergePolicy{}, items1), + num_keys1, + make_load_iterator(MergePolicy{}, keys2), + make_load_iterator(MergePolicy{}, items2), + num_keys2, + keys_result, + items_result, + compare_op, + merge_partitions}(); + vsmem_helper_t::discard_temp_storage(temp_storage); +} + +template +struct device_merge_policy_hub +{ + static constexpr bool has_values = !::cuda::std::is_same::value; + + using tune_type = char[has_values ? 
sizeof(KeyT) + sizeof(ValueT) : sizeof(KeyT)]; + + struct policy300 : ChainedPolicy<300, policy300, policy300> + { + using merge_policy = + agent_policy_t<128, + Nominal4BItemsToItems(7), + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE>; + }; + + struct policy350 : ChainedPolicy<350, policy350, policy300> + { + using merge_policy = + agent_policy_t<256, + Nominal4BItemsToItems(11), + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE>; + }; + + struct policy520 : ChainedPolicy<520, policy520, policy350> + { + using merge_policy = + agent_policy_t<512, + Nominal4BItemsToItems(13), + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE>; + }; + + struct policy600 : ChainedPolicy<600, policy600, policy520> + { + using merge_policy = + agent_policy_t<512, + Nominal4BItemsToItems(15), + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE>; + }; + + using max_policy = policy600; +}; + +template , value_t>> +struct dispatch_t +{ + void* d_temp_storage; + std::size_t& temp_storage_bytes; + KeyIt1 d_keys1; + ValueIt1 d_values1; + Offset num_items1; + KeyIt2 d_keys2; + ValueIt2 d_values2; + Offset num_items2; + KeyIt3 d_keys_out; + ValueIt3 d_values_out; + CompareOp compare_op; + cudaStream_t stream; + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() + { + using max_policy_t = typename PolicyHub::max_policy; + using merge_policy_t = typename ActivePolicy::merge_policy; + using agent_t = + typename choose_merge_agent:: + type; + + const auto num_tiles = cub::DivideAndRoundUp(num_items1 + num_items2, agent_t::policy::ITEMS_PER_TILE); + void* allocations[2] = {nullptr, nullptr}; + { + const std::size_t merge_partitions_size = (1 + num_tiles) * sizeof(Offset); + const std::size_t virtual_shared_memory_size = num_tiles * vsmem_helper_impl::vsmem_per_block; + const std::size_t allocation_sizes[2] = {merge_partitions_size, virtual_shared_memory_size}; + const auto error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + return error; + } + } + + // Return if only temporary storage was requested or there is no work to be done + if (d_temp_storage == nullptr || num_tiles == 0) + { + return cudaSuccess; + } + + auto merge_partitions = static_cast(allocations[0]); + + // partition the merge path + { + const Offset num_partitions = num_tiles + 1; + constexpr int threads_per_partition_block = 256; // TODO(bgruber): no policy? 
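// (Editorial note, hedged): one thread of the partition kernel computes one merge-path split point.
// num_tiles + 1 points are computed, presumably so that tile i can read both its begin diagonal
// (merge_partitions[i]) and its end diagonal (merge_partitions[i + 1]); the grid computed below is
// therefore ceil((num_tiles + 1) / 256) blocks of 256 threads.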
+ const int partition_grid_size = + static_cast(cub::DivideAndRoundUp(num_partitions, threads_per_partition_block)); + + auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + partition_grid_size, threads_per_partition_block, 0, stream) + .doit(device_partition_merge_path_kernel< + max_policy_t, + KeyIt1, + ValueIt1, + KeyIt2, + ValueIt2, + KeyIt3, + ValueIt3, + Offset, + CompareOp>, + d_keys1, + num_items1, + d_keys2, + num_items2, + num_partitions, + merge_partitions, + compare_op)); + if (cudaSuccess != error) + { + return error; + } + error = CubDebug(DebugSyncStream(stream)); + if (cudaSuccess != error) + { + return error; + } + } + + // merge + if (num_tiles > 0) + { + auto vshmem_ptr = vsmem_t{allocations[1]}; + auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + static_cast(num_tiles), static_cast(agent_t::policy::BLOCK_THREADS), 0, stream) + .doit( + device_merge_kernel, + d_keys1, + d_values1, + num_items1, + d_keys2, + d_values2, + num_items2, + d_keys_out, + d_values_out, + compare_op, + merge_partitions, + vshmem_ptr)); + if (cudaSuccess != error) + { + return error; + } + error = CubDebug(DebugSyncStream(stream)); + if (cudaSuccess != error) + { + return error; + } + } + + return cudaSuccess; + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(Args&&... args) + { + int ptx_version = 0; + auto error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + return error; + } + dispatch_t dispatch{::cuda::std::forward(args)...}; + error = CubDebug(PolicyHub::max_policy::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) + { + return error; + } + + return cudaSuccess; + } +}; +} // namespace merge +} // namespace detail +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 11939b632c7..59deb2e529f 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -510,19 +510,17 @@ struct DispatchMergeSort : SelectedPolicy constexpr auto tile_size = merge_sort_helper_t::policy_t::ITEMS_PER_TILE; const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size); - const auto merge_partitions_size = static_cast(1 + num_tiles) * sizeof(OffsetT); - - const auto temporary_keys_storage_size = static_cast(num_items * sizeof(KeyT)); - + const auto merge_partitions_size = static_cast(1 + num_tiles) * sizeof(OffsetT); + const auto temporary_keys_storage_size = static_cast(num_items * sizeof(KeyT)); const auto temporary_values_storage_size = static_cast(num_items * sizeof(ValueT)) * !KEYS_ONLY; /** * Merge sort supports large types, which can lead to excessive shared memory size requirements. In these cases, * merge sort allocates virtual shared memory that resides in global memory. 
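 * (Editorial note, hedged: the lines below size this backing store as num_tiles * vsmem_per_block and
 * allocate only the maximum of the block-sort and merge requirements, since the two phases run one
 * after the other and can reuse the same global-memory region.)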
*/ - std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block; - std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block; - std::size_t virtual_shared_memory_size = (cub::max)(block_sort_smem_size, merge_smem_size); + const std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block; + const std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block; + const std::size_t virtual_shared_memory_size = (cub::max)(block_sort_smem_size, merge_smem_size); void* allocations[4] = {nullptr, nullptr, nullptr, nullptr}; std::size_t allocation_sizes[4] = { @@ -555,9 +553,9 @@ struct DispatchMergeSort : SelectedPolicy */ bool ping = num_passes % 2 == 0; - auto merge_partitions = reinterpret_cast(allocations[0]); - auto keys_buffer = reinterpret_cast(allocations[1]); - auto items_buffer = reinterpret_cast(allocations[2]); + auto merge_partitions = static_cast(allocations[0]); + auto keys_buffer = static_cast(allocations[1]); + auto items_buffer = static_cast(allocations[2]); // Invoke DeviceMergeSortBlockSortKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -617,7 +615,7 @@ struct DispatchMergeSort : SelectedPolicy for (int pass = 0; pass < num_passes; ++pass, ping = !ping) { - OffsetT target_merged_tiles_number = OffsetT(2) << pass; + const OffsetT target_merged_tiles_number = OffsetT(2) << pass; // Partition THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -706,9 +704,7 @@ struct DispatchMergeSort : SelectedPolicy { // Get PTX version int ptx_version = 0; - - error = CubDebug(PtxVersion(ptx_version)); - + error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index a572fa50561..3db8d73031b 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -102,11 +102,11 @@ enum CacheLoadModifier * @tparam MODIFIER * [inferred] CacheLoadModifier enumeration * - * @tparam InputIteratorT - * [inferred] Input iterator type \iterator + * @tparam RandomAccessIterator + * [inferred] The input's iterator type \iterator */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(InputIteratorT itr); +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(RandomAccessIterator itr); //@} end member group @@ -125,9 +125,9 @@ struct IterateThreadLoad IterateThreadLoad::template Load(ptr, vals); } - template + template CUB_DEPRECATED_BECAUSE("Use UnrolledCopy() instead") - static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(InputIteratorT itr, T* vals) + static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator itr, T* vals) { vals[COUNT] = itr[COUNT]; IterateThreadLoad::Dereference(itr, vals); @@ -142,8 +142,8 @@ struct IterateThreadLoad static _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T const* /*ptr*/, T* /*vals*/) {} - template - static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(InputIteratorT /*itr*/, T* /*vals*/) + template + static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator /*itr*/, T* /*vals*/) {} }; @@ -309,9 +309,9 @@ _CUB_LOAD_ALL(LOAD_LDG, global.nc) /** * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t -ThreadLoad(InputIteratorT itr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t +ThreadLoad(RandomAccessIterator itr, Int2Type 
/*modifier*/, Int2Type /*is_pointer*/) { return *itr; } @@ -320,7 +320,8 @@ ThreadLoad(InputIteratorT itr, Int2Type /*modifier*/, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T +ThreadLoad(const T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *ptr; } @@ -329,9 +330,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*mod * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types */ template -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type /*is_primitive*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type /*is_primitive*/) { - T retval = *reinterpret_cast(ptr); + T retval = *reinterpret_cast(ptr); return retval; } @@ -339,16 +340,15 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type /*is_primitive*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type /*is_primitive*/) { - // Word type for memcopying - using VolatileWord = typename UnitWord::VolatileWord; - + // Word type for memcpying + using VolatileWord = typename UnitWord::VolatileWord; constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); T retval; VolatileWord* words = reinterpret_cast(&retval); - UnrolledCopy(reinterpret_cast(ptr), words); + UnrolledCopy(reinterpret_cast(ptr), words); return retval; } @@ -356,9 +356,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T +ThreadLoad(const T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { - // Apply tags for partial-specialization return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); } @@ -368,25 +368,18 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*mo template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T const* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { - using DeviceWord = typename UnitWord::DeviceWord; - + using DeviceWord = typename UnitWord::DeviceWord; constexpr int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); DeviceWord words[DEVICE_MULTIPLE]; - UnrolledThreadLoad( - reinterpret_cast(const_cast(ptr)), words); - + UnrolledThreadLoad(reinterpret_cast(ptr), words); return *reinterpret_cast(words); } -/** - * ThreadLoad definition for generic modifiers - */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(InputIteratorT itr) +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(RandomAccessIterator itr) { - // Apply tags for partial-specialization - return ThreadLoad(itr, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); + return ThreadLoad(itr, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); } #endif // DOXYGEN_SHOULD_SKIP_THIS diff --git a/cub/cub/thread/thread_search.cuh b/cub/cub/thread/thread_search.cuh index d22ca7ff0a5..802d4ec96f8 100644 --- a/cub/cub/thread/thread_search.cuh +++ b/cub/cub/thread/thread_search.cuh @@ -97,6 +97,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void MergePathSearch( * @param[in] val * Search key */ +// TODO(bgruber): deprecate once ::cuda::std::lower_bound is made public template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT LowerBound(InputIteratorT input, OffsetT num_items, T val) { @@ -131,6 +132,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE 
OffsetT LowerBound(InputIteratorT input, OffsetT * @param[in] val * Search key */ +// TODO(bgruber): deprecate once ::cuda::std::upper_bound is made public template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT UpperBound(InputIteratorT input, OffsetT num_items, T val) { diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 2998608d567..e1cc4d53724 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -72,6 +72,7 @@ namespace detail * @brief Helper class template that allows overwriting the `BLOCK_THREAD` and `ITEMS_PER_THREAD` * configurations of a given policy. */ +// TODO(bgruber): this should be called something like "override_policy" template struct policy_wrapper_t : PolicyT { @@ -155,9 +156,10 @@ CUB_RUNTIME_FUNCTION inline int DeviceCountUncached() /** * \brief Cache for an arbitrary value produced by a nullary function. + * deprecated [Since 2.6.0] */ template -struct ValueCache +struct CUB_DEPRECATED ValueCache { T const value; @@ -170,13 +172,11 @@ struct ValueCache {} }; -// Host code, only safely usable in C++11 or newer, where thread-safe -// initialization of static locals is guaranteed. This is a separate function -// to avoid defining a local static in a host/device function. +// Host code. This is a separate function to avoid defining a local static in a host/device function. _CCCL_HOST inline int DeviceCountCachedValue() { - static ValueCache cache; - return cache.value; + static int count = DeviceCountUncached(); + return count; } /** @@ -211,7 +211,7 @@ struct PerDeviceAttributeCache // Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the // `DeviceEntryInitializing` state, and then proceeds to the // `DeviceEntryReady` state. These are the only state transitions allowed; - // e.g. a linear sequence of transitions. + // i.e. a linear sequence of transitions. enum DeviceEntryStatus { DeviceEntryEmpty = 0, @@ -372,7 +372,6 @@ _CCCL_HOST inline cudaError_t PtxVersionUncached(int& ptx_version, int device) template _CCCL_HOST inline PerDeviceAttributeCache& GetPerDeviceAttributeCache() { - // C++11 guarantees that initialization of static locals is thread safe. static PerDeviceAttributeCache cache; return cache; } @@ -383,17 +382,15 @@ struct SmVersionCacheTag {}; /** - * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10). + * \brief Retrieves the PTX virtual architecture that will be used on \p device (major * 100 + minor * 10). * * \note This function may cache the result internally. - * * \note This function is thread safe. */ _CCCL_HOST inline cudaError_t PtxVersion(int& ptx_version, int device) { auto const payload = GetPerDeviceAttributeCache()( - // If this call fails, then we get the error code back in the payload, - // which we check with `CubDebug` below. + // If this call fails, then we get the error code back in the payload, which we check with `CubDebug` below. [=](int& pv) { return PtxVersionUncached(pv, device); }, @@ -408,37 +405,23 @@ _CCCL_HOST inline cudaError_t PtxVersion(int& ptx_version, int device) } /** - * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). + * \brief Retrieves the PTX virtual architecture that will be used on the current device (major * 100 + minor * 10). * * \note This function may cache the result internally. - * * \note This function is thread safe. 
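 * (Editorial note, hedged: with the major * 100 + minor * 10 encoding, kernels compiled for a 9.0
 * virtual architecture would report ptx_version == 900.)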
*/ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int& ptx_version) { cudaError_t result = cudaErrorUnknown; - NV_IF_TARGET( - NV_IS_HOST, - (auto const device = CurrentDevice(); - auto const payload = GetPerDeviceAttributeCache()( - // If this call fails, then we get the error code back in the payload, - // which we check with `CubDebug` below. - [=](int& pv) { - return PtxVersionUncached(pv, device); - }, - device); - - if (!CubDebug(payload.error)) { ptx_version = payload.attribute; } - - result = payload.error;), - ( // NV_IS_DEVICE: - result = PtxVersionUncached(ptx_version);)); - + NV_IF_TARGET(NV_IS_HOST, + (result = PtxVersion(ptx_version, CurrentDevice());), + ( // NV_IS_DEVICE: + result = PtxVersionUncached(ptx_version);)); return result; } /** - * \brief Retrieves the SM version of \p device (major * 100 + minor * 10) + * \brief Retrieves the SM version (i.e. compute capability) of \p device (major * 100 + minor * 10) */ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice()) { @@ -464,10 +447,9 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int d } /** - * \brief Retrieves the SM version of \p device (major * 100 + minor * 10) + * \brief Retrieves the SM version (i.e. compute capability) of \p device (major * 100 + minor * 10). * * \note This function may cache the result internally. - * * \note This function is thread safe. */ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = CurrentDevice()) @@ -477,8 +459,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = NV_IF_TARGET( NV_IS_HOST, (auto const payload = GetPerDeviceAttributeCache()( - // If this call fails, then we get the error code back in - // the payload, which we check with `CubDebug` below. + // If this call fails, then we get the error code back in the payload, which we check with `CubDebug` below. [=](int& pv) { return SmVersionUncached(pv, device); }, @@ -565,9 +546,8 @@ CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream) CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) { has_uva = false; - cudaError_t error = cudaSuccess; int device = -1; - error = CubDebug(cudaGetDevice(&device)); + cudaError_t error = CubDebug(cudaGetDevice(&device)); if (cudaSuccess != error) { return error; diff --git a/cub/cub/util_math.cuh b/cub/cub/util_math.cuh index 454447b4b0d..e5b8444466d 100644 --- a/cub/cub/util_math.cuh +++ b/cub/cub/util_math.cuh @@ -80,6 +80,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT safe_add_bound_to_max(OffsetT lhs, O template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr NumeratorT DivideAndRoundUp(NumeratorT n, DenominatorT d) { + // TODO(bgruber): implement using ::cuda::ceil_div static_assert( cub::detail::is_integral_or_enum::value && cub::detail::is_integral_or_enum::value, "DivideAndRoundUp is only intended for integral types."); diff --git a/cub/cub/warp/warp_exchange.cuh b/cub/cub/warp/warp_exchange.cuh index 712d0a6bcd3..79f422f5abe 100644 --- a/cub/cub/warp/warp_exchange.cuh +++ b/cub/cub/warp/warp_exchange.cuh @@ -27,7 +27,7 @@ /** * @file - * The cub::WarpExchange class provides [collective](index.html#sec0) + * The cub::WarpExchange class provides [collective](../index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. 
*/ @@ -68,7 +68,7 @@ using InternalWarpExchangeImpl = } // namespace detail /** - * @brief The WarpExchange class provides [collective](index.html#sec0) + * @brief The WarpExchange class provides [collective](../index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. * * @tparam T @@ -94,10 +94,10 @@ using InternalWarpExchangeImpl = * partitioning of items across threads (where consecutive items belong to a * single thread). * - WarpExchange supports the following types of data exchanges: - * - Transposing between [blocked](index.html#sec5sec3) and - * [striped](index.html#sec5sec3) arrangements + * - Transposing between [blocked](../index.html#sec5sec3) and + * [striped](../index.html#sec5sec3) arrangements * - Scattering ranked items to a - * [striped arrangement](index.html#sec5sec3) + * [striped arrangement](../index.html#sec5sec3) * * @par A Simple Example * @par diff --git a/cub/examples/CMakeLists.txt b/cub/examples/CMakeLists.txt index 3865b6f057f..aa766336d3e 100644 --- a/cub/examples/CMakeLists.txt +++ b/cub/examples/CMakeLists.txt @@ -35,10 +35,6 @@ function(cub_add_example target_name_var example_name example_src cub_target) cub_configure_cuda_target(${example_target} RDC ${CUB_FORCE_RDC}) target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples") - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${example_target}) - endif() - # Add to the active configuration's meta target add_dependencies(${config_meta_target} ${example_target}) diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 2187c66f84c..48a0142801a 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -202,10 +202,6 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) else() target_compile_definitions(${config_c2h_target} PRIVATE C2H_HAS_CURAND=0) endif() - - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${config_c2h_target}) - endif() endif() # config_c2h_target if (CUB_SEPARATE_CATCH2) @@ -240,10 +236,6 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_link_options(${config_c2run_target} PRIVATE "-cuda") endif() - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${config_c2run_target}) - endif() - add_test(NAME ${config_c2run_target} COMMAND "$" ) @@ -265,10 +257,6 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_compile_definitions(${test_target} PRIVATE NVRTC_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}") endif() - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${test_target}) - endif() - if ("${test_target}" MATCHES "test.iterator") target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists endif() @@ -297,9 +285,6 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_link_libraries(${test_target} nvtx3-cpp) endif() - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${test_target}) - endif() _cub_is_fail_test(is_fail_test "${test_src}") if (is_fail_test) diff --git a/cub/test/catch2_test_device_merge.cu b/cub/test/catch2_test_device_merge.cu new file mode 100644 index 00000000000..abc8b1a5ce8 --- /dev/null +++ b/cub/test/catch2_test_device_merge.cu @@ -0,0 +1,463 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + +#include + +#include +#include + +#include + +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" +#include + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergePairs, merge_pairs); +DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergeKeys, merge_keys); + +// TODO(bgruber): replace the following by the CUB device API directly, once we have figured out how to handle different +// offset types +namespace detail +{ +template > +CUB_RUNTIME_FUNCTION static cudaError_t merge_keys_custom_offset_type( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorIn1 keys_in1, + Offset num_keys1, + KeyIteratorIn2 keys_in2, + Offset num_keys2, + KeyIteratorOut keys_out, + CompareOp compare_op = {}, + cudaStream_t stream = 0) +{ + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys"); + return cub::detail::merge::dispatch_t< + KeyIteratorIn1, + cub::NullType*, + KeyIteratorIn2, + cub::NullType*, + KeyIteratorOut, + cub::NullType*, + Offset, + CompareOp>::dispatch(d_temp_storage, + temp_storage_bytes, + keys_in1, + nullptr, + num_keys1, + keys_in2, + nullptr, + num_keys2, + keys_out, + nullptr, + compare_op, + stream); +} + +template > +CUB_RUNTIME_FUNCTION static cudaError_t merge_pairs_custom_offset_type( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorIn1 keys_in1, + ValueIteratorIn1 values_in1, + Offset num_pairs1, + KeyIteratorIn2 keys_in2, + ValueIteratorIn2 values_in2, + Offset num_pairs2, + KeyIteratorOut keys_out, + ValueIteratorOut values_out, + CompareOp compare_op = {}, + cudaStream_t stream = 0) +{ + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs"); + return cub::detail::merge::dispatch_t< + KeyIteratorIn1, + ValueIteratorIn1, + KeyIteratorIn2, + ValueIteratorIn2, + KeyIteratorOut, + ValueIteratorOut, + Offset, + CompareOp>::dispatch(d_temp_storage, + temp_storage_bytes, + keys_in1, + values_in1, + num_pairs1, + keys_in2, + values_in2, + num_pairs2, + keys_out, + values_out, + compare_op, + stream); +} +} // namespace detail + +DECLARE_LAUNCH_WRAPPER(detail::merge_keys_custom_offset_type, merge_keys_custom_offset_type); +DECLARE_LAUNCH_WRAPPER(detail::merge_pairs_custom_offset_type, merge_pairs_custom_offset_type); + +using types = c2h::type_list; + +// gevtushenko: there is no code path in CUB and Thrust that leads to unsigned offsets, so let's save some compile time +using offset_types = c2h::type_list; + +template , + typename MergeKeys = decltype(::merge_keys)> +void test_keys(Offset size1 = 3623, Offset size2 = 6346, CompareOp compare_op = {}, MergeKeys merge_keys = ::merge_keys) +{ + CAPTURE(c2h::type_name(), c2h::type_name(), size1, size2); + + c2h::device_vector keys1_d(size1); + c2h::device_vector keys2_d(size2); + + c2h::gen(CUB_SEED(1), keys1_d); + c2h::gen(CUB_SEED(1), keys2_d); + + thrust::sort(c2h::device_policy, keys1_d.begin(), keys1_d.end(), compare_op); + thrust::sort(c2h::device_policy, keys2_d.begin(), keys2_d.end(), compare_op); + // CAPTURE(keys1_d, keys2_d); + + c2h::device_vector result_d(size1 + size2); + merge_keys(thrust::raw_pointer_cast(keys1_d.data()), + static_cast(keys1_d.size()), + thrust::raw_pointer_cast(keys2_d.data()), + static_cast(keys2_d.size()), + thrust::raw_pointer_cast(result_d.data()), + compare_op); + + c2h::host_vector keys1_h = keys1_d; + c2h::host_vector 
keys2_h = keys2_d; + c2h::host_vector reference_h(size1 + size2); + std::merge(keys1_h.begin(), keys1_h.end(), keys2_h.begin(), keys2_h.end(), reference_h.begin(), compare_op); + + // FIXME(bgruber): comparing std::vectors (slower than thrust vectors) but compiles a lot faster + CHECK((detail::to_vec(reference_h) == detail::to_vec(c2h::host_vector(result_d)))); +} + +CUB_TEST("DeviceMerge::MergeKeys key types", "[merge][device]", types) +{ + using key_t = c2h::get<0, TestType>; + using offset_t = int; + test_keys(); +} + +using large_type_fallb = c2h::custom_type_t::type>; +using large_type_vsmem = c2h::custom_type_t::type>; + +struct fallback_test_policy_hub +{ + struct max_policy : cub::ChainedPolicy<100, max_policy, max_policy> + { + using merge_policy = cub::detail::merge:: + agent_policy_t<128, 7, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_STORE_WARP_TRANSPOSE>; + }; +}; + +// TODO(bgruber): This test alone increases compile time from 1m16s to 8m43s. What's going on? +CUB_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_list) +{ + using key_t = c2h::get<0, TestType>; + using offset_t = int; + + constexpr auto agent_sm = sizeof(key_t) * 128 * 7; + constexpr auto fallback_sm = + sizeof(key_t) * cub::detail::merge::fallback_BLOCK_THREADS * cub::detail::merge::fallback_ITEMS_PER_THREAD; + static_assert(agent_sm > cub::detail::max_smem_per_block, + "key_t is not big enough to exceed SM and trigger fallback policy"); + static_assert( + ::cuda::std::is_same::value == (fallback_sm <= cub::detail::max_smem_per_block), + "SM consumption by fallback policy should fit into max_smem_per_block"); + + test_keys( + 3623, + 6346, + ::cuda::std::less{}, + [](const key_t* k1, offset_t s1, const key_t* k2, offset_t s2, key_t* r, ::cuda::std::less co) { + using dispatch_t = cub::detail::merge::dispatch_t< + const key_t*, + const cub::NullType*, + const key_t*, + const cub::NullType*, + key_t*, + cub::NullType*, + offset_t, + ::cuda::std::less, + fallback_test_policy_hub>; // use a fixed policy for this test so the needed shared memory is deterministic + + std::size_t temp_storage_bytes = 0; + dispatch_t::dispatch( + nullptr, temp_storage_bytes, k1, nullptr, s1, k2, nullptr, s2, r, nullptr, co, cudaStream_t{0}); + + c2h::device_vector temp_storage(temp_storage_bytes); + dispatch_t::dispatch( + thrust::raw_pointer_cast(temp_storage.data()), + temp_storage_bytes, + k1, + nullptr, + s1, + k2, + nullptr, + s2, + r, + nullptr, + co, + cudaStream_t{0}); + }); +} + +CUB_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]", offset_types) +{ + using key_t = int; + using offset_t = c2h::get<0, TestType>; + test_keys(3623, 6346, ::cuda::std::less<>{}, merge_keys_custom_offset_type); +} + +CUB_TEST("DeviceMerge::MergeKeys input sizes", "[merge][device]") +{ + using key_t = int; + using offset_t = int; + // TODO(bgruber): maybe less combinations + const auto size1 = offset_t{GENERATE(0, 1, 23, 123, 3234)}; + const auto size2 = offset_t{GENERATE(0, 1, 52, 556, 56767)}; + test_keys(size1, size2); +} + +// cannot put those in an anon namespace, or nvcc complains that the kernels have internal linkage +using unordered_t = c2h::custom_type_t; +struct order +{ + _CCCL_HOST_DEVICE auto operator()(const unordered_t& a, const unordered_t& b) const -> bool + { + return a.key < b.key; + } +}; + +CUB_TEST("DeviceMerge::MergeKeys no operator<", "[merge][device]") +{ + using key_t = unordered_t; + using offset_t = int; + test_keys(); +} + +namespace +{ +template +auto zip(Its... 
its) -> decltype(thrust::make_zip_iterator(its...)) +{ + return thrust::make_zip_iterator(its...); +} + +template +struct key_to_value +{ + template + _CCCL_HOST_DEVICE auto operator()(const Key& k) const -> Value + { + Value v{}; + convert(k, v, 0); + return v; + } + + template + _CCCL_HOST_DEVICE static void convert(const Key& k, Value& v, ...) + { + v = static_cast(k); + } + + template
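Editorial aside, not part of the patch: below is a hedged usage sketch of the new `cub::DeviceMerge::MergeKeys` entry point added by this PR, following the usual two-phase CUB pattern (size query, then run). Storage via Thrust device vectors; all names are local to the example, and CompareOp and the stream keep their documented defaults (less-than, stream 0).

```cpp
#include <cub/device/device_merge.cuh>

#include <thrust/device_vector.h>

#include <cstddef>
#include <vector>

void merge_keys_example()
{
  const std::vector<int> h_keys1{0, 2, 4, 6}; // already sorted
  const std::vector<int> h_keys2{1, 3, 5};    // already sorted
  thrust::device_vector<int> keys1(h_keys1.begin(), h_keys1.end());
  thrust::device_vector<int> keys2(h_keys2.begin(), h_keys2.end());
  thrust::device_vector<int> result(keys1.size() + keys2.size());

  // First call: d_temp_storage == nullptr, so only temp_storage_bytes is written.
  std::size_t temp_storage_bytes = 0;
  cub::DeviceMerge::MergeKeys(
    nullptr,
    temp_storage_bytes,
    thrust::raw_pointer_cast(keys1.data()),
    static_cast<int>(keys1.size()),
    thrust::raw_pointer_cast(keys2.data()),
    static_cast<int>(keys2.size()),
    thrust::raw_pointer_cast(result.data()));

  // Second call: same arguments plus the allocated temporary storage.
  thrust::device_vector<char> temp_storage(temp_storage_bytes);
  cub::DeviceMerge::MergeKeys(
    thrust::raw_pointer_cast(temp_storage.data()),
    temp_storage_bytes,
    thrust::raw_pointer_cast(keys1.data()),
    static_cast<int>(keys1.size()),
    thrust::raw_pointer_cast(keys2.data()),
    static_cast<int>(keys2.size()),
    thrust::raw_pointer_cast(result.data()));
  // result now holds {0, 1, 2, 3, 4, 5, 6}
}
```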