diff --git a/.clang-format b/.clang-format index 21fd8c447ad..3cd3f6da331 100644 --- a/.clang-format +++ b/.clang-format @@ -117,12 +117,15 @@ IncludeCategories: - Regex: '^$' - Priority: 5 - SortPriority: 4 + Priority: 6 + SortPriority: 5 - Regex: '^ ### Prerequisites +- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) - [Docker](https://docs.docker.com/desktop/install/linux-install/) ### Steps diff --git a/.devcontainer/cuda11.1-gcc6/devcontainer.json b/.devcontainer/cuda11.1-gcc6/devcontainer.json index 2114e5fd8fb..ed345016ec1 100644 --- a/.devcontainer/cuda11.1-gcc6/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc6/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc6-cuda11.1", + "image": "rapidsai/devcontainers:24.10-cpp-gcc6-cuda11.1", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda11.1-gcc7/devcontainer.json b/.devcontainer/cuda11.1-gcc7/devcontainer.json index 7a9a07355fc..b1ff078547b 100644 --- a/.devcontainer/cuda11.1-gcc7/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc7/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc7-cuda11.1", + "image": "rapidsai/devcontainers:24.10-cpp-gcc7-cuda11.1", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda11.1-gcc8/devcontainer.json b/.devcontainer/cuda11.1-gcc8/devcontainer.json index 50c1bdca6b2..f480d0003a3 100644 --- a/.devcontainer/cuda11.1-gcc8/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc8/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc8-cuda11.1", + "image": "rapidsai/devcontainers:24.10-cpp-gcc8-cuda11.1", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda11.1-gcc9/devcontainer.json b/.devcontainer/cuda11.1-gcc9/devcontainer.json index f069ed0a116..a622e145191 100644 --- a/.devcontainer/cuda11.1-gcc9/devcontainer.json +++ b/.devcontainer/cuda11.1-gcc9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc9-cuda11.1", + "image": "rapidsai/devcontainers:24.10-cpp-gcc9-cuda11.1", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda11.1-llvm9/devcontainer.json b/.devcontainer/cuda11.1-llvm9/devcontainer.json index 0b95a93677f..3eaa29a8b88 100644 --- a/.devcontainer/cuda11.1-llvm9/devcontainer.json +++ b/.devcontainer/cuda11.1-llvm9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm9-cuda11.1", + "image": "rapidsai/devcontainers:24.10-cpp-llvm9-cuda11.1", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json index 20b430c4c16..4d03dc2de06 100644 --- a/.devcontainer/cuda11.8-gcc11/devcontainer.json +++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc11-cuda11.8", + "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda11.8", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-gcc10/devcontainer.json b/.devcontainer/cuda12.0-gcc10/devcontainer.json index 659f5a0320c..1371a181a9d 100644 --- a/.devcontainer/cuda12.0-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc10/devcontainer.json @@ -1,6 +1,6 @@ { 
"shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc10-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-gcc10-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-gcc11/devcontainer.json b/.devcontainer/cuda12.0-gcc11/devcontainer.json index 62a89b837dc..2096821c111 100644 --- a/.devcontainer/cuda12.0-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc11-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-gcc12/devcontainer.json b/.devcontainer/cuda12.0-gcc12/devcontainer.json index 1eb084299de..e99c8debae8 100644 --- a/.devcontainer/cuda12.0-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc12-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-gcc9/devcontainer.json b/.devcontainer/cuda12.0-gcc9/devcontainer.json index daa1ba6a92f..31548082329 100644 --- a/.devcontainer/cuda12.0-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc9-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-gcc9-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-llvm10/devcontainer.json b/.devcontainer/cuda12.0-llvm10/devcontainer.json index 8bb371e0137..b4bf89b341a 100644 --- a/.devcontainer/cuda12.0-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm10/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm10-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm10-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-llvm11/devcontainer.json b/.devcontainer/cuda12.0-llvm11/devcontainer.json index ff1f07c59b2..b87d457cb73 100644 --- a/.devcontainer/cuda12.0-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm11-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm11-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-llvm12/devcontainer.json b/.devcontainer/cuda12.0-llvm12/devcontainer.json index 3053ac9c8b9..829ec1cb2e7 100644 --- a/.devcontainer/cuda12.0-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm12-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm12-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-llvm13/devcontainer.json b/.devcontainer/cuda12.0-llvm13/devcontainer.json index 0e736940583..60abc033bef 100644 --- a/.devcontainer/cuda12.0-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm13/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm13-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm13-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git 
a/.devcontainer/cuda12.0-llvm14/devcontainer.json b/.devcontainer/cuda12.0-llvm14/devcontainer.json index 63a6eff1708..a48b0bcd0cc 100644 --- a/.devcontainer/cuda12.0-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm14/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm14-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm14-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.0-llvm9/devcontainer.json b/.devcontainer/cuda12.0-llvm9/devcontainer.json index f4eb0a86f58..465478e431d 100644 --- a/.devcontainer/cuda12.0-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.0-llvm9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm9-cuda12.0", + "image": "rapidsai/devcontainers:24.10-cpp-llvm9-cuda12.0", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.2-rapids-conda b/.devcontainer/cuda12.2-rapids-conda deleted file mode 120000 index 74d19d8f3fc..00000000000 --- a/.devcontainer/cuda12.2-rapids-conda +++ /dev/null @@ -1 +0,0 @@ -../ci/rapids/cuda12.2-conda \ No newline at end of file diff --git a/.devcontainer/cuda12.5-gcc10/devcontainer.json b/.devcontainer/cuda12.5-gcc10/devcontainer.json index 70a20f9cb90..5a59153bf39 100644 --- a/.devcontainer/cuda12.5-gcc10/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc10/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc10-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc10-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc11/devcontainer.json b/.devcontainer/cuda12.5-gcc11/devcontainer.json index 43f561a41c0..42b668abf18 100644 --- a/.devcontainer/cuda12.5-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc11-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc12/devcontainer.json b/.devcontainer/cuda12.5-gcc12/devcontainer.json index 85ff2f26c67..d807d4cd30e 100644 --- a/.devcontainer/cuda12.5-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc12-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc13/devcontainer.json b/.devcontainer/cuda12.5-gcc13/devcontainer.json index c1b2aac5185..01364fdbc23 100644 --- a/.devcontainer/cuda12.5-gcc13/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc13/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc7/devcontainer.json b/.devcontainer/cuda12.5-gcc7/devcontainer.json index ed1c7c6ea90..a6327695055 100644 --- a/.devcontainer/cuda12.5-gcc7/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc7/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc7-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc7-cuda12.5", "hostRequirements": { 
"gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc8/devcontainer.json b/.devcontainer/cuda12.5-gcc8/devcontainer.json index 5496286eb64..f0aff7ba7b1 100644 --- a/.devcontainer/cuda12.5-gcc8/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc8/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc8-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc8-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-gcc9/devcontainer.json b/.devcontainer/cuda12.5-gcc9/devcontainer.json index 84c77a2fc9b..e050d233038 100644 --- a/.devcontainer/cuda12.5-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc9-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc9-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm10/devcontainer.json b/.devcontainer/cuda12.5-llvm10/devcontainer.json index c1f68f9c1c7..0cda7b0a667 100644 --- a/.devcontainer/cuda12.5-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm10/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm10-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm10-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm11/devcontainer.json b/.devcontainer/cuda12.5-llvm11/devcontainer.json index f7a9e773152..1a513873f1d 100644 --- a/.devcontainer/cuda12.5-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm11-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm11-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm12/devcontainer.json b/.devcontainer/cuda12.5-llvm12/devcontainer.json index 4f1d2043747..a11a351e30f 100644 --- a/.devcontainer/cuda12.5-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm12-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm12-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm13/devcontainer.json b/.devcontainer/cuda12.5-llvm13/devcontainer.json index 8904b179715..0136655f0c0 100644 --- a/.devcontainer/cuda12.5-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm13/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm13-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm13-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm14/devcontainer.json b/.devcontainer/cuda12.5-llvm14/devcontainer.json index 0a01a2261c7..dd9d6a62f04 100644 --- a/.devcontainer/cuda12.5-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm14/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm14-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm14-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm15/devcontainer.json b/.devcontainer/cuda12.5-llvm15/devcontainer.json index bb7378358cc..51fd6a14660 100644 --- a/.devcontainer/cuda12.5-llvm15/devcontainer.json 
+++ b/.devcontainer/cuda12.5-llvm15/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm15-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm15-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm16/devcontainer.json b/.devcontainer/cuda12.5-llvm16/devcontainer.json index 12bf9b07d72..882025ddaf2 100644 --- a/.devcontainer/cuda12.5-llvm16/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm16/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm16-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm16-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm17/devcontainer.json b/.devcontainer/cuda12.5-llvm17/devcontainer.json index 6abd7c8292d..55fa86ff532 100644 --- a/.devcontainer/cuda12.5-llvm17/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm17/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm17-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm17-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-llvm9/devcontainer.json b/.devcontainer/cuda12.5-llvm9/devcontainer.json index e8164f36eab..3b2a328c2ea 100644 --- a/.devcontainer/cuda12.5-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.5-llvm9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-llvm9-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm9-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json index 18b8a5c317d..5e4b04e19b4 100644 --- a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json +++ b/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-oneapi2023.2.0-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-oneapi2023.2.0-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.devcontainer/cuda12.5-rapids-conda b/.devcontainer/cuda12.5-rapids-conda new file mode 120000 index 00000000000..f187cbd1366 --- /dev/null +++ b/.devcontainer/cuda12.5-rapids-conda @@ -0,0 +1 @@ +../ci/rapids/cuda12.5-conda \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index c1b2aac5185..01364fdbc23 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5", "hostRequirements": { "gpu": "optional" }, diff --git a/.github/actions/workflow-build/action.yml b/.github/actions/workflow-build/action.yml index 57a94989523..3842886589a 100644 --- a/.github/actions/workflow-build/action.yml +++ b/.github/actions/workflow-build/action.yml @@ -25,6 +25,15 @@ inputs: description: "Path to the matrix parser script (default if blank: build-workflow.py from action dir)" default: "" required: false + slack_token: + description: "The Slack token to use for notifications. No notifications will be sent if not provided." + required: false + slack_log: + description: "Slack channel ID for verbose notifications." 
+ required: false + slack_alert: + description: "Slack channel ID for alert notifications." + required: false outputs: workflow: @@ -35,6 +44,20 @@ runs: using: "composite" steps: + - name: Send Slack log notification + if: ${{inputs.slack_token != '' && inputs.slack_log != '' }} + uses: slackapi/slack-github-action@v1.26.0 + env: + SLACK_BOT_TOKEN: ${{ inputs.slack_token }} + WORKFLOW_TYPE: ${{ github.workflow }} # nightly, weekly, pr, etc. + SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + with: + channel-id: ${{ inputs.slack_log }} + slack-message: | + Workflow '${{ env.WORKFLOW_TYPE }}' starting... + + Details: ${{ env.SUMMARY_URL }} + - name: Inspect changes if: ${{ inputs.inspect_changes_script != '' && inputs.inspect_changes_base_sha != '' }} id: inspect-changes @@ -99,3 +122,18 @@ runs: name: workflow path: workflow/ compression-level: 0 + + - name: Send Slack error notification + if: ${{ failure() && inputs.slack_token != '' && (inputs.slack_alert != '' || inputs.slack_log != '') }} + uses: slackapi/slack-github-action@v1.26.0 + env: + SLACK_BOT_TOKEN: ${{ inputs.slack_token }} + WORKFLOW_TYPE: ${{ github.workflow }} # nightly, weekly, pr, etc. + SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + CHANNEL_SEP: ${{ (inputs.slack_log != '' && inputs.slack_alert != '') && ',' || ''}} + with: + channel-id: '${{ inputs.slack_log }}${{env.CHANNEL_SEP}}${{ inputs.slack_alert }}' + slack-message: | + Workflow '${{ env.WORKFLOW_TYPE }}' encountered an error while preparing to run. + + Details: ${{ env.SUMMARY_URL }} diff --git a/.github/actions/workflow-results/action.yml b/.github/actions/workflow-results/action.yml index 3f5bc92afad..f14d6d496e8 100644 --- a/.github/actions/workflow-results/action.yml +++ b/.github/actions/workflow-results/action.yml @@ -8,11 +8,20 @@ inputs: pr_number: description: "The PR number to comment on, if applicable. No comment will be made if not provided." required: false + slack_token: + description: "The Slack token to use for notifications. No notifications will be sent if not provided." + required: false + slack_log: + description: "Slack channel ID for verbose notifications." + required: false + slack_alert: + description: "Slack channel ID for alert notifications." + required: false outputs: success: description: "Whether any jobs failed." - value: ${{ steps.check-dispatch.outputs.success }} + value: ${{ steps.check-success.outputs.success }} runs: using: "composite" @@ -112,6 +121,10 @@ runs: printf "SUMMARY=%s\n" "$(cat final_summary.md | url_encode_string)" | tee -a "${GITHUB_OUTPUT}" echo "::endgroup::" + echo "::group::GHA Output: EXEC_SUMMARY" + printf "EXEC_SUMMARY=%s\n" "$(cat execution/heading.txt)" | tee -a "${GITHUB_OUTPUT}" + echo "::endgroup::" + cp final_summary.md ${GITHUB_STEP_SUMMARY} - name: Comment on PR @@ -140,7 +153,7 @@ runs: }); - name: Check for job success - id: check-dispatch + id: check-success shell: bash --noprofile --norc -euo pipefail {0} run: | echo "::group::Checking for success artifacts" @@ -162,3 +175,38 @@ runs: fi echo "success=true" >> "${GITHUB_OUTPUT}" + + - name: Send Slack log notification + if: ${{ always() && inputs.slack_token != '' && inputs.slack_log != '' }} + uses: slackapi/slack-github-action@v1.26.0 + env: + SLACK_BOT_TOKEN: ${{ inputs.slack_token }} + WORKFLOW_TYPE: ${{ github.workflow }} # nightly, weekly, pr, etc. 
+ STATUS: ${{ steps.check-success.outcome }} + EXEC_SUMMARY: ${{ steps.final-summary.outputs.EXEC_SUMMARY }} + SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + with: + channel-id: ${{ inputs.slack_log }} + slack-message: | + Workflow '${{ env.WORKFLOW_TYPE }}' has finished with status `${{ env.STATUS }}`: + + ${{ env.EXEC_SUMMARY }} + + Details: ${{ env.SUMMARY_URL }} + + - name: Send Slack alert notification + if: ${{ failure() && inputs.slack_token != '' && inputs.slack_alert != '' }} + uses: slackapi/slack-github-action@v1.26.0 + env: + SLACK_BOT_TOKEN: ${{ inputs.slack_token }} + WORKFLOW_TYPE: ${{ github.workflow }} # nightly, weekly, pr, etc. + EXEC_SUMMARY: ${{ steps.final-summary.outputs.EXEC_SUMMARY }} + SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + with: + channel-id: ${{ inputs.slack_alert }} + slack-message: | + Workflow '${{ env.WORKFLOW_TYPE }}' has failed: + + ${{ env.EXEC_SUMMARY }} + + Details: ${{ env.SUMMARY_URL }} diff --git a/.github/actions/workflow-results/parse-job-times.py b/.github/actions/workflow-results/parse-job-times.py index b90bd227a30..b30d585a0a6 100755 --- a/.github/actions/workflow-results/parse-job-times.py +++ b/.github/actions/workflow-results/parse-job-times.py @@ -120,7 +120,7 @@ def main(): for id, stats in result.items(): job_seconds = stats['job_seconds'] command_seconds = stats['command_seconds'] - overhead = (job_seconds - command_seconds) * 100 / command_seconds + overhead = (job_seconds - command_seconds) * 100 / command_seconds if command_seconds > 0 else 100 print(f"{stats['job_duration']:10} {stats['command_duration']:10} {overhead:10.0f} {stats['name']}") print("::endgroup::") diff --git a/.github/workflows/build-rapids.yml b/.github/workflows/build-rapids.yml index fe4a9697be8..2d0cfa6f761 100644 --- a/.github/workflows/build-rapids.yml +++ b/.github/workflows/build-rapids.yml @@ -2,6 +2,12 @@ name: Build all RAPIDS repositories on: workflow_call: + inputs: + enable_slack_alerts: + description: "If true, a message will be posted to the CCCL GHA CI Alert channel if the workflow fails." 
+ required: false + default: false + type: boolean jobs: check-event: @@ -16,6 +22,7 @@ jobs: run: | [[ '${{ github.event_name }}' == 'push' && '${{ github.repository }}' == 'NVIDIA/cccl' ]] || \ [[ '${{ github.event_name }}' == 'schedule' && '${{ github.repository }}' == 'NVIDIA/cccl' ]] || \ + [[ '${{ github.event_name }}' == 'workflow_dispatch' && '${{ github.repository }}' == 'NVIDIA/cccl' ]] || \ [[ '${{ github.event_name }}' == 'pull_request' && '${{ github.repository }}' != 'NVIDIA/cccl' ]] \ && echo "ok=true" | tee -a $GITHUB_OUTPUT \ || echo "ok=false" | tee -a $GITHUB_OUTPUT; @@ -29,10 +36,10 @@ jobs: fail-fast: false matrix: include: - - { cuda: '12.2', libs: 'rmm KvikIO cudf cudf_kafka cuspatial', } - - { cuda: '12.2', libs: 'rmm ucxx raft cuvs', } - - { cuda: '12.2', libs: 'rmm ucxx raft cumlprims_mg cuml', } - - { cuda: '12.2', libs: 'rmm ucxx raft cugraph-ops wholegraph cugraph' } + - { cuda: '12.5', libs: 'rmm KvikIO cudf cudf_kafka cuspatial', } + - { cuda: '12.5', libs: 'rmm ucxx raft cuvs', } + - { cuda: '12.5', libs: 'rmm ucxx raft cumlprims_mg cuml', } + - { cuda: '12.5', libs: 'rmm ucxx raft cugraph-ops wholegraph cugraph' } permissions: id-token: write contents: read @@ -54,20 +61,20 @@ jobs: CI: true RAPIDS_LIBS: ${{ matrix.libs }} # Uncomment any of these to customize the git repo and branch for a RAPIDS lib: - # RAPIDS_cmake_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cudf_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cudf_kafka_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cugraph_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cugraph_ops_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cuml_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cumlprims_mg_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cuspatial_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_cuvs_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_KvikIO_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_raft_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_rmm_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' - # RAPIDS_ucxx_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-0.39"}' - # RAPIDS_wholegraph_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.08"}' + # RAPIDS_cmake_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cudf_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cudf_kafka_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cugraph_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cugraph_ops_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cuml_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cumlprims_mg_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cuspatial_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_cuvs_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_KvikIO_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_raft_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_rmm_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-24.10"}' + # RAPIDS_ucxx_GIT_REPO: '{"upstream": "rapidsai", "tag": "branch-0.40"}' + # RAPIDS_wholegraph_GIT_REPO: '{"upstream": "rapidsai", "tag": 
"branch-24.10"}' run: | cat <<"EOF" > "$RUNNER_TEMP/ci-entrypoint.sh" #! /usr/bin/env bash @@ -156,3 +163,22 @@ jobs: --volume "$RUNNER_TEMP/ci.sh:/ci.sh" \ --volume "$RUNNER_TEMP/ci-entrypoint.sh:/ci-entrypoint.sh" \ -- /ci-entrypoint.sh ./ci/rapids/rapids-entrypoint.sh /ci.sh + + notify-failure: + name: Notify Slack of RAPIDS failure + if: ${{ failure() && inputs.enable_slack_alerts }} + needs: build-rapids + runs-on: ubuntu-latest + steps: + - name: Notify + uses: slackapi/slack-github-action@v1.26.0 + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_NOTIFIER_BOT_TOKEN }} + WORKFLOW_TYPE: ${{ github.workflow }} + SUMMARY_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + with: + channel-id: ${{ secrets.SLACK_CHANNEL_CI_ALERT }} + slack-message: | + RAPIDS build in workflow '${{ env.WORKFLOW_TYPE }}' failed. + + Details: ${{ env.SUMMARY_URL }} diff --git a/.github/workflows/ci-workflow-nightly.yml b/.github/workflows/ci-workflow-nightly.yml index c7406ef817e..fdf281b8063 100644 --- a/.github/workflows/ci-workflow-nightly.yml +++ b/.github/workflows/ci-workflow-nightly.yml @@ -21,6 +21,7 @@ defaults: shell: bash --noprofile --norc -euo pipefail {0} on: + workflow_dispatch: schedule: - cron: '0 7 * * *' # 7AM UTC, 12AM PST, 3AM EST @@ -46,6 +47,9 @@ jobs: uses: ./.github/actions/workflow-build with: workflows: nightly + slack_token: ${{ secrets.SLACK_NOTIFIER_BOT_TOKEN }} + slack_log: ${{ secrets.SLACK_CHANNEL_CI_LOG }} + slack_alert: ${{ secrets.SLACK_CHANNEL_CI_ALERT }} dispatch-groups-linux-two-stage: name: ${{ matrix.name }} @@ -128,6 +132,12 @@ jobs: - name: Check workflow success id: check-workflow uses: ./.github/actions/workflow-results + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + slack_token: ${{ secrets.SLACK_NOTIFIER_BOT_TOKEN }} + slack_log: ${{ secrets.SLACK_CHANNEL_CI_LOG }} + slack_alert: ${{ secrets.SLACK_CHANNEL_CI_ALERT }} + build-rapids: name: Build RAPIDS @@ -139,38 +149,5 @@ jobs: contents: read pull-requests: read uses: ./.github/workflows/build-rapids.yml - - # Check all other job statuses. This job gates branch protection checks. - ci: - name: CI - # !! Important: This job is used for branch protection checks. - # !! Need to use always() instead of !cancelled() because skipped jobs count as success - # !! for Github branch protection checks. Yes, really: by default, branch protections - # !! can be bypassed by cancelling CI. See NVIDIA/cccl#605. - if: ${{ always() }} - needs: - - verify-workflow - runs-on: ubuntu-latest - steps: - - name: Check results - run: | - status="success" - - check_result() { - name=$1 - expected=$2 - result=$3 - - echo "Checking if $name job result ('$result') is '$expected'..." 
- if [[ "$result" != "$expected" ]]; then - echo "$name job failed" - - status="failed" - fi - } - - check_result "verify-workflow" "success" "${{needs.verify-workflow.result}}" - - if [[ "$status" != "success" ]]; then - exit 1 - fi + with: + enable_slack_alerts: true diff --git a/benchmarks/scripts/analyze.py b/benchmarks/scripts/analyze.py index 248388b6a0b..8006637c462 100755 --- a/benchmarks/scripts/analyze.py +++ b/benchmarks/scripts/analyze.py @@ -273,7 +273,8 @@ def case_top(alpha, N, algname, ct_point_name, case_dfs): for subbench in case_dfs: case_dfs[subbench] = extract_complete_variants(case_dfs[subbench]) - print(extract_scores(case_dfs).head(N)) + with pd.option_context('display.max_rows', None): + print(extract_scores(case_dfs).head(N)) def top(args): diff --git a/ci/matrix.yaml b/ci/matrix.yaml index a7031beff97..fa2f5d92183 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -66,13 +66,13 @@ workflows: - {jobs: ['build'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14], project: ['libcudacxx']} - {jobs: ['build'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all', project: ['libcudacxx']} - {jobs: ['build'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11], project: ['libcudacxx']} - - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20], project: ['libcudacxx']} - - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang16', std: [17], project: ['libcudacxx']} + # H100 runners are currently flakey, only build since those use CPU-only runners: + - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} + - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang16', std: [17]} + - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14], project: ['cub', 'thrust']} - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all', project: ['cub', 'thrust']} - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11], project: ['cub', 'thrust']} - - {jobs: ['test'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20], project: ['cub', 'thrust']} - - {jobs: ['test'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang16', std: [17], project: ['cub', 'thrust']} # - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14] } # - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all' } # - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11] } @@ -99,7 +99,7 @@ workflows: # The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers -devcontainer_version: '24.08' +devcontainer_version: '24.10' # All supported C++ standards: all_stds: [11, 14, 17, 20] diff --git a/ci/rapids/cuda12.2-conda/devcontainer.json b/ci/rapids/cuda12.5-conda/devcontainer.json similarity index 97% rename from ci/rapids/cuda12.2-conda/devcontainer.json rename to ci/rapids/cuda12.5-conda/devcontainer.json index 9fb4d3a9086..8ec1a35d43a 100644 --- a/ci/rapids/cuda12.2-conda/devcontainer.json +++ b/ci/rapids/cuda12.5-conda/devcontainer.json @@ -1,13 +1,13 @@ { - "image": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04", + "image": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04", "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-rapids-24.08-cuda12.2-conda" + "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-rapids-24.10-cuda12.5-conda" ], 
"hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" @@ -15,7 +15,7 @@ "containerEnv": { "CI": "${localEnv:CI}", "CUDAARCHS": "70-real", - "CUDA_VERSION": "12.2", + "CUDA_VERSION": "12.5", "DEFAULT_CONDA_ENV": "rapids", "PYTHONSAFEPATH": "1", "PYTHONUNBUFFERED": "1", diff --git a/ci/update_rapids_version.sh b/ci/update_rapids_version.sh new file mode 100755 index 00000000000..d1300d9e411 --- /dev/null +++ b/ci/update_rapids_version.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. +########################## +# RAPIDS Version Updater # +########################## + +## Usage +# bash update_rapids_version.sh + +# Format is YY.MM.PP - no leading 'v' or trailing 'a' +NEXT_FULL_TAG=$1 + +#Get . for next version +NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') +NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') +NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}') +NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} +NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" + +# Need to distutils-normalize the versions for some use cases +NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") + +echo "Updating RAPIDS and devcontainers to $NEXT_FULL_TAG" + +# Inplace sed replace; workaround for Linux and Mac +function sed_runner() { + sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak +} + +# Update CI files +sed_runner "/devcontainer_version/ s/'[0-9.]*'/'${NEXT_SHORT_TAG}'/g" ci/matrix.yaml +for FILE in .github/workflows/*.yml; do + sed_runner "/rapidsai/ s/\"branch-.*\"/\"branch-${NEXT_SHORT_TAG}\"/g" "${FILE}" + sed_runner "/ucxx/ s/\"branch-.*\"/\"branch-${NEXT_UCXX_SHORT_TAG}\"/g" "${FILE}" +done + +function update_devcontainer() { + sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${1}" + sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${1}" + sed_runner "s@\${localWorkspaceFolderBasename}-rapids-[0-9.]*@\${localWorkspaceFolderBasename}-rapids-${NEXT_SHORT_TAG}@g" "${1}" +} + +# Update .devcontainer files +find .devcontainer/ ci/rapids/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do + update_devcontainer "${filename}" +done diff --git a/cub/CMakeLists.txt b/cub/CMakeLists.txt index 6e66813b7bb..91068802d63 100644 --- a/cub/CMakeLists.txt +++ b/cub/CMakeLists.txt @@ -30,7 +30,7 @@ endif() # Support adding CUB to a parent project via add_subdirectory. # See examples/cmake/add_subdir/CMakeLists.txt for details. -if (NOT CUB_TOPLEVEL_PROJECT AND NOT CUB_IN_THRUST) +if (NOT CUB_TOPLEVEL_PROJECT) include(cmake/CubAddSubdir.cmake) return() endif() @@ -51,12 +51,10 @@ mark_as_advanced(CUB_ENABLE_CPP_DIALECT_IN_NAMES) # This option is only used when CUB is built stand-alone; otherwise the Thrust # option has the same effect. -if (NOT CUB_IN_THRUST) - option(CUB_IGNORE_DEPRECATED_API - "Suppress warnings about deprecated Thrust/CUB API." - OFF - ) -endif() +option(CUB_IGNORE_DEPRECATED_API + "Suppress warnings about deprecated Thrust/CUB API." 
+ OFF +) # Check if we're actually building anything before continuing. If not, no need # to search for deps, etc. This is a common approach for packagers that just diff --git a/cub/CONTRIBUTING.md b/cub/CONTRIBUTING.md index 0b6813ea78f..4002779dcdc 100644 --- a/cub/CONTRIBUTING.md +++ b/cub/CONTRIBUTING.md @@ -17,7 +17,7 @@ changes. CUB's tests and examples can be built by configuring Thrust with the CMake option `THRUST_INCLUDE_CUB_CMAKE=ON`. This process is described in more detail in Thrust's -[CONTRIBUTING.md](https://nvidia.github.io/thrust/contributing.html). +[CONTRIBUTING.md](https://nvidia.github.io/cccl/thrust/contributing.html). The CMake options in the following section may be used to customize CUB's build process. Note that some of these are controlled by Thrust for compatibility and @@ -63,8 +63,3 @@ The configuration options for CUB are: - Enable separable compilation on all targets that are agnostic of RDC. - Targets that explicitly require RDC to be enabled or disabled will ignore this setting. - Default is `OFF`. - -# Development Model - -CUB follows the same development model as Thrust, described -[here](https://nvidia.github.io/thrust/releases/versioning.html). diff --git a/cub/cmake/CubBuildTargetList.cmake b/cub/cmake/CubBuildTargetList.cmake index 426eee5e269..f01c6244f4e 100644 --- a/cub/cmake/CubBuildTargetList.cmake +++ b/cub/cmake/CubBuildTargetList.cmake @@ -132,30 +132,15 @@ function(cub_build_target_list) # Handle dialect options: set(num_dialects_enabled 0) foreach (dialect IN LISTS CUB_CPP_DIALECT_OPTIONS) - if (CUB_IN_THRUST) - # Just use Thrust's settings: - if (THRUST_ENABLE_MULTICONFIG) - set(CUB_ENABLE_DIALECT_CPP${dialect} - ${THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect}} - ) - else() - set(val OFF) - if (dialect EQUAL ${THRUST_CPP_DIALECT}) - set(val ON) - endif() - set(CUB_ENABLE_DIALECT_CPP${dialect} ${val}) - endif() - else() - # Create CMake options: - set(default_value OFF) - if (dialect EQUAL 14) # Default to just 14 on: - set(default_value ON) - endif() - option(CUB_ENABLE_DIALECT_CPP${dialect} - "Generate C++${dialect} build configurations." - ${default_value} - ) + # Create CMake options: + set(default_value OFF) + if (dialect EQUAL 14) # Default to just 14 on: + set(default_value ON) endif() + option(CUB_ENABLE_DIALECT_CPP${dialect} + "Generate C++${dialect} build configurations." + ${default_value} + ) if (CUB_ENABLE_DIALECT_CPP${dialect}) math(EXPR num_dialects_enabled "${num_dialects_enabled} + 1") @@ -188,14 +173,8 @@ function(cub_build_target_list) # Generic config flags: macro(add_flag_option flag docstring default) set(cub_opt "CUB_${flag}") - if (CUB_IN_THRUST) - set(thrust_opt "THRUST_${flag}") - # Use thrust's settings: - set(${cub_opt} ${${thrust_opt}}) - else() - option(${cub_opt} "${docstring}" "${default}") - mark_as_advanced(${cub_opt}) - endif() + option(${cub_opt} "${docstring}" "${default}") + mark_as_advanced(${cub_opt}) endmacro() add_flag_option(IGNORE_DEPRECATED_CPP_DIALECT "Don't warn about any deprecated C++ standards and compilers." OFF) add_flag_option(IGNORE_DEPRECATED_CPP_11 "Don't warn about deprecated C++11." 
OFF) diff --git a/cub/cmake/CubHeaderTesting.cmake b/cub/cmake/CubHeaderTesting.cmake index 7cead875c08..f0ca17186ce 100644 --- a/cub/cmake/CubHeaderTesting.cmake +++ b/cub/cmake/CubHeaderTesting.cmake @@ -31,10 +31,6 @@ function(cub_add_header_test label definitions) cub_clone_target_properties(${headertest_target} ${cub_target}) cub_configure_cuda_target(${headertest_target} RDC ${CUB_FORCE_RDC}) - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${headertest_target}) - endif() - add_dependencies(cub.all.headers ${headertest_target}) add_dependencies(${config_prefix}.all ${headertest_target}) endforeach() diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh new file mode 100644 index 00000000000..adf75535172 --- /dev/null +++ b/cub/cub/agent/agent_merge.cuh @@ -0,0 +1,230 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include <cub/config.cuh> + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include <cub/agent/agent_merge_sort.cuh> +#include <cub/block/block_load.cuh> +#include <cub/block/block_merge_sort.cuh> +#include <cub/block/block_store.cuh> +#include <cub/util_namespace.cuh> +#include <cub/util_type.cuh> + +#include <thrust/system/cuda/detail/core/util.h> + +#include <cuda/std/iterator> + +CUB_NAMESPACE_BEGIN +namespace detail +{ +namespace merge +{ +template <int ThreadsPerBlock, int ItemsPerThread, BlockLoadAlgorithm LoadAlgorithm, CacheLoadModifier LoadCacheModifier, BlockStoreAlgorithm StoreAlgorithm> +struct agent_policy_t +{ + // do not change the data member names, policy_wrapper_t depends on them + static constexpr int BLOCK_THREADS = ThreadsPerBlock; + static constexpr int ITEMS_PER_THREAD = ItemsPerThread; + static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm; + static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier; + static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm; +}; + +// TODO(bgruber): can we unify this one with AgentMerge in agent_merge_sort.cuh?
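As a quick illustration of how a policy like agent_policy_t is meant to be instantiated (the numbers and the alias name below are hypothetical, chosen for illustration; they are not a shipped CCCL tuning):

#include <cub/agent/agent_merge.cuh>

// Hypothetical tuning, for illustration only: 256 threads per block,
// 11 items per thread, warp-transposed loads/stores, default cache modifier.
using example_merge_policy =
  cub::detail::merge::agent_policy_t<256,
                                     11,
                                     cub::BLOCK_LOAD_WARP_TRANSPOSE,
                                     cub::LOAD_DEFAULT,
                                     cub::BLOCK_STORE_WARP_TRANSPOSE>;

static_assert(example_merge_policy::ITEMS_PER_TILE == 256 * 11,
              "a tile is one block's worth of items: BLOCK_THREADS * ITEMS_PER_THREAD");

The dispatch machinery only ever reads the five static constants back out of the type, which is why the comment above insists the data member names must not change.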
+template <typename Policy, typename KeysIt1, typename ItemsIt1, typename KeysIt2, typename ItemsIt2, typename KeysOutputIt, typename ItemsOutputIt, typename Offset, typename CompareOp> +struct agent_t +{ + using policy = Policy; + + // key and value type are taken from the first input sequence (consistent with old Thrust behavior) + using key_type = typename ::cuda::std::iterator_traits<KeysIt1>::value_type; + using item_type = typename ::cuda::std::iterator_traits<ItemsIt1>::value_type; + + using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt1>::type; + using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt2>::type; + using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt1>::type; + using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt2>::type; + + using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type; + using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type; + using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type; + using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type; + + using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type; + using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type; + + union temp_storages + { + typename block_load_keys1::TempStorage load_keys1; + typename block_load_keys2::TempStorage load_keys2; + typename block_load_items1::TempStorage load_items1; + typename block_load_items2::TempStorage load_items2; + typename block_store_keys::TempStorage store_keys; + typename block_store_items::TempStorage store_items; + + key_type keys_shared[Policy::ITEMS_PER_TILE + 1]; + item_type items_shared[Policy::ITEMS_PER_TILE + 1]; + }; + + struct TempStorage : Uninitialized<temp_storages> + {}; + + static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD; + static constexpr int threads_per_block = Policy::BLOCK_THREADS; + static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE; + + // Per thread data + temp_storages& storage; + keys_load_it1 keys1_in; + items_load_it1 items1_in; + Offset keys1_count; + keys_load_it2 keys2_in; + items_load_it2 items2_in; + Offset keys2_count; + KeysOutputIt keys_out; + ItemsOutputIt items_out; + CompareOp compare_op; + Offset* merge_partitions; + + template <bool IsFullTile> + _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(Offset tile_idx, Offset tile_base, int num_remaining) + { + const Offset partition_beg = merge_partitions[tile_idx + 0]; + const Offset partition_end = merge_partitions[tile_idx + 1]; + + const Offset diag0 = items_per_tile * tile_idx; + const Offset diag1 = (cub::min)(keys1_count + keys2_count, diag0 + items_per_tile); + + // compute bounding box for keys1 & keys2 + const Offset keys1_beg = partition_beg; + const Offset keys1_end = partition_end; + const Offset keys2_beg = diag0 - keys1_beg; + const Offset keys2_end = diag1 - keys1_end; + + // number of keys per tile + const int num_keys1 = static_cast<int>(keys1_end - keys1_beg); + const int num_keys2 = static_cast<int>(keys2_end - keys2_beg); + + key_type keys_loc[items_per_thread]; + gmem_to_reg<IsFullTile, threads_per_block>( + keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2); + reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc); + CTA_SYNC(); + + // use binary search in shared memory to find the merge path for each thread.
+ // we can use int type here, because the number of items in shared memory is limited + const int diag0_loc = min(num_keys1 + num_keys2, items_per_thread * threadIdx.x); + + const int keys1_beg_loc = + MergePath<key_type>(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op); + const int keys1_end_loc = num_keys1; + const int keys2_beg_loc = diag0_loc - keys1_beg_loc; + const int keys2_end_loc = num_keys2; + + const int num_keys1_loc = keys1_end_loc - keys1_beg_loc; + const int num_keys2_loc = keys2_end_loc - keys2_beg_loc; + + // perform serial merge + int indices[items_per_thread]; + cub::SerialMerge( + &storage.keys_shared[0], + keys1_beg_loc, + keys2_beg_loc + num_keys1, + num_keys1_loc, + num_keys2_loc, + keys_loc, + indices, + compare_op); + CTA_SYNC(); + + // write keys + if (IsFullTile) + { + block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc); + } + else + { + block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc, num_remaining); + } + + // if items are provided, merge them + static constexpr bool have_items = !std::is_same<item_type, NullType>::value; +#ifdef _CCCL_CUDACC_BELOW_11_8 + if (have_items) // nvcc 11.1 cannot handle #pragma unroll inside if constexpr but 11.8 can. + // nvcc versions in between may work +#else + _CCCL_IF_CONSTEXPR (have_items) +#endif + { + item_type items_loc[items_per_thread]; + gmem_to_reg<IsFullTile, threads_per_block>( + items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2); + CTA_SYNC(); // block_store_keys above uses shared memory, so make sure all threads are done before we write to it + reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc); + CTA_SYNC(); + + // gather items from shared mem +#pragma unroll + for (int i = 0; i < items_per_thread; ++i) + { + items_loc[i] = storage.items_shared[indices[i]]; + } + CTA_SYNC(); + + // write from reg to gmem + if (IsFullTile) + { + block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc); + } + else + { + block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc, num_remaining); + } + } + } + + _CCCL_DEVICE _CCCL_FORCEINLINE void operator()() + { + // XXX with 8.5 changing type to Offset (or long long) results in error! + // TODO(bgruber): is the above still true?
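The indices array recorded by cub::SerialMerge above is what keeps the key/value path cheap: the keys are merged once, and the items are then gathered through the recorded source positions rather than re-running any comparisons. A minimal host-side sketch of that idea (illustrative only, with invented names; the device code above is the real implementation):

#include <vector>

// Merge two sorted runs stored back to back in `keys` (first run of length
// count1, second run after it, as in keys_shared above), recording for each
// output slot the index it was read from. keys_out and indices must be
// presized to keys.size(); items can then be moved with
// items_out[k] = items[indices[k]].
template <typename Key, typename Compare>
void merge_keys_recording_indices(const std::vector<Key>& keys,
                                  int count1,
                                  std::vector<Key>& keys_out,
                                  std::vector<int>& indices,
                                  Compare comp)
{
  const int total = static_cast<int>(keys.size());
  int i = 0;      // cursor into the first run
  int j = count1; // cursor into the second run
  for (int k = 0; k < total; ++k)
  {
    // Take from the first run unless it is exhausted or the second run's
    // head is strictly smaller; not taking on ties keeps the merge stable.
    const bool take_first = (j >= total) || (i < count1 && !comp(keys[j], keys[i]));
    indices[k] = take_first ? i++ : j++;
    keys_out[k] = keys[indices[k]];
  }
}

On the GPU the same gather runs through shared memory (items_loc[i] = storage.items_shared[indices[i]] above), so the data-dependent movement of potentially large value types happens exactly once.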
+ const int tile_idx = static_cast<int>(blockIdx.x); + const Offset tile_base = tile_idx * items_per_tile; + // TODO(bgruber): random mixing of int and Offset + const int items_in_tile = + static_cast<int>(cub::min(static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base)); + if (items_in_tile == items_per_tile) + { + consume_tile<true>(tile_idx, tile_base, items_per_tile); // full tile + } + else + { + consume_tile<false>(tile_idx, tile_base, items_in_tile); // partial tile + } + } +}; +} // namespace merge +} // namespace detail +CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index d7c0df7a302..123abb2b986 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -172,9 +172,9 @@ struct AgentBlockSort _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(OffsetT tile_base, int num_remaining) { ValueT items_local[ITEMS_PER_THREAD]; - if (!KEYS_ONLY) + _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockLoadItems(storage.load_items) .Load(items_in + tile_base, items_local, num_remaining, *(items_in + tile_base)); @@ -188,7 +188,7 @@ } KeyT keys_local[ITEMS_PER_THREAD]; - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockLoadKeys(storage.load_keys).Load(keys_in + tile_base, keys_local, num_remaining, *(keys_in + tile_base)); } @@ -199,7 +199,7 @@ CTA_SYNC(); - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockMergeSortT(storage.block_merge).Sort(keys_local, items_local, compare_op, num_remaining, keys_local[0]); } @@ -212,7 +212,7 @@ if (ping) { - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockStoreKeysIt(storage.store_keys_it).Store(keys_out_it + tile_base, keys_local, num_remaining); } @@ -221,11 +221,11 @@ BlockStoreKeysIt(storage.store_keys_it).Store(keys_out_it + tile_base, keys_local); } - if (!KEYS_ONLY) + _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { CTA_SYNC(); - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockStoreItemsIt(storage.store_items_it).Store(items_out_it + tile_base, items_local, num_remaining); } @@ -237,7 +237,7 @@ } else { - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockStoreKeysRaw(storage.store_keys_raw).Store(keys_out_raw + tile_base, keys_local, num_remaining); } @@ -246,11 +246,11 @@ BlockStoreKeysRaw(storage.store_keys_raw).Store(keys_out_raw + tile_base, keys_local); } - if (!KEYS_ONLY) + _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { CTA_SYNC(); - if (IS_LAST_TILE) + _CCCL_IF_CONSTEXPR (IS_LAST_TILE) { BlockStoreItemsRaw(storage.store_items_raw).Store(items_out_raw + tile_base, items_local, num_remaining); } @@ -316,25 +316,25 @@ struct AgentPartition _CCCL_DEVICE _CCCL_FORCEINLINE void Process() { - OffsetT merged_tiles_number = target_merged_tiles_number / 2; + const OffsetT merged_tiles_number = target_merged_tiles_number / 2; // target_merged_tiles_number is a power of two. 
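Both the per-tile bounding boxes in consume_tile above and the AgentPartition::Process logic being const-ified here revolve around the merge-path search: given a diagonal of the two-sequence merge grid, binary-search how many elements of the first sequence precede that point in the merged output. A self-contained host-side sketch under simplifying assumptions (int counts, raw pointers, invented names; cub::MergePath is the real device-side version):

#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>

// For a diagonal `diag` of the merge grid, return how many elements of keys1
// lie on the merge path before that diagonal; diag minus the result is the
// keys2 contribution.
template <typename Key, typename Compare>
int merge_path(const Key* keys1, const Key* keys2, int count1, int count2, int diag, Compare comp)
{
  int lo = std::max(0, diag - count2); // at least diag - count2 must come from keys1
  int hi = std::min(diag, count1);     // at most all of keys1, and at most diag
  while (lo < hi)
  {
    const int mid = (lo + hi) / 2; // candidate split: mid from keys1, diag - mid from keys2
    if (comp(keys1[mid], keys2[diag - 1 - mid]))
    {
      lo = mid + 1; // keys1[mid] merges before keys2[diag - 1 - mid]: move right
    }
    else
    {
      hi = mid;
    }
  }
  return lo;
}

int main()
{
  const std::vector<int> a{1, 3, 5, 7};
  const std::vector<int> b{2, 4, 6, 8};
  // Diagonal 4 splits the 8-element merge evenly: two elements (1, 3) come
  // from a and two (2, 4) come from b.
  assert(merge_path(a.data(), b.data(), 4, 4, 4, std::less<int>{}) == 2);
  return 0;
}

Because each diagonal can be searched independently, one thread (or one thread block) per diagonal suffices, which is exactly how the partition kernel parallelizes the otherwise sequential merge.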
- OffsetT mask = target_merged_tiles_number - 1; + const OffsetT mask = target_merged_tiles_number - 1; // The first tile number in the tiles group being merged, equal to: // target_merged_tiles_number * (partition_idx / target_merged_tiles_number) - OffsetT list = ~mask & partition_idx; - OffsetT start = items_per_tile * list; - OffsetT size = items_per_tile * merged_tiles_number; + const OffsetT list = ~mask & partition_idx; + const OffsetT start = items_per_tile * list; + const OffsetT size = items_per_tile * merged_tiles_number; // Tile number within the tile group being merged, equal to: // partition_idx / target_merged_tiles_number - OffsetT local_tile_idx = mask & partition_idx; + const OffsetT local_tile_idx = mask & partition_idx; - OffsetT keys1_beg = (cub::min)(keys_count, start); - OffsetT keys1_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(start, size)); - OffsetT keys2_beg = keys1_end; - OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); + const OffsetT keys1_beg = (cub::min)(keys_count, start); + const OffsetT keys1_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(start, size)); + const OffsetT keys2_beg = keys1_end; + const OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); // The last partition (which is one-past-the-last-tile) is only to mark the end of keys1_end for the merge stage if (partition_idx + 1 == num_partitions) @@ -343,30 +343,77 @@ } else { - OffsetT partition_at = (cub::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); + const OffsetT partition_at = (cub::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); OffsetT partition_diag = ping - ? MergePath<KeyT>( - keys_ping + keys1_beg, - keys_ping + keys2_beg, - keys1_end - keys1_beg, - keys2_end - keys2_beg, - partition_at, - compare_op) - : MergePath<KeyT>( - keys_pong + keys1_beg, - keys_pong + keys2_beg, - keys1_end - keys1_beg, - keys2_end - keys2_beg, - partition_at, - compare_op); + ? MergePath<KeyT>(keys_ping + keys1_beg, + keys_ping + keys2_beg, + keys1_end - keys1_beg, + keys2_end - keys2_beg, + partition_at, + compare_op) + : MergePath<KeyT>(keys_pong + keys1_beg, + keys_pong + keys2_beg, + keys1_end - keys1_beg, + keys2_end - keys2_beg, + partition_at, + compare_op); merge_partitions[partition_idx] = keys1_beg + partition_diag; } } }; +namespace detail +{ +/** + * \brief Concatenates up to ITEMS_PER_THREAD elements from input{1,2} into output array + * + * Reads data in a coalesced fashion [BLOCK_THREADS * item + tid] and + * stores the result in output[item]. + */ +template <bool IS_FULL_TILE, int BLOCK_THREADS, int ITEMS_PER_THREAD, typename T, typename It1, typename It2> +_CCCL_DEVICE _CCCL_FORCEINLINE void +gmem_to_reg(T (&output)[ITEMS_PER_THREAD], It1 input1, It2 input2, int count1, int count2) +{ + _CCCL_IF_CONSTEXPR (IS_FULL_TILE) + { +#pragma unroll + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + const int idx = BLOCK_THREADS * item + threadIdx.x; + // It1 and It2 could have different value types. Convert after load. + output[item] = (idx < count1) ? static_cast<T>(input1[idx]) : static_cast<T>(input2[idx - count1]); + } + } + else + { +#pragma unroll + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + const int idx = BLOCK_THREADS * item + threadIdx.x; + if (idx < count1 + count2) + { + output[item] = (idx < count1) ? static_cast<T>(input1[idx]) : static_cast<T>(input2[idx - count1]); + } + } + } +} + +/// \brief Stores data in a coalesced fashion in[item] -> out[BLOCK_THREADS * item + tid] +template <int BLOCK_THREADS, typename It, typename T, int ITEMS_PER_THREAD> +_CCCL_DEVICE _CCCL_FORCEINLINE void reg_to_shared(It output, T (&input)[ITEMS_PER_THREAD]) +{ +#pragma unroll + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + const int idx = BLOCK_THREADS * item + threadIdx.x; + output[idx] = input[item]; + } +} +} // namespace detail + /// \brief The agent is responsible for merging N consecutive sorted arrays into N/2 sorted arrays. template - _CCCL_DEVICE _CCCL_FORCEINLINE void - gmem_to_reg(T (&output)[ITEMS_PER_THREAD], It1 input1, It2 input2, int count1, int count2) - { - if (IS_FULL_TILE) - { -#pragma unroll - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - int idx = BLOCK_THREADS * item + threadIdx.x; - output[item] = (idx < count1) ? input1[idx] : input2[idx - count1]; - } - } - else - { -#pragma unroll - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - int idx = BLOCK_THREADS * item + threadIdx.x; - if (idx < count1 + count2) - { - output[item] = (idx < count1) ? input1[idx] : input2[idx - count1]; - } - } - } - } - - /// \brief Stores data in a coalesced fashion in[item] -> out[BLOCK_THREADS * item + tid] - template <class It, class T> - _CCCL_DEVICE _CCCL_FORCEINLINE void reg_to_shared(It output, T (&input)[ITEMS_PER_THREAD]) - { -#pragma unroll - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - int idx = BLOCK_THREADS * item + threadIdx.x; - output[idx] = input[item]; - } - } - template <bool IS_FULL_TILE> _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(int tid, OffsetT tile_idx, OffsetT tile_base, int count) { - OffsetT partition_beg = merge_partitions[tile_idx + 0]; - OffsetT partition_end = merge_partitions[tile_idx + 1]; + const OffsetT partition_beg = merge_partitions[tile_idx + 0]; + const OffsetT partition_end = merge_partitions[tile_idx + 1]; // target_merged_tiles_number is a power of two. - OffsetT merged_tiles_number = target_merged_tiles_number / 2; + const OffsetT merged_tiles_number = target_merged_tiles_number / 2; - OffsetT mask = target_merged_tiles_number - 1; + const OffsetT mask = target_merged_tiles_number - 1; // The first tile number in the tiles group being merged, equal to: // target_merged_tiles_number * (tile_idx / target_merged_tiles_number) - OffsetT list = ~mask & tile_idx; - OffsetT start = ITEMS_PER_TILE * list; - OffsetT size = ITEMS_PER_TILE * merged_tiles_number; + const OffsetT list = ~mask & tile_idx; + const OffsetT start = ITEMS_PER_TILE * list; + const OffsetT size = ITEMS_PER_TILE * merged_tiles_number; - OffsetT diag = ITEMS_PER_TILE * tile_idx - start; + const OffsetT diag = ITEMS_PER_TILE * tile_idx - start; - OffsetT keys1_beg = partition_beg - start; - OffsetT keys1_end = partition_end - start; + const OffsetT keys1_beg = partition_beg - start; + OffsetT keys1_end = partition_end - start; - OffsetT keys_end_dist_from_start = keys_count - start; - OffsetT max_keys2 = (keys_end_dist_from_start > size) ? (keys_end_dist_from_start - size) : 0; + const OffsetT keys_end_dist_from_start = keys_count - start; + const OffsetT max_keys2 = (keys_end_dist_from_start > size) ? 
(keys_end_dist_from_start - size) : 0; // We have the following invariants: // diag >= keys1_beg, because diag is the distance of the total merge path so far (keys1 + keys2) // diag+ITEMS_PER_TILE >= keys1_end, because diag+ITEMS_PER_TILE is the distance of the merge path for the next tile // and keys1_end is key1's component of that path - OffsetT keys2_beg = (cub::min)(max_keys2, diag - keys1_beg); + const OffsetT keys2_beg = (cub::min)(max_keys2, diag - keys1_beg); OffsetT keys2_end = (cub::min)(max_keys2, detail::safe_add_bound_to_max(diag, static_cast<OffsetT>(ITEMS_PER_TILE)) - keys1_end); @@ -530,32 +532,32 @@ struct AgentMerge } // number of keys per tile - // - int num_keys1 = static_cast<int>(keys1_end - keys1_beg); - int num_keys2 = static_cast<int>(keys2_end - keys2_beg); + const int num_keys1 = static_cast<int>(keys1_end - keys1_beg); + const int num_keys2 = static_cast<int>(keys2_end - keys2_beg); // load keys1 & keys2 KeyT keys_local[ITEMS_PER_THREAD]; if (ping) { - gmem_to_reg<IS_FULL_TILE>( + detail::gmem_to_reg<IS_FULL_TILE, BLOCK_THREADS>( keys_local, keys_in_ping + start + keys1_beg, keys_in_ping + start + size + keys2_beg, num_keys1, num_keys2); } else { - gmem_to_reg<IS_FULL_TILE>( + detail::gmem_to_reg<IS_FULL_TILE, BLOCK_THREADS>( keys_local, keys_in_pong + start + keys1_beg, keys_in_pong + start + size + keys2_beg, num_keys1, num_keys2); } - reg_to_shared(&storage.keys_shared[0], keys_local); + detail::reg_to_shared<BLOCK_THREADS>(&storage.keys_shared[0], keys_local); // preload items into registers already // ValueT items_local[ITEMS_PER_THREAD]; - if (!KEYS_ONLY) + (void) items_local; // TODO(bgruber): replace by [[maybe_unused]] in C++17 + _CCCL_IF_CONSTEXPR (!KEYS_ONLY) { if (ping) { - gmem_to_reg<IS_FULL_TILE>( + detail::gmem_to_reg<IS_FULL_TILE, BLOCK_THREADS>( items_local, items_in_ping + start + keys1_beg, items_in_ping + start + size + keys2_beg, @@ -564,7 +566,7 @@ } else { - gmem_to_reg<IS_FULL_TILE>( + detail::gmem_to_reg<IS_FULL_TILE, BLOCK_THREADS>( items_local, items_in_pong + start + keys1_beg, items_in_pong + start + size + keys2_beg, @@ -580,16 +582,16 @@ // we can use int type here, because the number of // items in shared memory is limited // - int diag0_local = (cub::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); + const int diag0_local = (cub::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); - int keys1_beg_local = MergePath<KeyT>( + const int keys1_beg_local = MergePath<KeyT>( &storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_local, compare_op); - int keys1_end_local = num_keys1; - int keys2_beg_local = diag0_local - keys1_beg_local; - int keys2_end_local = num_keys2; + const int keys1_end_local = num_keys1; + const int keys2_beg_local = diag0_local - keys1_beg_local; + const int keys2_end_local = num_keys2; - int num_keys1_local = keys1_end_local - keys1_beg_local; - int num_keys2_local = keys2_end_local - keys2_beg_local; + const int num_keys1_local = keys1_end_local - keys1_beg_local; + const int num_keys2_local = keys2_end_local - keys2_beg_local; // perform serial merge // @@ -608,10 +610,9 @@ CTA_SYNC(); // write keys - // if (ping) { - if (IS_FULL_TILE) + _CCCL_IF_CONSTEXPR (IS_FULL_TILE) { BlockStoreKeysPing(storage.store_keys_ping).Store(keys_out_ping + tile_base, keys_local); } @@ -622,7 +623,7 @@ } else { - if (IS_FULL_TILE) + _CCCL_IF_CONSTEXPR (IS_FULL_TILE) { BlockStoreKeysPong(storage.store_keys_pong).Store(keys_out_pong + tile_base, keys_local); } @@ -633,11 +634,16 @@ } // if items are provided, merge them - if (!KEYS_ONLY) +#ifdef _CCCL_CUDACC_BELOW_11_8 + if (!KEYS_ONLY) // nvcc 11.1 cannot handle #pragma unroll 
inside if constexpr but 11.8 can. + // nvcc versions between may work +#else + _CCCL_IF_CONSTEXPR (!KEYS_ONLY) +#endif { CTA_SYNC(); - reg_to_shared(&storage.items_shared[0], items_local); + detail::reg_to_shared(&storage.items_shared[0], items_local); CTA_SYNC(); @@ -655,7 +661,7 @@ struct AgentMerge // if (ping) { - if (IS_FULL_TILE) + _CCCL_IF_CONSTEXPR (IS_FULL_TILE) { BlockStoreItemsPing(storage.store_items_ping).Store(items_out_ping + tile_base, items_local); } @@ -666,7 +672,7 @@ struct AgentMerge } else { - if (IS_FULL_TILE) + _CCCL_IF_CONSTEXPR (IS_FULL_TILE) { BlockStoreItemsPong(storage.store_items_pong).Store(items_out_pong + tile_base, items_local); } @@ -711,11 +717,12 @@ struct AgentMerge _CCCL_DEVICE _CCCL_FORCEINLINE void Process() { - int tile_idx = static_cast(blockIdx.x); - int num_tiles = static_cast(gridDim.x); - OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE; - int tid = static_cast(threadIdx.x); - int items_in_tile = static_cast((cub::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); + const int tile_idx = static_cast(blockIdx.x); + const int num_tiles = static_cast(gridDim.x); + const OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE; + const int tid = static_cast(threadIdx.x); + const int items_in_tile = + static_cast((cub::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); if (tile_idx < num_tiles - 1) { diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index b75016ac7fa..2fb15e9059b 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -28,7 +28,7 @@ /** * @file - * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for + * The cub::BlockDiscontinuity class provides [collective](../index.html#sec0) methods for * flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. */ diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index 256c7fb4888..a781d68e68b 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file The cub::BlockExchange class provides :ref:`collective ` methods for -//! rearranging data partitioned across a CUDA thread block. +//! @file +//! The cub::BlockExchange class provides :ref:`collective ` methods for +//! rearranging data partitioned across a CUDA thread block. #pragma once @@ -55,11 +56,10 @@ CUB_NAMESPACE_BEGIN //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! - It is commonplace for blocks of threads to rearrange data items between -//! threads. For example, the device-accessible memory subsystem prefers access patterns -//! where data items are "striped" across threads (where consecutive threads access consecutive items), -//! yet most block-wide operations prefer a "blocked" partitioning of items across threads -//! (where consecutive items belong to a single thread). +//! 
- It is commonplace for blocks of threads to rearrange data items between threads. For example, the +//! device-accessible memory subsystem prefers access patterns where data items are "striped" across threads (where +//! consecutive threads access consecutive items), yet most block-wide operations prefer a "blocked" partitioning of +//! items across threads (where consecutive items belong to a single thread). //! - BlockExchange supports the following types of data exchanges: //! //! - Transposing between :ref:`blocked ` and :ref:`striped ` @@ -76,8 +76,8 @@ CUB_NAMESPACE_BEGIN //! //! @blockcollective{BlockExchange} //! -//! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement -//! of 512 integer items partitioned across 128 threads where each thread owns 4 items. +//! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement of 512 integer items +//! partitioned across 128 threads where each thread owns 4 items. //! //! .. code-block:: c++ //! @@ -98,9 +98,8 @@ CUB_NAMESPACE_BEGIN //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).StripedToBlocked(thread_data); //! -//! Suppose the set of striped input ``thread_data`` across the block of threads is -//! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }``. -//! The corresponding output ``thread_data`` in those threads will be +//! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384], +//! [1,129,257,385], ..., [127,255,383,511] }``. The corresponding output ``thread_data`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! //! Performance Considerations @@ -112,33 +111,33 @@ CUB_NAMESPACE_BEGIN //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with -//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to -//! the storage required by BlockExchange. +//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage required +//! by BlockExchange. //! @endrst //! //! @tparam T -//! The data type to be exchanged +//! The data type to be exchanged //! //! @tparam BLOCK_DIM_X -//! The thread block length in threads along the X dimension +//! The thread block length in threads along the X dimension //! //! @tparam ITEMS_PER_THREAD -//! The number of items partitioned onto each thread. +//! The number of items partitioned onto each thread. //! //! @tparam WARP_TIME_SLICING -//! **[optional]** When `true`, only use enough shared memory for a single warp's worth of tile data, -//! time-slicing the block-wide exchange over multiple synchronized rounds. -//! Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) +//! **[optional]** When `true`, only use enough shared memory for a single warp's worth of +//! tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint +//! at the expense of decreased parallelism. (Default: false) //! //! @tparam BLOCK_DIM_Y -//! **[optional]** The thread block length in threads along the Y dimension (default: 1) +//! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z -//! 
**[optional]** The thread block length in threads along the Z dimension (default: 1) +//! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH -//! [optional] Unused. -template [optional] Unused. +template class BlockExchange { -private: - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0), - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, - - TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, - TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), - WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise - // we can typically use 128b loads) - INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), - PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, - }; + static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; ///< The thread block size in threads + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static constexpr int WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS; // TODO(bgruber): use ceil_div in + // C++14 + static constexpr int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0); + + static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; + static constexpr int TIME_SLICES = WARP_TIME_SLICING ? WARPS : 1; + static constexpr int TIME_SLICED_THREADS = WARP_TIME_SLICING ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS; + static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD; + static constexpr int WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS); + static constexpr int WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD; + + // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise + // we can typically use 128b loads) + static constexpr bool INSERT_PADDING = ITEMS_PER_THREAD > 4 && PowerOfTwo::VALUE; + static constexpr int PADDING_ITEMS = INSERT_PADDING ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0; /// Shared memory storage layout type - struct __align__(16) _TempStorage + struct alignas(16) _TempStorage { - InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; + T buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; }; public: /// @smemstorage{BlockExchange} - struct TempStorage : Uninitialized<_TempStorage> - {}; + using TempStorage = Uninitialized<_TempStorage>; private: - /// Shared storage reference _TempStorage& temp_storage; - /// Linear thread-id - unsigned int linear_tid; - unsigned int lane_id; - unsigned int warp_id; - unsigned int warp_offset; + // TODO(bgruber): can we use signed int here? Only these variables are unsigned: + unsigned int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); + unsigned int lane_id = LaneId(); + unsigned int warp_id = WARPS == 1 ? 
0 : linear_tid / WARP_THREADS; + unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() @@ -205,8 +190,8 @@ private: return private_storage; } - //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. - //! Specialized for no timeslicing. + //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. Specialized for no + //! timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -215,35 +200,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Transposes data items from **blocked** arrangement to **striped** - //! arrangement. Specialized for warp-timeslicing. + //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. Specialized for + //! warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -252,51 +239,51 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
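The no-timeslicing BlockedToStriped above is a two-phase staging pattern: each thread writes its items to shared memory at `linear_tid * ITEMS_PER_THREAD + i`, the block synchronizes, and each thread reads back at `i * BLOCK_THREADS + linear_tid`. A minimal host-side sketch of that index mapping, using hypothetical sizes (4 threads, 2 items per thread) rather than anything from this diff:

```cpp
#include <cassert>
#include <vector>

int main()
{
  // Hypothetical configuration, not taken from the diff.
  constexpr int BLOCK_THREADS = 4, ITEMS_PER_THREAD = 2;
  std::vector<int> buff(BLOCK_THREADS * ITEMS_PER_THREAD);

  // Phase 1: each "thread" writes its blocked items; item i of thread tid
  // lands at buff[tid * ITEMS_PER_THREAD + i].
  for (int tid = 0; tid < BLOCK_THREADS; ++tid)
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
      buff[tid * ITEMS_PER_THREAD + i] = tid * ITEMS_PER_THREAD + i; // values 0..7, blocked

  // (CTA_SYNC() happens here on the device.)

  // Phase 2: each "thread" reads a striped arrangement from buff[i * BLOCK_THREADS + tid].
  // Thread 0 ends up with {0, 4}, thread 1 with {1, 5}, and so on.
  for (int tid = 0; tid < BLOCK_THREADS; ++tid)
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
      assert(buff[i * BLOCK_THREADS + tid] == i * BLOCK_THREADS + tid);
  return 0;
}
```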
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } @@ -304,14 +291,14 @@ private: // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. - //! Specialized for no timeslicing + //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. Specialized for no + //! timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -320,35 +307,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
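The INSERT_PADDING branches threaded through these loops skew each shared-memory offset by `offset >> LOG_SMEM_BANKS`, so that power-of-two strides stop landing in the same bank. A standalone sketch of the arithmetic, assuming 32 banks, one bank word per item, and 8 items per thread (hypothetical values; the real constants come from CUB_LOG_SMEM_BANKS):

```cpp
#include <cstdio>

int main()
{
  // Hypothetical: 32 banks (LOG_SMEM_BANKS = 5), 8 items per thread, i.e. a
  // power of two > 4, so INSERT_PADDING would be true for the class above.
  constexpr int LOG_SMEM_BANKS = 5, SMEM_BANKS = 1 << LOG_SMEM_BANKS, ITEMS_PER_THREAD = 8;

  for (int tid = 0; tid < 8; ++tid)
  {
    const int raw    = tid * ITEMS_PER_THREAD;           // item 0 of each thread, unpadded
    const int padded = raw + (raw >> LOG_SMEM_BANKS);    // the skew applied under INSERT_PADDING
    std::printf("tid %d: bank %2d (unpadded) -> bank %2d (padded)\n",
                tid, raw % SMEM_BANKS, padded % SMEM_BANKS);
  }
  // Unpadded, tids 0 and 4 both hit bank 0 (likewise 1/5, 2/6, 3/7);
  // padded, tid 4 moves to bank 1 and the conflicts disappear.
  return 0;
}
```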
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = warp_offset + i + (lane_id * ITEMS_PER_THREAD); + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. - //! Specialized for warp-timeslicing + //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. Specialized for + //! warp-timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -357,71 +346,71 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
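For the warp-striped exchanges above, each warp stays entirely within its own `warp_offset` slice of the buffer, which is why a WARP_SYNC suffices instead of a block-wide barrier. A host-side sketch of the write/read index pair, shrunk to a hypothetical 4-lane warp for readability:

```cpp
#include <cassert>
#include <vector>

int main()
{
  // Hypothetical single-warp view: 4 lanes, 2 items per lane (real warps have 32 lanes).
  constexpr int WARP_THREADS = 4, ITEMS_PER_THREAD = 2;
  const int warp_offset = 0; // first warp's slice of the buffer
  std::vector<int> buff(WARP_THREADS * ITEMS_PER_THREAD);

  // Write blocked items into the warp's slice: buff[warp_offset + i + lane * ITEMS_PER_THREAD].
  for (int lane = 0; lane < WARP_THREADS; ++lane)
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
      buff[warp_offset + i + lane * ITEMS_PER_THREAD] = lane * ITEMS_PER_THREAD + i;

  // Only WARP_SYNC(0xffffffff) is needed on the device: no other warp touches this slice.

  // Read back warp-striped: buff[warp_offset + i * WARP_THREADS + lane].
  for (int lane = 0; lane < WARP_THREADS; ++lane)
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
      assert(buff[warp_offset + i * WARP_THREADS + lane] == i * WARP_THREADS + lane);
  return 0;
}
```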
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { if (warp_id == 0) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } #pragma unroll - for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) + for (int slice = 1; slice < TIME_SLICES; ++slice) { CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } } } - //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. - //! Specialized for no timeslicing. + //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. Specialized for no + //! timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -430,36 +419,38 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
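The cost model behind the WARP_TIME_SLICING specializations: the staging buffer shrinks from a full tile to one warp's worth of items, paid for with TIME_SLICES serialized rounds separated by barriers. A small sketch of that footprint trade-off, using hypothetical launch parameters:

```cpp
#include <algorithm>
#include <cstdio>

// Footprint sketch for hypothetical parameters (128 threads, 8 items, 32-lane
// warps); mirrors the constants above but is not taken verbatim from the diff.
int main()
{
  constexpr int BLOCK_THREADS = 128, ITEMS_PER_THREAD = 8, WARP_THREADS = 32;
  constexpr int WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS;

  // WARP_TIME_SLICING == false: buffer one full tile at once.
  constexpr int full_items = BLOCK_THREADS * ITEMS_PER_THREAD;

  // WARP_TIME_SLICING == true: buffer one warp's worth, processed in WARPS rounds.
  constexpr int sliced_items = std::min(BLOCK_THREADS, WARP_THREADS) * ITEMS_PER_THREAD;

  std::printf("no slicing: %d staged items; slicing: %d staged items over %d rounds\n",
              full_items, sliced_items, WARPS); // 1024 vs. 256 over 4 rounds here
  return 0;
}
```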
template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); // No timeslicing #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. Specialized for + //! warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -468,67 +459,67 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
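Putting the striped-to-blocked path to work: a minimal kernel sketch, assuming only the public API already shown in this header's documentation snippets, that loads a tile with coalesced striped accesses and exchanges into blocked order for per-thread processing. The 128-thread, 4-items-per-thread shape matches those snippets; any other shape works the same way:

```cpp
#include <cub/block/block_exchange.cuh>

// Launch with 128 threads and 512-element buffers, e.g. <<<1, 128>>>.
__global__ void striped_load_kernel(const int* d_in, int* d_out)
{
  using BlockExchangeT = cub::BlockExchange<int, 128, 4>;
  __shared__ typename BlockExchangeT::TempStorage temp_storage;

  // Coalesced striped load: thread t reads elements t, t+128, t+256, t+384.
  int thread_data[4];
#pragma unroll
  for (int i = 0; i < 4; ++i)
    thread_data[i] = d_in[i * 128 + threadIdx.x];

  // Exchange to a blocked arrangement: thread t now owns elements 4*t .. 4*t+3.
  BlockExchangeT(temp_storage).StripedToBlocked(thread_data, thread_data);

  // Blocked per-thread work would go here; store back in blocked order.
#pragma unroll
  for (int i = 0; i < 4; ++i)
    d_out[threadIdx.x * 4 + i] = thread_data[i];
}
```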
template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { // Warp time-slicing - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Write a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } } CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. - //! Specialized for no timeslicing + //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. Specialized for no + //! timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -537,35 +528,37 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. 
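The warp-timeslicing variants repeatedly evaluate `slice_offset < strip_oob && slice_oob > strip_offset`, a half-open interval intersection that decides whether a given strip of items touches the slice currently staged in shared memory. A quick host-side enumeration of that test, with hypothetical sizes chosen so each strip hits exactly one slice:

```cpp
#include <cstdio>

int main()
{
  // Hypothetical: 8-item slices, 4 "threads", 4 items per thread (not from the diff).
  constexpr int TIME_SLICED_ITEMS = 8, BLOCK_THREADS = 4, ITEMS_PER_THREAD = 4;
  for (int slice = 0; slice < 2; ++slice)
  {
    const int slice_offset = slice * TIME_SLICED_ITEMS;
    const int slice_oob    = slice_offset + TIME_SLICED_ITEMS;
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
      const int strip_offset = i * BLOCK_THREADS;
      const int strip_oob    = strip_offset + BLOCK_THREADS;
      // Same half-open window test as in the timeslicing loops above.
      const bool hit = slice_offset < strip_oob && slice_oob > strip_offset;
      std::printf("slice %d, strip %d: [%2d,%2d) vs [%2d,%2d) -> %s\n",
                  slice, i, strip_offset, strip_oob, slice_offset, slice_oob,
                  hit ? "intersects" : "skip");
    }
  }
  // Slice 0 ([0,8)) intersects strips 0 and 1; slice 1 ([8,16)) intersects strips 2 and 3.
  return 0;
}
```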
template _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = warp_offset + i + (lane_id * ITEMS_PER_THREAD); + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(output_items + ITEM, temp_storage.buff[item_offset]); + detail::uninitialized_copy_single(output_items + i, temp_storage.buff[item_offset]); } } - //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. - //! Specialized for warp-timeslicing + //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. Specialized for + //! warp-timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -574,44 +567,43 @@ private: //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + for (int slice = 0; slice < TIME_SLICES; ++slice) { CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) + int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } WARP_SYNC(0xffffffff); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) + int item_offset = i + lane_id * ITEMS_PER_THREAD; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } } } - //! @brief Exchanges data items annotated by rank into **blocked** arrangement. - //! Specialized for no timeslicing. + //! 
@brief Exchanges data items annotated by rank into **blocked** arrangement. Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -623,38 +615,37 @@ private: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = linear_tid * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Exchanges data items annotated by rank into **blocked** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Exchanges data items annotated by rank into **blocked** arrangement. Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -666,61 +657,60 @@ private: //! 
Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { CTA_SYNC(); - const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; + const int slice_offset = TIME_SLICED_ITEMS * slice; #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + int item_offset = ranks[i] - slice_offset; + if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); - if (warp_id == SLICE) + if (warp_id == slice) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) + int item_offset = lane_id * ITEMS_PER_THREAD + i; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } - //! @brief Exchanges data items annotated by rank into **striped** arrangement. - //! Specialized for no timeslicing. + //! @brief Exchanges data items annotated by rank into **striped** arrangement. Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -732,38 +722,37 @@ private: //! 
Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } - //! @brief Exchanges data items annotated by rank into **striped** arrangement. - //! Specialized for warp-timeslicing. + //! @brief Exchanges data items annotated by rank into **striped** arrangement. Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. @@ -775,54 +764,54 @@ private: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { - InputT temp_items[ITEMS_PER_THREAD]; + T temp_items[ITEMS_PER_THREAD]; #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + for (int slice = 0; slice < TIME_SLICES; slice++) { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + const int slice_offset = slice * TIME_SLICED_ITEMS; + const int slice_oob = slice_offset + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + int item_offset = ranks[i] - slice_offset; + if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[ITEM]); + detail::uninitialized_copy_single(temp_storage.buff + item_offset, input_items[i]); } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + const int strip_offset = i * BLOCK_THREADS; + const int strip_oob = strip_offset + BLOCK_THREADS; - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + if (slice_offset < strip_oob && slice_oob > strip_offset) { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if 
((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + int item_offset = strip_offset + linear_tid - slice_offset; + if (item_offset >= 0 && item_offset < TIME_SLICED_ITEMS) { - if (INSERT_PADDING) + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } - temp_items[ITEM] = temp_storage.buff[item_offset]; + temp_items[i] = temp_storage.buff[item_offset]; } } } @@ -830,9 +819,9 @@ private: // Copy #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - output_items[ITEM] = temp_items[ITEM]; + output_items[i] = temp_items[i]; } } @@ -840,29 +829,15 @@ public: //! @name Collective constructors //! @{ - /** - * @brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ + //! @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockExchange() : temp_storage(PrivateStorage()) - , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - , lane_id(LaneId()) - , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} - /** - * @brief Collective constructor using the specified memory allocation as temporary storage. - * - * @param[in] temp_storage - * Reference to memory allocation having layout type TempStorage - */ + //! @brief Collective constructor using the specified memory allocation as temporary storage. + //! @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE BlockExchange(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) - , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - , lane_id(LaneId()) - , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) - , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} //! @} end member group @@ -899,10 +874,9 @@ public: //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); //! - //! Suppose the set of striped input ``thread_data`` across the block of threads is - //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` after loading from - //! device-accessible memory. The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. + //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384], + //! [1,129,257,385], ..., [127,255,383,511] }`` after loading from device-accessible memory. The corresponding output + //! ``thread_data`` in those threads will be ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! @endrst //! //! @param[in] input_items @@ -912,7 +886,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + StripedToBlocked(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { StripedToBlocked(input_items, output_items, Int2Type()); } @@ -950,11 +924,10 @@ public: //! // Store data striped across block threads into an ordered tile //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); //! - //! Suppose the set of blocked input ``thread_data`` across the block of threads is - //! 
``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` in - //! preparation for storing to device-accessible memory. + //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7], + //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be + //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` in preparation for storing to device-accessible + //! memory. //! @endrst //! //! @param[in] input_items @@ -964,7 +937,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + BlockedToStriped(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToStriped(input_items, output_items, Int2Type()); } @@ -1001,12 +974,11 @@ public: //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); //! - //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is - //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` - //! after loading from device-accessible memory. (The first 128 items are striped across - //! the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. + //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is ``{ [0,32,64,96], + //! [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` after loading from device-accessible memory. (The first 128 + //! items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, + //! etc.) The corresponding output ``thread_data`` in those threads will be ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], + //! ..., [508,509,510,511] }``. //! @endrst //! //! @param[in] input_items @@ -1016,7 +988,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + WarpStripedToBlocked(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(input_items, output_items, Int2Type()); } @@ -1056,12 +1028,11 @@ public: //! // Store data striped across warp threads into an ordered tile //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); //! - //! Suppose the set of blocked input ``thread_data`` across the block of threads is - //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. - //! The corresponding output ``thread_data`` in those threads will be - //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` - //! in preparation for storing to device-accessible memory. (The first 128 items are striped - //! across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + //! 
Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7], + //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be + //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` in preparation for storing to + //! device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 + //! items are striped across the second warp, etc.) //! @endrst //! //! @param[in] input_items @@ -1071,7 +1042,7 @@ public: //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void - BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) + BlockedToWarpStriped(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(input_items, output_items, Int2Type()); } @@ -1099,7 +1070,7 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { @@ -1126,7 +1097,7 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { @@ -1153,35 +1124,35 @@ public: //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedGuarded( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - if (ranks[ITEM] >= 0) + if (ranks[i] >= 0) { - temp_storage.buff[item_offset] = input_items[ITEM]; + temp_storage.buff[item_offset] = input_items[i]; } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } @@ -1211,36 +1182,36 @@ public: //! 
Corresponding flag denoting item validity template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged( - InputT (&input_items)[ITEMS_PER_THREAD], + const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) + int item_offset = ranks[i]; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - if (is_valid[ITEM]) + if (is_valid[i]) { - temp_storage.buff[item_offset] = input_items[ITEM]; + temp_storage.buff[item_offset] = input_items[i]; } } CTA_SYNC(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) + int item_offset = i * BLOCK_THREADS + linear_tid; + _CCCL_IF_CONSTEXPR (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } - output_items[ITEM] = temp_storage.buff[item_offset]; + output_items[i] = temp_storage.buff[item_offset]; } } @@ -1248,97 +1219,75 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(T (&items)[ITEMS_PER_THREAD]) { StripedToBlocked(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(T (&items)[ITEMS_PER_THREAD]) { BlockedToStriped(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(T (&items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - */ - _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(InputT (&items)[ITEMS_PER_THREAD]) + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(T (&items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(items, items); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. 
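ScatterToStripedGuarded and ScatterToStripedFlagged differ from the plain rank-based scatter only in skipping the shared-memory write for invalid items (a negative rank, or a false flag). A host-side sketch of the flagged write phase, with hypothetical data; note that the skipped item leaves its destination slot untouched:

```cpp
#include <cstdio>
#include <vector>

int main()
{
  // Hypothetical: 4 items, one per "thread"; not taken from the diff.
  constexpr int N = 4;
  const int  items[N]    = {10, 20, 30, 40};
  const int  ranks[N]    = {1, 0, 2, 2};                  // item 3 would collide with item 2
  const bool is_valid[N] = {true, true, true, false};     // so it is flagged invalid

  std::vector<int> buff(N, -1);
  for (int i = 0; i < N; ++i)
    if (is_valid[i])             // the guarded variant instead tests ranks[i] >= 0
      buff[ranks[i]] = items[i]; // scatter phase (CTA_SYNC follows on the device)

  for (int v : buff)
    std::printf("%d ", v); // prints: 20 10 30 -1
  std::printf("\n");
  return 0;
}
```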
+ /// + /// @param[in] ranks + /// Corresponding scatter ranks template - _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToBlocked(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToBlocked(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks template - _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void - ScatterToStripedGuarded(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) + ScatterToStripedGuarded(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStripedGuarded(items, items, ranks); } - /** - * @param[in-out] items - * Items to exchange, converting between **striped** and **blocked** arrangements. - * - * @param[in] ranks - * Corresponding scatter ranks - * - * @param[in] is_valid - * Corresponding flag denoting item validity - */ + /// @param[in-out] items + /// Items to exchange, converting between **striped** and **blocked** arrangements. + /// @param[in] ranks + /// Corresponding scatter ranks + /// @param[in] is_valid + /// Corresponding flag denoting item validity template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged( - InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) + T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks, is_valid); } diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh index 3553ec79da6..d5726f240f6 100644 --- a/cub/cub/block/block_histogram.cuh +++ b/cub/cub/block/block_histogram.cuh @@ -28,7 +28,7 @@ /** * @file - * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for + * The cub::BlockHistogram class provides [collective](../index.html#sec0) methods for * constructing block-wide histograms from data samples partitioned across a CUDA thread block. */ diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 87adeb54515..76c073f1b54 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -26,7 +26,7 @@ * ******************************************************************************/ -//! @file Operations for reading linear tiles of data into the CUDA thread block. +//! @file block_load.cuh Operations for reading linear tiles of data into the CUDA thread block. #pragma once @@ -54,7 +54,6 @@ CUB_NAMESPACE_BEGIN //! 
Load a linear segment of items into a blocked arrangement across the thread block. //! //! @blocked -//! //! @endrst //! //! @tparam T @@ -63,27 +62,27 @@ CUB_NAMESPACE_BEGIN //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr +//! @param[in] block_src_it //! The thread block's base input iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { // Load directly in thread-blocked order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; + dst_items[i] = block_src_it[linear_tid * ITEMS_PER_THREAD + i]; } } @@ -100,31 +99,32 @@ LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items -//! Number of valid items to load -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +//! @param[in] block_items_end +//! First out-of-bounds index when loading from block_src_it +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) + const auto src_pos = linear_tid * ITEMS_PER_THREAD + i; + if (src_pos < block_items_end) { - items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; + dst_items[i] = block_src_it[src_pos]; } } } @@ -143,35 +143,39 @@ LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! 
@tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr +//! @param[in] block_src_it //! The thread block's base input iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items -//! Number of valid items to load +//! @param[in] block_items_end +//! First out-of-bounds index when loading from block_src_it //! //! @param[in] oob_default //! Default value to assign out-of-bound items -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document @@ -179,58 +183,44 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( //! @brief Internal implementation for load vectorization //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_ptr +//! @param[in] block_src_ptr //! Input pointer for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into template _CCCL_DEVICE _CCCL_FORCEINLINE void -InternalLoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD]) +InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - // Biggest memory access word that T is a whole multiple of - using DeviceWord = typename UnitWord::DeviceWord; - + // Find biggest memory access word that T is a whole multiple of + using device_word_t = typename UnitWord::DeviceWord; _CCCL_DIAG_PUSH # if defined(CUB_CLANG_VERSION) && CUB_CLANG_VERSION >= 100000 _CCCL_DIAG_SUPPRESS_CLANG("-Wsizeof-array-div") # endif // defined(CUB_CLANG_VERSION) && CUB_CLANG_VERSION >= 100000 - enum - { - TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), - - VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? 4 - : (TOTAL_WORDS % 2 == 0) ? 2 - : 1, - - VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, - }; + constexpr int total_words = static_cast(sizeof(dst_items) / sizeof(device_word_t)); _CCCL_DIAG_POP + constexpr int vector_size = (total_words % 4 == 0) ? 4 : (total_words % 2 == 0) ? 
2 : 1; + constexpr int vectors_per_thread = total_words / vector_size; + using vector_t = typename CubVector::Type; - // Vector type - using Vector = typename CubVector::Type; - - // Vector items - Vector vec_items[VECTORS_PER_THREAD]; - - // Aliased input ptr - Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); - -// Load directly in thread-blocked order + // Load into an array of vectors in thread-blocked order + vector_t vec_items[vectors_per_thread]; + const vector_t* vec_ptr = reinterpret_cast(block_src_ptr) + linear_tid * vectors_per_thread; # pragma unroll - for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) + for (int i = 0; i < vectors_per_thread; i++) { - vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); + vec_items[i] = ThreadLoad(vec_ptr + i); } -// Copy +// Copy to destination # pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); + dst_items[i] = *(reinterpret_cast(vec_items) + i); } } @@ -258,19 +248,19 @@ InternalLoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITE //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_ptr -//! Input pointer for loading from +//! @param[in] block_src_ptr +//! The thread block's base pointer for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! destination to load data into template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD]) +LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_src_ptr, dst_items); } //! @} end member group @@ -293,43 +283,41 @@ LoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_T //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D +//! thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! 
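The constants above fully determine the generated load width. Working the arithmetic for `T = int` and `ITEMS_PER_THREAD = 4` (illustrative values, matching the `ld.global.v4.s32` example later in this file):

```cpp
// Sketch: the vector-width arithmetic for int with 4 items per thread.
constexpr int items_per_thread = 4;
constexpr int bytes_per_word   = 4; // sizeof(DeviceWord) for int
constexpr int total_words      = static_cast<int>(items_per_thread * sizeof(int)) / bytes_per_word; // 4
constexpr int vector_size      = (total_words % 4 == 0) ? 4 : (total_words % 2 == 0) ? 2 : 1;       // 4
constexpr int vectors_per_thread = total_words / vector_size;                                       // 1
static_assert(vector_size == 4 && vectors_per_thread == 1, "one 16-byte vector load per thread");
```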
Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; + dst_items[i] = block_src_it[linear_tid + i * BLOCK_THREADS]; } } namespace detail { - -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], TransformOpT transform_op) + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], TransformOpT transform_op) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = transform_op(block_itr[linear_tid + ITEM * BLOCK_THREADS]); + dst_items[i] = transform_op(block_src_it[linear_tid + i * BLOCK_THREADS]); } } - } // namespace detail //! @rst @@ -348,32 +336,32 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped( //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT -//! **inferred** The random-access iterator type for input \iterator. +//! @tparam RandomAccessIterator +//! **inferred** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + +//! linear_tid for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load -//! -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) + const auto src_pos = linear_tid + i * BLOCK_THREADS; + if (src_pos < block_items_end) { - items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; + dst_items[i] = block_src_it[src_pos]; } } } @@ -395,35 +383,39 @@ LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEM //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! 
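Where blocked loads give each thread a contiguous run, `LoadDirectStriped` strides by `BLOCK_THREADS`, so adjacent threads touch adjacent addresses on every unrolled iteration and each step is a fully coalesced access. A minimal sketch (block size chosen for illustration):

```cpp
// Sketch: striped load, lane t reads d_in[t], d_in[t + 128], d_in[t + 256], ...
#include <cub/block/block_load.cuh>

__global__ void StripedLoadExample(const int* d_in)
{
  constexpr int BLOCK_THREADS    = 128;
  constexpr int ITEMS_PER_THREAD = 4;
  int thread_items[ITEMS_PER_THREAD];

  cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, thread_items);
}
```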
A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! Default value to assign out-of-bound items -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectStriped(linear_tid, block_itr, items, valid_items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); } //! @} end member group @@ -448,31 +440,31 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load -template +//! @param[out] dst_items +//! Destination to load data into +template _CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) +LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + const int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + const int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + const int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); + new (&dst_items[i]) T(block_src_it[warp_offset + tid + (i * CUB_PTX_WARP_THREADS)]); } } @@ -494,36 +486,37 @@ LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! 
A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load -template -_CCCL_DEVICE _CCCL_FORCEINLINE void -LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) +template +_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped( + int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + const int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + const int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + const int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + const auto src_pos = warp_offset + tid + (i * CUB_PTX_WARP_THREADS); + if (src_pos < block_items_end) { - new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); + new (&dst_items[i]) T(block_src_it[src_pos]); } } } @@ -547,42 +540,46 @@ LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! -//! @tparam InputIteratorT +//! @tparam RandomAccessIterator //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid -//! A suitable 1D thread-identifier for the calling thread -//! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) +//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + +//! linear_tid` for 2D thread blocks) //! -//! @param[in] block_itr -//! The thread block's base input iterator for loading from +//! @param[in] block_src_it +//! The thread block's base iterator for loading from //! -//! @param[out] items -//! Data to load +//! @param[out] dst_items +//! Destination to load data into //! -//! @param[in] valid_items +//! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! 
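The `tid`/`wid`/`warp_offset` decomposition above assigns each warp a contiguous span of `CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD` items, which the warp then reads at a warp-wide stride. Tracing the arithmetic for one hypothetical thread:

```cpp
// Sketch: the warp-striped index decomposition, assuming 32-thread warps
// and 4 items per thread (illustrative values).
constexpr int warp_threads     = 32;
constexpr int items_per_thread = 4;

// linear_tid 70 is lane 6 of warp 2; its warp owns items [256, 384).
constexpr int linear_tid  = 70;
constexpr int lane        = linear_tid & (warp_threads - 1);        // 6
constexpr int warp        = linear_tid / warp_threads;              // 2
constexpr int warp_offset = warp * warp_threads * items_per_thread; // 256

static_assert(warp_offset + lane + 0 * warp_threads == 262, "item 0 comes from index 262");
static_assert(warp_offset + lane + 3 * warp_threads == 358, "item 3 comes from index 358");
```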
Default value to assign out-of-bound items -template <typename InputT, int ITEMS_PER_THREAD, typename InputIteratorT, typename DefaultT> +template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator, typename DefaultT> _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped( - int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + int linear_tid, + RandomAccessIterator block_src_it, + T (&dst_items)[ITEMS_PER_THREAD], + int block_items_end, + DefaultT oob_default) { // Load directly in warp-striped order #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + for (int i = 0; i < ITEMS_PER_THREAD; i++) { - items[ITEM] = oob_default; + dst_items[i] = oob_default; } - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); } //! @} end member group -//! @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a -//! linear segment of data from memory into a blocked arrangement across a CUDA thread block. +//! @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data +//! from memory into a blocked arrangement across a CUDA thread block. enum BlockLoadAlgorithm { //! @rst @@ -594,8 +591,8 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! The utilization of memory transactions (coalescing) decreases as the - //! access stride between threads increases (i.e., the number items per thread). + //! The utilization of memory transactions (coalescing) decreases as the access stride between threads increases + //! (i.e., the number of items per thread). //! @endrst BLOCK_LOAD_DIRECT, @@ -608,8 +605,7 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! The utilization of memory transactions (coalescing) doesn't depend on - //! the number of items per thread. + //! The utilization of memory transactions (coalescing) doesn't depend on the number of items per thread. //! //! @endrst BLOCK_LOAD_STRIPED, @@ -618,22 +614,20 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! A :ref:`blocked arrangement ` of data is read - //! from memory using CUDA's built-in vectorized loads as a coalescing optimization. - //! For example, ``ld.global.v4.s32`` instructions will be generated + //! A :ref:`blocked arrangement ` of data is read from memory using CUDA's built-in + //! vectorized loads as a coalescing optimization. For example, ``ld.global.v4.s32`` instructions will be generated //! when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high until the the - //! access stride between threads (i.e., the number items per thread) exceeds the - //! maximum vector load width (typically 4 items or 64B, whichever is lower). - //! - The following conditions will prevent vectorization and loading will fall - //! back to cub::BLOCK_LOAD_DIRECT: + //! - The utilization of memory transactions (coalescing) remains high until the access stride between threads + //! (i.e., the number of items per thread) exceeds the maximum vector load width (typically 4 items or 64B, whichever + //! is lower). + //! - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: //! //! - ``ITEMS_PER_THREAD`` is odd - //! - The ``InputIteratorT`` is not a simple pointer type + //! - The ``RandomAccessIterator`` is not a simple pointer type //!
- The block input offset is not quadword-aligned //! - The data type ``T`` is not a built-in primitive or CUDA vector type //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.) @@ -645,16 +639,15 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! A :ref:`striped arrangement ` of data is read efficiently from memory and then - //! locally transposed into a :ref:`blocked arrangement `. + //! A :ref:`striped arrangement ` of data is read efficiently from memory and then locally + //! transposed into a :ref:`blocked arrangement `. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high regardless - //! of items loaded per thread. - //! - The local reordering incurs slightly longer latencies and throughput than the - //! direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. + //! - The local reordering incurs slightly longer latencies and lower throughput than the direct cub::BLOCK_LOAD_DIRECT + //! and cub::BLOCK_LOAD_VECTORIZE alternatives. //! //! @endrst BLOCK_LOAD_TRANSPOSE, @@ -675,8 +668,8 @@ enum BlockLoadAlgorithm //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. - //! - The local reordering incurs slightly larger latencies than the - //! direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + //! - The local reordering incurs slightly larger latencies than the direct cub::BLOCK_LOAD_DIRECT and + //! cub::BLOCK_LOAD_VECTORIZE alternatives. //! - Provisions more shared storage, but incurs smaller latencies than the //! BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. //! @@ -687,10 +680,10 @@ enum BlockLoadAlgorithm //! Overview //! ++++++++++++++++++++++++++ //! - //! Like ``BLOCK_LOAD_WARP_TRANSPOSE``, a :ref:`warp-striped arrangement ` - //! of data is read directly from memory and then is locally transposed into a - //! :ref:`blocked arrangement `. To reduce the shared memory requirement, only one - //! warp's worth of shared memory is provisioned and is subsequently time-sliced among warps. + //! Like ``BLOCK_LOAD_WARP_TRANSPOSE``, a :ref:`warp-striped arrangement ` of data is read + //! directly from memory and then is locally transposed into a :ref:`blocked arrangement `. + //! To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently + //! time-sliced among warps. //! //! Usage Considerations //! ++++++++++++++++++++++++++ @@ -700,10 +693,9 @@ enum BlockLoadAlgorithm //! Performance Considerations //! ++++++++++++++++++++++++++ //! - //! - The utilization of memory transactions (coalescing) remains high regardless - //! of items loaded per thread. - //! - Provisions less shared memory temporary storage, but incurs larger - //! latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. + //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. + //! - Provisions less shared memory temporary storage, but incurs larger latencies than the BLOCK_LOAD_WARP_TRANSPOSE + //! alternative. //! //! @endrst BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, @@ -711,15 +703,15 @@ enum BlockLoadAlgorithm //! @rst //! The BlockLoad class provides :ref:`collective ` data movement methods for loading a linear -//!
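``BLOCK_LOAD_TRANSPOSE`` is the composition of two primitives documented earlier in this file: a coalesced striped read followed by an in-shared-memory transpose. A sketch of that two-step pattern written out by hand (sizes are illustrative):

```cpp
// Sketch: coalesced striped read, then local reorder to a blocked arrangement.
#include <cub/block/block_exchange.cuh>
#include <cub/block/block_load.cuh>

__global__ void TransposeLoadExample(const int* d_in)
{
  constexpr int BLOCK_THREADS    = 128;
  constexpr int ITEMS_PER_THREAD = 4;
  using BlockExchange = cub::BlockExchange<int, BLOCK_THREADS, ITEMS_PER_THREAD>;

  __shared__ typename BlockExchange::TempStorage temp_storage;

  int items[ITEMS_PER_THREAD];
  cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, items); // coalesced read
  BlockExchange(temp_storage).StripedToBlocked(items, items);      // local transpose
}
```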
segment of items from memory into a :ref:`blocked arrangement ` across a -//! CUDA thread block. +//! segment of items from memory into a :ref:`blocked arrangement ` across a CUDA thread +//! block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! - The BlockLoad class provides a single data movement abstraction that can be specialized -//! to implement different cub::BlockLoadAlgorithm strategies. This facilitates different -//! performance policies for different architectures, data types, granularity sizes, etc. +//! - The BlockLoad class provides a single data movement abstraction that can be specialized to implement different +//! cub::BlockLoadAlgorithm strategies. This facilitates different performance policies for different architectures, +//! data types, granularity sizes, etc. //! - BlockLoad can be optionally specialized by different data movement strategies: //! //! #. :cpp:enumerator:`cub::BLOCK_LOAD_DIRECT`: @@ -746,10 +738,9 @@ enum BlockLoadAlgorithm //! //! @blockcollective{BlockLoad} //! -//! The code snippet below illustrates the loading of a linear -//! segment of 512 integers into a "blocked" arrangement across 128 threads where each -//! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, -//! meaning memory references are efficiently coalesced using a warp-striped access +//! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement +//! across 128 threads where each thread owns 4 consecutive items. The load is specialized for +//! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ @@ -768,21 +759,20 @@ enum BlockLoadAlgorithm //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data); //! -//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. -//! The set of ``thread_data`` across the block of threads in those threads will be -//! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. +//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads +//! will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! Re-using dynamically allocated shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! -//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of -//! dynamically shared memory with BlockReduce and how to re-purpose the same memory region. -//! This example can be easily adapted to the storage required by BlockLoad. +//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically allocated shared memory +//! with BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage +//! required by BlockLoad. //! //! @endrst //! -//! @tparam InputT -//! The data type to read into (which must be convertible from the input iterator's value type). +//! @tparam T +//! The data type to read into (which must be convertible from the input iterator's value type). //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension @@ -793,20 +783,15 @@ enum BlockLoadAlgorithm //! @tparam ALGORITHM //! **[optional]** cub::BlockLoadAlgorithm tuning policy. default: ``cub::BLOCK_LOAD_DIRECT``. //! -//! @tparam WARP_TIME_SLICING -//!
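Adapting that example to BlockLoad amounts to pointing its `TempStorage` at the dynamic allocation. A sketch of the idea (hypothetical kernel; assumes the launch provides enough dynamic shared memory and that its alignment suffices for `TempStorage`):

```cpp
// Sketch: backing BlockLoad's TempStorage with dynamically allocated shared memory.
#include <cub/block/block_load.cuh>

__global__ void DynSmemLoadExample(const int* d_in)
{
  using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;

  // Launch as: DynSmemLoadExample<<<grid, 128, sizeof(typename BlockLoad::TempStorage)>>>(d_in);
  extern __shared__ char smem[];
  auto& temp_storage = reinterpret_cast<typename BlockLoad::TempStorage&>(smem[0]);

  int thread_data[4];
  BlockLoad(temp_storage).Load(d_in, thread_data);
}
```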
**[optional]** Whether or not only one warp's worth of shared memory should be -//! allocated and time-sliced among block-warps during any load-related data transpositions -//! (versus each warp having its own storage). (default: false) -//! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z -//! **[optional]** The thread block length in threads along the Z dimension (default: 1) +//! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH -//! **[optional]** Unused. -template class BlockLoad { -private: - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; + static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; // total threads in the block - /// Load helper template - struct LoadInternal; + struct LoadInternal; // helper to dispatch the load algorithm - /** - * BLOCK_LOAD_DIRECT specialization of load helper - */ template struct LoadInternal { - /// Shared memory storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectBlocked(linear_tid, block_itr, items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_STRIPED specialization of load helper - */ template struct LoadInternal { - /// Shared memory 
storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectStriped(linear_tid, block_itr, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_VECTORIZE specialization of load helper - */ template struct LoadInternal { - /// Shared memory storage layout type using TempStorage = NullType; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_ptr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) + // attempts vectorization (pointer) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const T* block_ptr, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_ptr - * The thread block's base input iterator for loading from - * - * 
@param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) + // any other iterator, no vectorization + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for native pointer types - * (attempts vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ + // attempts vectorization (cache modified iterator) template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(CacheModifiedInputIterator block_itr, InputT (&items)[ITEMS_PER_THREAD]) + Load(CacheModifiedInputIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); + InternalLoadDirectBlockedVectorized(linear_tid, block_src_it.ptr, dst_items); } - /** - * @brief Load a linear segment of items from memory, specialized for opaque input iterators - * (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(_InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) - { - LoadDirectBlocked(linear_tid, block_itr, items); - } - - /** - * @brief Load a linear segment of items from memory, guarded by range (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + // skips vectorization + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements (skips vectorization) - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + // skips vectorization + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default); } }; - /** - * BLOCK_LOAD_TRANSPOSE specialization of load helper - */ template struct LoadInternal { - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; + using BlockExchange = BlockExchange; + using _TempStorage = typename 
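The overload set above is what lets ``BLOCK_LOAD_VECTORIZE`` degrade gracefully: only arguments that expose a raw pointer reach `InternalLoadDirectBlockedVectorized`, while everything else falls through to the scalar `LoadDirectBlocked`. A dispatch sketch (iterator choices are illustrative):

```cpp
// Sketch: which Load overload each argument type selects under BLOCK_LOAD_VECTORIZE.
#include <cub/block/block_load.cuh>
#include <thrust/iterator/counting_iterator.h>

__global__ void VectorizeDispatchExample(const int* d_in)
{
  using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_VECTORIZE>;
  __shared__ typename BlockLoad::TempStorage temp_storage;
  int items[4];

  // const T* argument: the pointer overload, which attempts vectorization.
  BlockLoad(temp_storage).Load(d_in, items);

  // Fancy iterator: the generic overload, i.e. the scalar LoadDirectBlocked path.
  thrust::counting_iterator<int> first(0);
  BlockLoad(temp_storage).Load(first, items);
}
```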
BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).StripedToBlocked(items, items); + LoadDirectStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items); } }; - /** - * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper - */ template struct LoadInternal { - enum - { - WARP_THREADS = CUB_WARP_THREADS(0) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - static_assert(int(BLOCK_THREADS) % int(WARP_THREADS) == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; + using BlockExchange = 
BlockExchange; + using _TempStorage = typename BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } }; - /** - * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper - */ template struct LoadInternal { - enum - { - WARP_THREADS = CUB_WARP_THREADS(0) - }; + static constexpr int WARP_THREADS = CUB_WARP_THREADS(0); + static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - 
static_assert(int(BLOCK_THREADS) % int(WARP_THREADS) == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS"); + using BlockExchange = BlockExchange; + using _TempStorage = typename BlockExchange::TempStorage; + using TempStorage = Uninitialized<_TempStorage>; - // BlockExchange utility type for keys - using BlockExchange = BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> - {}; - - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; - /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} - /** - * @brief Load a linear segment of items from memory - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - */ - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } - /** - * @brief Load a linear segment of items from memory, guarded by range, with a fall-back - * assignment of out-of-bound elements - * - * @param[in] block_itr - * The thread block's base input iterator for loading from - * - * @param[out] items - * Data to load - * - * @param[in] valid_items - * Number of valid items to load - * - * @param[in] oob_default - * Default value to assign out-of-bound items - */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items); } }; - /// Internal load implementation to use - using InternalLoad = LoadInternal; - - /// Shared memory storage layout type + using InternalLoad = LoadInternal; // load implementation to use using 
_TempStorage = typename InternalLoad::TempStorage; - /// Internal storage allocator + // Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } - /// Thread reference to shared storage _TempStorage& temp_storage; - - /// Linear thread-id int linear_tid; public: /// @smemstorage{BlockLoad} - struct TempStorage : Uninitialized<_TempStorage> - {}; + using TempStorage = Uninitialized<_TempStorage>; //! @name Collective constructors //! @{ - /** - * @brief Collective constructor using a private static allocation of shared memory as temporary - * storage. - */ + /// @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * @brief Collective constructor using the specified memory allocation as temporary storage. - * - * @param[in] temp_storage - * Reference to memory allocation having layout type TempStorage - */ + /// @brief Collective constructor using the specified memory allocation as temporary storage. + /// @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) @@ -1448,10 +1085,9 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement + //! across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ @@ -1470,21 +1106,20 @@ public: //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads + //! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) + //! @param[out] dst_items + //! Destination to load data into + template + _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD]) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items); } //! 
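Because `TempStorage` is a trivially constructible shared-memory layout (which is what the `Uninitialized` wrapper guarantees), one allocation can be unioned across collectives that run in distinct phases, the same trick the BlockReduce example alludes to. A sketch, assuming a load phase followed by a store phase:

```cpp
// Sketch: one shared-memory allocation re-used by two collectives.
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>

__global__ void LoadThenStoreExample(const int* d_in, int* d_out)
{
  using BlockLoad  = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
  using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE>;

  __shared__ union
  {
    typename BlockLoad::TempStorage load;
    typename BlockStore::TempStorage store;
  } temp_storage;

  int items[4];
  BlockLoad(temp_storage.load).Load(d_in, items);
  __syncthreads(); // the load phase must finish before its storage is re-purposed
  BlockStore(temp_storage.store).Store(d_out, items);
}
```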
@rst @@ -1497,17 +1132,16 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the guarded loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" + //! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! - //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) + //! __global__ void ExampleKernel(int *d_data, int block_items_end, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! using BlockLoad = cub::BlockLoad; @@ -1517,32 +1151,32 @@ public: //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; - //! BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); + //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``valid_items`` is ``5``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``, with only the first two threads - //! being unmasked to load portions of valid data (and other items remaining unassigned). + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of + //! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``, + //! with only the first two threads being unmasked to load portions of valid data (and other items remaining + //! unassigned). //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load + //! @param[out] dst_items + //! Destination to load data into //! - //! @param[in] valid_items + //! @param[in] block_items_end //! Number of valid items to load - template - _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) + template + _CCCL_DEVICE _CCCL_FORCEINLINE void + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end); } //! @rst - //! Load a linear segment of items from memory, guarded by range, with a fall-back - //! assignment of out-of-bound elements + //! Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements //! //! - @blocked //! - @smemreuse @@ -1550,17 +1184,16 @@ public: //! Snippet //! +++++++ //! - //! The code snippet below illustrates the guarded loading of a linear - //! segment of 512 integers into a "blocked" arrangement across 128 threads where each - //! 
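In grid-level code, `block_items_end` is usually derived from the global item count rather than passed in directly: each block subtracts its tile offset, and only the last block sees a value smaller than the tile size. A sketch of that pattern with the out-of-bounds default (names and sizes illustrative):

```cpp
// Sketch: computing block_items_end for a possibly partial last tile.
#include <cub/block/block_load.cuh>

__global__ void TiledLoadExample(const int* d_in, int num_items)
{
  constexpr int TILE_ITEMS = 128 * 4;
  using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
  __shared__ typename BlockLoad::TempStorage temp_storage;

  const int tile_offset     = blockIdx.x * TILE_ITEMS;
  const int block_items_end = num_items - tile_offset; // may exceed TILE_ITEMS; the guard tolerates that

  int items[4];
  BlockLoad(temp_storage).Load(d_in + tile_offset, items, block_items_end, -1);
}
```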
thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, - //! meaning memory references are efficiently coalesced using a warp-striped access + //! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" + //! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for + //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! - //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) + //! __global__ void ExampleKernel(int *d_data, int block_items_end, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! using BlockLoad = cub::BlockLoad; @@ -1570,35 +1203,34 @@ public: //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; - //! BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); + //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1); //! - //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` - //! ``valid_items`` is ``5``, and the out-of-bounds default is ``-1``. - //! The set of ``thread_data`` across the block of threads in those threads will be - //! ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }``, with only the first two threads - //! being unmasked to load portions of valid data (and other items are assigned ``-1``) + //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds + //! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be + //! ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }``, with only the first two threads being unmasked to load + //! portions of valid data (and other items are assigned ``-1``) //! //! @endrst //! - //! @param[in] block_itr - //! The thread block's base input iterator for loading from + //! @param[in] block_src_it + //! The thread block's base iterator for loading from //! - //! @param[out] items - //! Data to load + //! @param[out] dst_items + //! Destination to load data into //! - //! @param[in] valid_items + //! @param[in] block_items_end //! Number of valid items to load //! //! @param[in] oob_default //! Default value to assign out-of-bound items - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void - Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) + Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default) { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); + InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end, oob_default); } - //@} end member group + //! @} end member group }; template > diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index 86a81652461..29510db5e97 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -47,26 +47,26 @@ CUB_NAMESPACE_BEGIN -// Additional details of the Merge-Path Algorithm can be found in: -// S. Odeh, O. Green, Z. Mwassi, O. Shmueli, Y. 
Birk, " Merge Path - Parallel -// Merging Made Simple", Multithreaded Architectures and Applications (MTAAP) -// Workshop, IEEE 26th International Parallel & Distributed Processing -// Symposium (IPDPS), 2012 -template -_CCCL_DEVICE _CCCL_FORCEINLINE OffsetT MergePath( - KeyIteratorT keys1, KeyIteratorT keys2, OffsetT keys1_count, OffsetT keys2_count, OffsetT diag, BinaryPred binary_pred) +// This implements the DiagonalIntersection algorithm from Merge-Path. Additional details can be found in: +// * S. Odeh, O. Green, Z. Mwassi, O. Shmueli, Y. Birk, "Merge Path - Parallel Merging Made Simple", Multithreaded +// Architectures and Applications (MTAAP) Workshop, IEEE 26th International Parallel & Distributed Processing +// Symposium (IPDPS), 2012 +// * S. Odeh, O. Green, Y. Birk, "Merge Path - A Visually Intuitive Approach to Parallel Merging", 2014, URL: +// https://arxiv.org/abs/1406.2628 +template +_CCCL_DEVICE _CCCL_FORCEINLINE OffsetT +MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count, OffsetT diag, BinaryPred binary_pred) { OffsetT keys1_begin = diag < keys2_count ? 0 : diag - keys2_count; OffsetT keys1_end = (cub::min)(diag, keys1_count); while (keys1_begin < keys1_end) { - OffsetT mid = cub::MidPoint(keys1_begin, keys1_end); - KeyT key1 = keys1[mid]; - KeyT key2 = keys2[diag - 1 - mid]; - bool pred = binary_pred(key2, key1); - - if (pred) + const OffsetT mid = cub::MidPoint(keys1_begin, keys1_end); + // pull copies of the keys before calling binary_pred so proxy references are unwrapped + const detail::value_t key1 = keys1[mid]; + const detail::value_t key2 = keys2[diag - 1 - mid]; + if (binary_pred(key2, key1)) { keys1_end = mid; } @@ -78,9 +78,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT MergePath( return keys1_begin; } -template +template _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( - KeyT* keys_shared, + KeyIt keys_shared, int keys1_beg, int keys2_beg, int keys1_count, @@ -89,8 +89,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( int (&indices)[ITEMS_PER_THREAD], CompareOp compare_op) { - int keys1_end = keys1_beg + keys1_count; - int keys2_end = keys2_beg + keys2_count; + const int keys1_end = keys1_beg + keys1_count; + const int keys2_end = keys2_beg + keys2_count; KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; @@ -98,11 +98,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); - + const bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); output[item] = p ? key2 : key1; indices[item] = p ? keys2_beg++ : keys1_beg++; - if (p) { key2 = keys_shared[keys2_beg]; @@ -437,7 +435,7 @@ public: int keys1_count = keys1_end - keys1_beg; int keys2_count = keys2_end - keys2_beg; - int partition_diag = MergePath( + int partition_diag = MergePath( &temp_storage.keys_shared[keys1_beg], &temp_storage.keys_shared[keys2_beg], keys1_count, @@ -723,10 +721,9 @@ private: * `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`. * * @par Re-using dynamically allocating shared memory - * The following example under the examples/block folder illustrates usage of + * The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose - * the same memory region: - * example_block_reduce_dyn_smem.cu + * the same memory region. 
* * This example can be easily adapted to the storage required by BlockMergeSort. */ diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 964f4fbe0e7..48650992918 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -28,7 +28,7 @@ /** * @file - * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix + * The cub::BlockRadixSort class provides [collective](../index.html#sec0) methods for radix * sorting of items partitioned across a CUDA thread block. */ @@ -142,7 +142,7 @@ CUB_NAMESPACE_BEGIN //! @blockcollective{BlockRadixSort} //! //! The code snippet below illustrates a sort of 512 integer keys that -//! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads +//! are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! //! .. tab-set-code:: @@ -199,10 +199,8 @@ CUB_NAMESPACE_BEGIN //! Re-using dynamically allocating shared memory //! -------------------------------------------------- //! -//! The following example under the examples/block folder illustrates usage of -//! dynamically shared memory with BlockReduce and how to re-purpose -//! the same memory region: -//! example_block_reduce_dyn_smem.cu +//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with +//! BlockReduce and how to re-purpose the same memory region. //! //! This example can be easily adapted to the storage required by BlockRadixSort. //! @endrst @@ -986,7 +984,7 @@ public: //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys that - //! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + //! are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive keys. //! //! .. code-block:: c++ @@ -1590,7 +1588,7 @@ public: //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys and values that - //! are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 + //! are initially partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped. //! //! .. code-block:: c++ diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index a06b7c185fb..df7ab6e8143 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -1011,7 +1011,7 @@ public: //! +++++++ //! //! The code snippet below illustrates an exclusive prefix max scan of 512 integer - //! items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + //! items that are partitioned in a [blocked arrangement](../index.html#sec5sec3) //! across 128 threads where each thread owns 4 consecutive items. //! //! .. code-block:: c++ @@ -2180,7 +2180,7 @@ public: //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - //! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + //! are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ @@ -2314,7 +2314,7 @@ public: //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - //! 
are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + //! are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh index ea199e76850..f02ae6c0024 100644 --- a/cub/cub/cub.cuh +++ b/cub/cub/cub.cuh @@ -64,6 +64,7 @@ #include #include #include +#include #include #include #include diff --git a/cub/cub/detail/detect_cuda_runtime.cuh b/cub/cub/detail/detect_cuda_runtime.cuh index 44ee811192d..211e31345da 100644 --- a/cub/cub/detail/detect_cuda_runtime.cuh +++ b/cub/cub/detail/detect_cuda_runtime.cuh @@ -44,7 +44,10 @@ # pragma system_header #endif // no system header -#include +// CUDA headers might not be present when using NVRTC, see NVIDIA/cccl#2095 for details +#if !defined(_CCCL_COMPILER_NVRTC) +# include +#endif // !_CCCL_COMPILER_NVRTC #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: diff --git a/cub/cub/detail/uninitialized_copy.cuh b/cub/cub/detail/uninitialized_copy.cuh index 9a3f01e2c0a..326826c0d1a 100644 --- a/cub/cub/detail/uninitialized_copy.cuh +++ b/cub/cub/detail/uninitialized_copy.cuh @@ -58,6 +58,7 @@ template ::value, int>::type = 0> _CCCL_HOST_DEVICE void uninitialized_copy_single(T* ptr, U&& val) { + // gevtushenko: placement new should work here as well, but the code generated for copy assignment is sometimes better *ptr = ::cuda::std::forward(val); } diff --git a/cub/cub/device/device_merge.cuh b/cub/cub/device/device_merge.cuh new file mode 100644 index 00000000000..bf110f2f40f --- /dev/null +++ b/cub/cub/device/device_merge.cuh @@ -0,0 +1,197 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +CUB_NAMESPACE_BEGIN + +//! DeviceMerge provides device-wide, parallel operations for merging two sorted sequences of values (called keys) or +//! key-value pairs in device-accessible memory. The sorting order is determined by a comparison functor (default: +//! less-than), which has to establish a [strict weak ordering]. +//! +//! [strict weak ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order +struct DeviceMerge +{ + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Merges two sorted sequences of values (called keys) into a sorted output sequence. Merging is unstable, + //! which means any two equivalent values (neither value is ordered before the other) may be written to the output + //! sequence in any order. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! The code snippet below illustrates the merging of two device vectors of `int` keys. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin merge-keys + //! :end-before: example-end merge-keys + //! + //! @endrst + //! + //! @tparam KeyIteratorIn1 **[deduced]** Random access iterator to the first sorted input sequence. Must have the same + //! value type as KeyIteratorIn2. + //! 
@tparam KeyIteratorIn2 **[deduced]** Random access iterator to the second sorted input sequence. Must have the + //! same value type as KeyIteratorIn1. + //! @tparam KeyIteratorOut **[deduced]** Random access iterator to the output sequence. + //! @tparam CompareOp **[deduced]** Binary predicate to compare the input iterator's value types. Must have a + //! signature equivalent to `bool operator()(Key lhs, Key rhs)` and establish a [strict weak ordering]. + //! + //! @param[in] d_temp_storage Device-accessible allocation of temporary storage. When `nullptr`, the required + //! allocation size is written to `temp_storage_bytes` and no work is done. + //! @param[in,out] temp_storage_bytes Reference to size in bytes of `d_temp_storage` allocation. + //! @param[in] keys_in1 Iterator to the beginning of the first sorted input sequence. + //! @param[in] num_keys1 Number of keys in the first input sequence. + //! @param[in] keys_in2 Iterator to the beginning of the second sorted input sequence. + //! @param[in] num_keys2 Number of keys in the second input sequence. + //! @param[out] keys_out Iterator to the beginning of the output sequence. + //! @param[in] compare_op Comparison function object, returning true if the first argument is ordered before the + //! second. Must establish a [strict weak ordering]. + //! @param[in] stream **[optional]** CUDA stream to launch kernels into. Default is stream 0. + //! + //! [strict weak ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order + template > + CUB_RUNTIME_FUNCTION static cudaError_t MergeKeys( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorIn1 keys_in1, + int num_keys1, + KeyIteratorIn2 keys_in2, + int num_keys2, + KeyIteratorOut keys_out, + CompareOp compare_op = {}, + cudaStream_t stream = nullptr) + { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys"); + return detail::merge:: + dispatch_t:: + dispatch( + d_temp_storage, + temp_storage_bytes, + keys_in1, + nullptr, + num_keys1, + keys_in2, + nullptr, + num_keys2, + keys_out, + nullptr, + compare_op, + stream); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Merges two sorted sequences of key-value pairs into a sorted output sequence. Merging is unstable, + //! which means any two equivalent values (neither value is ordered before the other) may be written to the output + //! sequence in any order. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! The code snippet below illustrates the merging of two device vectors of `int` keys and their associated values. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin merge-pairs + //! :end-before: example-end merge-pairs + //! + //! @endrst + //! + //! @tparam KeyIteratorIn1 **[deduced]** Random access iterator to the keys of the first sorted input sequence. Must + //! have the same value type as KeyIteratorIn2. + //! @tparam ValueIteratorIn1 **[deduced]** Random access iterator to the values of the first sorted input sequence. + //! Must have the same value type as ValueIteratorIn2. + //! @tparam KeyIteratorIn2 **[deduced]** Random access iterator to the second sorted input sequence. Must have the + //! same value type as KeyIteratorIn1. + //! @tparam ValueIteratorIn2 **[deduced]** Random access iterator to the values of the second sorted input sequence. + //! Must have the same value type as ValueIteratorIn1. + //! 
@tparam KeyIteratorOut **[deduced]** Random access iterator to the keys of the output sequence. + //! @tparam ValueIteratorOut **[deduced]** Random access iterator to the values of the output sequence. + //! @tparam CompareOp **[deduced]** Binary predicate to compare the key input iterator's value types. Must have a + //! signature equivalent to `bool operator()(Key lhs, Key rhs)` and establish a [strict weak ordering]. + //! + //! @param[in] d_temp_storage Device-accessible allocation of temporary storage. When `nullptr`, the required + //! allocation size is written to `temp_storage_bytes` and no work is done. + //! @param[in,out] temp_storage_bytes Reference to size in bytes of `d_temp_storage` allocation. + //! @param[in] keys_in1 Iterator to the beginning of the keys of the first sorted input sequence. + //! @param[in] values_in1 Iterator to the beginning of the values of the first sorted input sequence. + //! @param[in] num_pairs1 Number of key-value pairs in the first input sequence. + //! @param[in] keys_in2 Iterator to the beginning of the keys of the second sorted input sequence. + //! @param[in] values_in2 Iterator to the beginning of the values of the second sorted input sequence. + //! @param[in] num_pairs2 Number of key-value pairs in the second input sequence. + //! @param[out] keys_out Iterator to the beginning of the keys of the output sequence. + //! @param[out] values_out Iterator to the beginning of the values of the output sequence. + //! @param[in] compare_op Comparison function object, returning true if the first argument is ordered before the + //! second. Must establish a [strict weak ordering]. + //! @param[in] stream **[optional]** CUDA stream to launch kernels into. Default is stream 0. + //! + //! [strict weak ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order + template > + CUB_RUNTIME_FUNCTION static cudaError_t MergePairs( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorIn1 keys_in1, + ValueIteratorIn1 values_in1, + int num_pairs1, + KeyIteratorIn2 keys_in2, + ValueIteratorIn2 values_in2, + int num_pairs2, + KeyIteratorOut keys_out, + ValueIteratorOut values_out, + CompareOp compare_op = {}, + cudaStream_t stream = nullptr) + { + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs"); + return detail::merge::dispatch_t< + KeyIteratorIn1, + ValueIteratorIn1, + KeyIteratorIn2, + ValueIteratorIn2, + KeyIteratorOut, + ValueIteratorOut, + int, + CompareOp>::dispatch(d_temp_storage, + temp_storage_bytes, + keys_in1, + values_in1, + num_pairs1, + keys_in2, + values_in2, + num_pairs2, + keys_out, + values_out, + compare_op, + stream); + } +}; + +CUB_NAMESPACE_END diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index ec3a739abac..54062bd9ea3 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -1035,7 +1035,7 @@ struct DeviceReduce //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index d72dabaab2d..c8a36f0255e 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -326,7 +326,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! 
return (b < a) ? b : a; //! } @@ -471,7 +471,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -595,7 +595,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -751,7 +751,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -1078,7 +1078,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -1307,7 +1307,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -1584,7 +1584,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -1594,7 +1594,7 @@ struct DeviceScan //! struct CustomEqual //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return a == b; //! } @@ -1950,7 +1950,7 @@ struct DeviceScan //! struct CustomMin //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } @@ -1960,7 +1960,7 @@ struct DeviceScan //! struct CustomEqual //! { //! template - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return a == b; //! } diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 715cfbcea0c..3113d6ca828 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -380,10 +380,10 @@ struct DeviceSelect //! { //! int compare; //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! LessThan(int compare) : compare(compare) {} //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! bool operator()(const int &a) const { //! return (a < compare); //! } @@ -534,10 +534,10 @@ struct DeviceSelect //! { //! int compare; //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! LessThan(int compare) : compare(compare) {} //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! bool operator()(const int &a) const { //! return (a < compare); //! } diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh index 93e51f2293f..6d6d1264828 100644 --- a/cub/cub/device/device_spmv.cuh +++ b/cub/cub/device/device_spmv.cuh @@ -67,7 +67,8 @@ CUB_NAMESPACE_BEGIN //! //! - ``A`` is an ``m * n`` sparse matrix whose non-zero structure is specified in //! `compressed-storage-row (CSR) format -//! 
`_ (i.e., three arrays: +//! `_ (i.e., three +//! arrays: +//! ``values``, ``row_offsets``, and ``column_indices``) //! - ``x`` and ``y`` are dense vectors //! diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh new file mode 100644 index 00000000000..2c16d851448 --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -0,0 +1,355 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +CUB_NAMESPACE_BEGIN +namespace detail +{ +namespace merge +{ +_LIBCUDACXX_INLINE_VAR constexpr int fallback_BLOCK_THREADS = 64; +_LIBCUDACXX_INLINE_VAR constexpr int fallback_ITEMS_PER_THREAD = 1; + +template +class choose_merge_agent +{ + using default_agent_t = agent_t; + using fallback_agent_t = + agent_t, Args...>; + + // Use fallback if merge agent exceeds maximum shared memory, but the fallback agent still fits + static constexpr bool use_fallback = sizeof(typename default_agent_t::TempStorage) > max_smem_per_block + && sizeof(typename fallback_agent_t::TempStorage) <= max_smem_per_block; + +public: + using type = ::cuda::std::__conditional_t; +}; + +// Computes the merge path intersections at equally wide intervals. The approach is outlined in the paper: +// Odeh et al., "Merge Path - Parallel Merging Made Simple", doi: 10.1109/IPDPSW.2012.202 +// The algorithm is the same as AgentPartition for merge sort, but that agent handles a lot more. 
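Editorial aside, not part of the patch: the comment above summarizes the diagonal search of the Merge Path algorithm. The following is a hedged, sequential host-side sketch of the same split-point computation for `int` keys under `operator<`; `merge_path` and `partition` are illustrative names, not CUB API.

```cpp
#include <algorithm>
#include <vector>

// For a diagonal `diag` (0 <= diag <= |keys1| + |keys2|), binary-search how many
// elements of keys1 lie on the merge path before that diagonal; the remaining
// diag - result elements come from keys2. Mirrors the loop in cub::MergePath.
int merge_path(const std::vector<int>& keys1, const std::vector<int>& keys2, int diag)
{
  int lo = std::max(0, diag - static_cast<int>(keys2.size()));
  int hi = std::min(diag, static_cast<int>(keys1.size()));
  while (lo < hi)
  {
    const int mid = lo + (hi - lo) / 2;
    if (keys2[diag - 1 - mid] < keys1[mid]) // binary_pred(key2, key1)
    {
      hi = mid; // taking mid elements from keys1 is already too many
    }
    else
    {
      lo = mid + 1;
    }
  }
  return lo; // number of keys1 elements preceding the diagonal
}

// Split points at equally wide intervals, one per tile boundary, mirroring
// device_partition_merge_path_kernel below (which evaluates one diagonal per thread).
std::vector<int> partition(const std::vector<int>& keys1, const std::vector<int>& keys2, int items_per_tile)
{
  const int total     = static_cast<int>(keys1.size() + keys2.size());
  const int num_tiles = (total + items_per_tile - 1) / items_per_tile;
  std::vector<int> merge_partitions(num_tiles + 1);
  for (int i = 0; i <= num_tiles; ++i)
  {
    merge_partitions[i] = merge_path(keys1, keys2, std::min(i * items_per_tile, total));
  }
  return merge_partitions;
}
```

Each consecutive pair of split points then bounds an independent merge tile of equal size, which is what lets the merge kernel below run without inter-block communication.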
+template +CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel( + KeyIt1 keys1, + Offset keys1_count, + KeyIt2 keys2, + Offset keys2_count, + Offset num_partitions, + Offset* merge_partitions, + CompareOp compare_op) +{ + // items_per_tile must be the same as in the merge kernel launched later, so we have to consider whether a fallback + // agent will be selected for the merge agent, which changes the tile size + constexpr int items_per_tile = + choose_merge_agent::type::policy::ITEMS_PER_TILE; + const Offset partition_idx = blockDim.x * blockIdx.x + threadIdx.x; + if (partition_idx < num_partitions) + { + const Offset partition_at = (cub::min)(partition_idx * items_per_tile, keys1_count + keys2_count); + merge_partitions[partition_idx] = cub::MergePath(keys1, keys2, keys1_count, keys2_count, partition_at, compare_op); + } +} + +template +__launch_bounds__( + choose_merge_agent::type::policy::BLOCK_THREADS) + CUB_DETAIL_KERNEL_ATTRIBUTES void device_merge_kernel( + KeyIt1 keys1, + ValueIt1 items1, + Offset num_keys1, + KeyIt2 keys2, + ValueIt2 items2, + Offset num_keys2, + KeyIt3 keys_result, + ValueIt3 items_result, + CompareOp compare_op, + Offset* merge_partitions, + vsmem_t global_temp_storage) +{ + // the merge agent loads keys into a local array of KeyIt1::value_type, on which the comparisons are performed + using key_t = value_t; + static_assert(::cuda::std::__invokable::value, + "Comparison operator cannot compare two keys"); + static_assert( + ::cuda::std::is_convertible::type, bool>::value, + "Comparison operator must be convertible to bool"); + + using MergeAgent = typename choose_merge_agent< + typename MaxPolicy::ActivePolicy::merge_policy, + KeyIt1, + ValueIt1, + KeyIt2, + ValueIt2, + KeyIt3, + ValueIt3, + Offset, + CompareOp>::type; + using MergePolicy = typename MergeAgent::policy; + + using THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator; + using vsmem_helper_t = vsmem_helper_impl; + __shared__ typename vsmem_helper_t::static_temp_storage_t shared_temp_storage; + auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage); + MergeAgent{ + temp_storage.Alias(), + make_load_iterator(MergePolicy{}, keys1), + make_load_iterator(MergePolicy{}, items1), + num_keys1, + make_load_iterator(MergePolicy{}, keys2), + make_load_iterator(MergePolicy{}, items2), + num_keys2, + keys_result, + items_result, + compare_op, + merge_partitions}(); + vsmem_helper_t::discard_temp_storage(temp_storage); +} + +template +struct device_merge_policy_hub +{ + static constexpr bool has_values = !::cuda::std::is_same::value; + + using tune_type = char[has_values ? 
sizeof(KeyT) + sizeof(ValueT) : sizeof(KeyT)]; + + struct policy300 : ChainedPolicy<300, policy300, policy300> + { + using merge_policy = + agent_policy_t<128, + Nominal4BItemsToItems(7), + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE>; + }; + + struct policy350 : ChainedPolicy<350, policy350, policy300> + { + using merge_policy = + agent_policy_t<256, + Nominal4BItemsToItems(11), + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE>; + }; + + struct policy520 : ChainedPolicy<520, policy520, policy350> + { + using merge_policy = + agent_policy_t<512, + Nominal4BItemsToItems(13), + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE>; + }; + + struct policy600 : ChainedPolicy<600, policy600, policy520> + { + using merge_policy = + agent_policy_t<512, + Nominal4BItemsToItems(15), + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE>; + }; + + using max_policy = policy600; +}; + +template , value_t>> +struct dispatch_t +{ + void* d_temp_storage; + std::size_t& temp_storage_bytes; + KeyIt1 d_keys1; + ValueIt1 d_values1; + Offset num_items1; + KeyIt2 d_keys2; + ValueIt2 d_values2; + Offset num_items2; + KeyIt3 d_keys_out; + ValueIt3 d_values_out; + CompareOp compare_op; + cudaStream_t stream; + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() + { + using max_policy_t = typename PolicyHub::max_policy; + using merge_policy_t = typename ActivePolicy::merge_policy; + using agent_t = + typename choose_merge_agent:: + type; + + const auto num_tiles = cub::DivideAndRoundUp(num_items1 + num_items2, agent_t::policy::ITEMS_PER_TILE); + void* allocations[2] = {nullptr, nullptr}; + { + const std::size_t merge_partitions_size = (1 + num_tiles) * sizeof(Offset); + const std::size_t virtual_shared_memory_size = num_tiles * vsmem_helper_impl::vsmem_per_block; + const std::size_t allocation_sizes[2] = {merge_partitions_size, virtual_shared_memory_size}; + const auto error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + return error; + } + } + + // Return if only temporary storage was requested or there is no work to be done + if (d_temp_storage == nullptr || num_tiles == 0) + { + return cudaSuccess; + } + + auto merge_partitions = static_cast(allocations[0]); + + // partition the merge path + { + const Offset num_partitions = num_tiles + 1; + constexpr int threads_per_partition_block = 256; // TODO(bgruber): no policy? 
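// (Editorial note, hedged): one thread of the partition kernel computes one merge-path split point.
// num_tiles + 1 points are computed, presumably so that tile i can read both its begin diagonal
// (merge_partitions[i]) and its end diagonal (merge_partitions[i + 1]); the grid computed below is
// therefore ceil((num_tiles + 1) / 256) blocks of 256 threads.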
+ const int partition_grid_size = + static_cast(cub::DivideAndRoundUp(num_partitions, threads_per_partition_block)); + + auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + partition_grid_size, threads_per_partition_block, 0, stream) + .doit(device_partition_merge_path_kernel< + max_policy_t, + KeyIt1, + ValueIt1, + KeyIt2, + ValueIt2, + KeyIt3, + ValueIt3, + Offset, + CompareOp>, + d_keys1, + num_items1, + d_keys2, + num_items2, + num_partitions, + merge_partitions, + compare_op)); + if (cudaSuccess != error) + { + return error; + } + error = CubDebug(DebugSyncStream(stream)); + if (cudaSuccess != error) + { + return error; + } + } + + // merge + if (num_tiles > 0) + { + auto vshmem_ptr = vsmem_t{allocations[1]}; + auto error = CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + static_cast(num_tiles), static_cast(agent_t::policy::BLOCK_THREADS), 0, stream) + .doit( + device_merge_kernel, + d_keys1, + d_values1, + num_items1, + d_keys2, + d_values2, + num_items2, + d_keys_out, + d_values_out, + compare_op, + merge_partitions, + vshmem_ptr)); + if (cudaSuccess != error) + { + return error; + } + error = CubDebug(DebugSyncStream(stream)); + if (cudaSuccess != error) + { + return error; + } + } + + return cudaSuccess; + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(Args&&... args) + { + int ptx_version = 0; + auto error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + return error; + } + dispatch_t dispatch{::cuda::std::forward(args)...}; + error = CubDebug(PolicyHub::max_policy::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) + { + return error; + } + + return cudaSuccess; + } +}; +} // namespace merge +} // namespace detail +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 11939b632c7..59deb2e529f 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -510,19 +510,17 @@ struct DispatchMergeSort : SelectedPolicy constexpr auto tile_size = merge_sort_helper_t::policy_t::ITEMS_PER_TILE; const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size); - const auto merge_partitions_size = static_cast(1 + num_tiles) * sizeof(OffsetT); - - const auto temporary_keys_storage_size = static_cast(num_items * sizeof(KeyT)); - + const auto merge_partitions_size = static_cast(1 + num_tiles) * sizeof(OffsetT); + const auto temporary_keys_storage_size = static_cast(num_items * sizeof(KeyT)); const auto temporary_values_storage_size = static_cast(num_items * sizeof(ValueT)) * !KEYS_ONLY; /** * Merge sort supports large types, which can lead to excessive shared memory size requirements. In these cases, * merge sort allocates virtual shared memory that resides in global memory. 
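 * (Editorial note, hedged: the lines below size this backing store as num_tiles * vsmem_per_block and
 * allocate only the maximum of the block-sort and merge requirements, since the two phases run one
 * after the other and can reuse the same global-memory region.)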
*/ - std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block; - std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block; - std::size_t virtual_shared_memory_size = (cub::max)(block_sort_smem_size, merge_smem_size); + const std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block; + const std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block; + const std::size_t virtual_shared_memory_size = (cub::max)(block_sort_smem_size, merge_smem_size); void* allocations[4] = {nullptr, nullptr, nullptr, nullptr}; std::size_t allocation_sizes[4] = { @@ -555,9 +553,9 @@ struct DispatchMergeSort : SelectedPolicy */ bool ping = num_passes % 2 == 0; - auto merge_partitions = reinterpret_cast(allocations[0]); - auto keys_buffer = reinterpret_cast(allocations[1]); - auto items_buffer = reinterpret_cast(allocations[2]); + auto merge_partitions = static_cast(allocations[0]); + auto keys_buffer = static_cast(allocations[1]); + auto items_buffer = static_cast(allocations[2]); // Invoke DeviceMergeSortBlockSortKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -617,7 +615,7 @@ struct DispatchMergeSort : SelectedPolicy for (int pass = 0; pass < num_passes; ++pass, ping = !ping) { - OffsetT target_merged_tiles_number = OffsetT(2) << pass; + const OffsetT target_merged_tiles_number = OffsetT(2) << pass; // Partition THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -706,9 +704,7 @@ struct DispatchMergeSort : SelectedPolicy { // Get PTX version int ptx_version = 0; - - error = CubDebug(PtxVersion(ptx_version)); - + error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index a572fa50561..3db8d73031b 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -102,11 +102,11 @@ enum CacheLoadModifier * @tparam MODIFIER * [inferred] CacheLoadModifier enumeration * - * @tparam InputIteratorT - * [inferred] Input iterator type \iterator + * @tparam RandomAccessIterator + * [inferred] The input's iterator type \iterator */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(InputIteratorT itr); +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(RandomAccessIterator itr); //@} end member group @@ -125,9 +125,9 @@ struct IterateThreadLoad IterateThreadLoad::template Load(ptr, vals); } - template + template CUB_DEPRECATED_BECAUSE("Use UnrolledCopy() instead") - static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(InputIteratorT itr, T* vals) + static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator itr, T* vals) { vals[COUNT] = itr[COUNT]; IterateThreadLoad::Dereference(itr, vals); @@ -142,8 +142,8 @@ struct IterateThreadLoad static _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T const* /*ptr*/, T* /*vals*/) {} - template - static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(InputIteratorT /*itr*/, T* /*vals*/) + template + static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator /*itr*/, T* /*vals*/) {} }; @@ -309,9 +309,9 @@ _CUB_LOAD_ALL(LOAD_LDG, global.nc) /** * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t -ThreadLoad(InputIteratorT itr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t +ThreadLoad(RandomAccessIterator itr, Int2Type 
/*modifier*/, Int2Type /*is_pointer*/) { return *itr; } @@ -320,7 +320,8 @@ ThreadLoad(InputIteratorT itr, Int2Type /*modifier*/, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T +ThreadLoad(const T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *ptr; } @@ -329,9 +330,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*mod * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types */ template -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type /*is_primitive*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type /*is_primitive*/) { - T retval = *reinterpret_cast(ptr); + T retval = *reinterpret_cast(ptr); return retval; } @@ -339,16 +340,15 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type /*is_primitive*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type /*is_primitive*/) { - // Word type for memcopying - using VolatileWord = typename UnitWord::VolatileWord; - + // Word type for memcpying + using VolatileWord = typename UnitWord::VolatileWord; constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); T retval; VolatileWord* words = reinterpret_cast(&retval); - UnrolledCopy(reinterpret_cast(ptr), words); + UnrolledCopy(reinterpret_cast(ptr), words); return retval; } @@ -356,9 +356,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) +_CCCL_DEVICE _CCCL_FORCEINLINE T +ThreadLoad(const T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { - // Apply tags for partial-specialization return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); } @@ -368,25 +368,18 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*mo template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T const* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { - using DeviceWord = typename UnitWord::DeviceWord; - + using DeviceWord = typename UnitWord::DeviceWord; constexpr int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); DeviceWord words[DEVICE_MULTIPLE]; - UnrolledThreadLoad( - reinterpret_cast(const_cast(ptr)), words); - + UnrolledThreadLoad(reinterpret_cast(ptr), words); return *reinterpret_cast(words); } -/** - * ThreadLoad definition for generic modifiers - */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(InputIteratorT itr) +template +_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(RandomAccessIterator itr) { - // Apply tags for partial-specialization - return ThreadLoad(itr, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); + return ThreadLoad(itr, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); } #endif // DOXYGEN_SHOULD_SKIP_THIS diff --git a/cub/cub/thread/thread_search.cuh b/cub/cub/thread/thread_search.cuh index d22ca7ff0a5..802d4ec96f8 100644 --- a/cub/cub/thread/thread_search.cuh +++ b/cub/cub/thread/thread_search.cuh @@ -97,6 +97,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void MergePathSearch( * @param[in] val * Search key */ +// TODO(bgruber): deprecate once ::cuda::std::lower_bound is made public template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT LowerBound(InputIteratorT input, OffsetT num_items, T val) { @@ -131,6 +132,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE 
OffsetT LowerBound(InputIteratorT input, OffsetT * @param[in] val * Search key */ +// TODO(bgruber): deprecate once ::cuda::std::upper_bound is made public template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT UpperBound(InputIteratorT input, OffsetT num_items, T val) { diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 2998608d567..e1cc4d53724 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -72,6 +72,7 @@ namespace detail * @brief Helper class template that allows overwriting the `BLOCK_THREAD` and `ITEMS_PER_THREAD` * configurations of a given policy. */ +// TODO(bgruber): this should be called something like "override_policy" template struct policy_wrapper_t : PolicyT { @@ -155,9 +156,10 @@ CUB_RUNTIME_FUNCTION inline int DeviceCountUncached() /** * \brief Cache for an arbitrary value produced by a nullary function. + * deprecated [Since 2.6.0] */ template -struct ValueCache +struct CUB_DEPRECATED ValueCache { T const value; @@ -170,13 +172,11 @@ struct ValueCache {} }; -// Host code, only safely usable in C++11 or newer, where thread-safe -// initialization of static locals is guaranteed. This is a separate function -// to avoid defining a local static in a host/device function. +// Host code. This is a separate function to avoid defining a local static in a host/device function. _CCCL_HOST inline int DeviceCountCachedValue() { - static ValueCache cache; - return cache.value; + static int count = DeviceCountUncached(); + return count; } /** @@ -211,7 +211,7 @@ struct PerDeviceAttributeCache // Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the // `DeviceEntryInitializing` state, and then proceeds to the // `DeviceEntryReady` state. These are the only state transitions allowed; - // e.g. a linear sequence of transitions. + // i.e. a linear sequence of transitions. enum DeviceEntryStatus { DeviceEntryEmpty = 0, @@ -372,7 +372,6 @@ _CCCL_HOST inline cudaError_t PtxVersionUncached(int& ptx_version, int device) template _CCCL_HOST inline PerDeviceAttributeCache& GetPerDeviceAttributeCache() { - // C++11 guarantees that initialization of static locals is thread safe. static PerDeviceAttributeCache cache; return cache; } @@ -383,17 +382,15 @@ struct SmVersionCacheTag {}; /** - * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10). + * \brief Retrieves the PTX virtual architecture that will be used on \p device (major * 100 + minor * 10). * * \note This function may cache the result internally. - * * \note This function is thread safe. */ _CCCL_HOST inline cudaError_t PtxVersion(int& ptx_version, int device) { auto const payload = GetPerDeviceAttributeCache()( - // If this call fails, then we get the error code back in the payload, - // which we check with `CubDebug` below. + // If this call fails, then we get the error code back in the payload, which we check with `CubDebug` below. [=](int& pv) { return PtxVersionUncached(pv, device); }, @@ -408,37 +405,23 @@ _CCCL_HOST inline cudaError_t PtxVersion(int& ptx_version, int device) } /** - * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). + * \brief Retrieves the PTX virtual architecture that will be used on the current device (major * 100 + minor * 10). * * \note This function may cache the result internally. - * * \note This function is thread safe. 
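 * (Editorial note, hedged: with the major * 100 + minor * 10 encoding, kernels compiled for a 9.0
 * virtual architecture would report ptx_version == 900.)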
*/ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int& ptx_version) { cudaError_t result = cudaErrorUnknown; - NV_IF_TARGET( - NV_IS_HOST, - (auto const device = CurrentDevice(); - auto const payload = GetPerDeviceAttributeCache()( - // If this call fails, then we get the error code back in the payload, - // which we check with `CubDebug` below. - [=](int& pv) { - return PtxVersionUncached(pv, device); - }, - device); - - if (!CubDebug(payload.error)) { ptx_version = payload.attribute; } - - result = payload.error;), - ( // NV_IS_DEVICE: - result = PtxVersionUncached(ptx_version);)); - + NV_IF_TARGET(NV_IS_HOST, + (result = PtxVersion(ptx_version, CurrentDevice());), + ( // NV_IS_DEVICE: + result = PtxVersionUncached(ptx_version);)); return result; } /** - * \brief Retrieves the SM version of \p device (major * 100 + minor * 10) + * \brief Retrieves the SM version (i.e. compute capability) of \p device (major * 100 + minor * 10) */ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice()) { @@ -464,10 +447,9 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int d } /** - * \brief Retrieves the SM version of \p device (major * 100 + minor * 10) + * \brief Retrieves the SM version (i.e. compute capability) of \p device (major * 100 + minor * 10). * * \note This function may cache the result internally. - * * \note This function is thread safe. */ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = CurrentDevice()) @@ -477,8 +459,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = NV_IF_TARGET( NV_IS_HOST, (auto const payload = GetPerDeviceAttributeCache()( - // If this call fails, then we get the error code back in - // the payload, which we check with `CubDebug` below. + // If this call fails, then we get the error code back in the payload, which we check with `CubDebug` below. [=](int& pv) { return SmVersionUncached(pv, device); }, @@ -565,9 +546,8 @@ CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream) CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) { has_uva = false; - cudaError_t error = cudaSuccess; int device = -1; - error = CubDebug(cudaGetDevice(&device)); + cudaError_t error = CubDebug(cudaGetDevice(&device)); if (cudaSuccess != error) { return error; diff --git a/cub/cub/util_math.cuh b/cub/cub/util_math.cuh index 454447b4b0d..e5b8444466d 100644 --- a/cub/cub/util_math.cuh +++ b/cub/cub/util_math.cuh @@ -80,6 +80,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT safe_add_bound_to_max(OffsetT lhs, O template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr NumeratorT DivideAndRoundUp(NumeratorT n, DenominatorT d) { + // TODO(bgruber): implement using ::cuda::ceil_div static_assert( cub::detail::is_integral_or_enum::value && cub::detail::is_integral_or_enum::value, "DivideAndRoundUp is only intended for integral types."); diff --git a/cub/cub/warp/warp_exchange.cuh b/cub/cub/warp/warp_exchange.cuh index 712d0a6bcd3..79f422f5abe 100644 --- a/cub/cub/warp/warp_exchange.cuh +++ b/cub/cub/warp/warp_exchange.cuh @@ -27,7 +27,7 @@ /** * @file - * The cub::WarpExchange class provides [collective](index.html#sec0) + * The cub::WarpExchange class provides [collective](../index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. 
*/ @@ -68,7 +68,7 @@ using InternalWarpExchangeImpl = } // namespace detail /** - * @brief The WarpExchange class provides [collective](index.html#sec0) + * @brief The WarpExchange class provides [collective](../index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. * * @tparam T @@ -94,10 +94,10 @@ using InternalWarpExchangeImpl = * partitioning of items across threads (where consecutive items belong to a * single thread). * - WarpExchange supports the following types of data exchanges: - * - Transposing between [blocked](index.html#sec5sec3) and - * [striped](index.html#sec5sec3) arrangements + * - Transposing between [blocked](../index.html#sec5sec3) and + * [striped](../index.html#sec5sec3) arrangements * - Scattering ranked items to a - * [striped arrangement](index.html#sec5sec3) + * [striped arrangement](../index.html#sec5sec3) * * @par A Simple Example * @par diff --git a/cub/examples/CMakeLists.txt b/cub/examples/CMakeLists.txt index 3865b6f057f..aa766336d3e 100644 --- a/cub/examples/CMakeLists.txt +++ b/cub/examples/CMakeLists.txt @@ -35,10 +35,6 @@ function(cub_add_example target_name_var example_name example_src cub_target) cub_configure_cuda_target(${example_target} RDC ${CUB_FORCE_RDC}) target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples") - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${example_target}) - endif() - # Add to the active configuration's meta target add_dependencies(${config_meta_target} ${example_target}) diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 2187c66f84c..48a0142801a 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -202,10 +202,6 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) else() target_compile_definitions(${config_c2h_target} PRIVATE C2H_HAS_CURAND=0) endif() - - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${config_c2h_target}) - endif() endif() # config_c2h_target if (CUB_SEPARATE_CATCH2) @@ -240,10 +236,6 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_link_options(${config_c2run_target} PRIVATE "-cuda") endif() - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${config_c2run_target}) - endif() - add_test(NAME ${config_c2run_target} COMMAND "$" ) @@ -265,10 +257,6 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_compile_definitions(${test_target} PRIVATE NVRTC_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}") endif() - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${test_target}) - endif() - if ("${test_target}" MATCHES "test.iterator") target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists endif() @@ -297,9 +285,6 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_link_libraries(${test_target} nvtx3-cpp) endif() - if (CUB_IN_THRUST) - thrust_fix_clang_nvcc_build_for(${test_target}) - endif() _cub_is_fail_test(is_fail_test "${test_src}") if (is_fail_test) diff --git a/cub/test/catch2_test_device_merge.cu b/cub/test/catch2_test_device_merge.cu new file mode 100644 index 00000000000..abc8b1a5ce8 --- /dev/null +++ b/cub/test/catch2_test_device_merge.cu @@ -0,0 +1,463 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + +#include + +#include +#include + +#include + +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" +#include + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergePairs, merge_pairs); +DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergeKeys, merge_keys); + +// TODO(bgruber): replace the following by the CUB device API directly, once we have figured out how to handle different +// offset types +namespace detail +{ +template > +CUB_RUNTIME_FUNCTION static cudaError_t merge_keys_custom_offset_type( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorIn1 keys_in1, + Offset num_keys1, + KeyIteratorIn2 keys_in2, + Offset num_keys2, + KeyIteratorOut keys_out, + CompareOp compare_op = {}, + cudaStream_t stream = 0) +{ + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys"); + return cub::detail::merge::dispatch_t< + KeyIteratorIn1, + cub::NullType*, + KeyIteratorIn2, + cub::NullType*, + KeyIteratorOut, + cub::NullType*, + Offset, + CompareOp>::dispatch(d_temp_storage, + temp_storage_bytes, + keys_in1, + nullptr, + num_keys1, + keys_in2, + nullptr, + num_keys2, + keys_out, + nullptr, + compare_op, + stream); +} + +template > +CUB_RUNTIME_FUNCTION static cudaError_t merge_pairs_custom_offset_type( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + KeyIteratorIn1 keys_in1, + ValueIteratorIn1 values_in1, + Offset num_pairs1, + KeyIteratorIn2 keys_in2, + ValueIteratorIn2 values_in2, + Offset num_pairs2, + KeyIteratorOut keys_out, + ValueIteratorOut values_out, + CompareOp compare_op = {}, + cudaStream_t stream = 0) +{ + CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs"); + return cub::detail::merge::dispatch_t< + KeyIteratorIn1, + ValueIteratorIn1, + KeyIteratorIn2, + ValueIteratorIn2, + KeyIteratorOut, + ValueIteratorOut, + Offset, + CompareOp>::dispatch(d_temp_storage, + temp_storage_bytes, + keys_in1, + values_in1, + num_pairs1, + keys_in2, + values_in2, + num_pairs2, + keys_out, + values_out, + compare_op, + stream); +} +} // namespace detail + +DECLARE_LAUNCH_WRAPPER(detail::merge_keys_custom_offset_type, merge_keys_custom_offset_type); +DECLARE_LAUNCH_WRAPPER(detail::merge_pairs_custom_offset_type, merge_pairs_custom_offset_type); + +using types = c2h::type_list; + +// gevtushenko: there is no code path in CUB and Thrust that leads to unsigned offsets, so let's save some compile time +using offset_types = c2h::type_list; + +template , + typename MergeKeys = decltype(::merge_keys)> +void test_keys(Offset size1 = 3623, Offset size2 = 6346, CompareOp compare_op = {}, MergeKeys merge_keys = ::merge_keys) +{ + CAPTURE(c2h::type_name(), c2h::type_name(), size1, size2); + + c2h::device_vector keys1_d(size1); + c2h::device_vector keys2_d(size2); + + c2h::gen(CUB_SEED(1), keys1_d); + c2h::gen(CUB_SEED(1), keys2_d); + + thrust::sort(c2h::device_policy, keys1_d.begin(), keys1_d.end(), compare_op); + thrust::sort(c2h::device_policy, keys2_d.begin(), keys2_d.end(), compare_op); + // CAPTURE(keys1_d, keys2_d); + + c2h::device_vector result_d(size1 + size2); + merge_keys(thrust::raw_pointer_cast(keys1_d.data()), + static_cast(keys1_d.size()), + thrust::raw_pointer_cast(keys2_d.data()), + static_cast(keys2_d.size()), + thrust::raw_pointer_cast(result_d.data()), + compare_op); + + c2h::host_vector keys1_h = keys1_d; + c2h::host_vector 
keys2_h = keys2_d; + c2h::host_vector reference_h(size1 + size2); + std::merge(keys1_h.begin(), keys1_h.end(), keys2_h.begin(), keys2_h.end(), reference_h.begin(), compare_op); + + // FIXME(bgruber): comparing std::vectors (slower than thrust vectors) but compiles a lot faster + CHECK((detail::to_vec(reference_h) == detail::to_vec(c2h::host_vector(result_d)))); +} + +CUB_TEST("DeviceMerge::MergeKeys key types", "[merge][device]", types) +{ + using key_t = c2h::get<0, TestType>; + using offset_t = int; + test_keys(); +} + +using large_type_fallb = c2h::custom_type_t::type>; +using large_type_vsmem = c2h::custom_type_t::type>; + +struct fallback_test_policy_hub +{ + struct max_policy : cub::ChainedPolicy<100, max_policy, max_policy> + { + using merge_policy = cub::detail::merge:: + agent_policy_t<128, 7, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_STORE_WARP_TRANSPOSE>; + }; +}; + +// TODO(bgruber): This test alone increases compile time from 1m16s to 8m43s. What's going on? +CUB_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_list) +{ + using key_t = c2h::get<0, TestType>; + using offset_t = int; + + constexpr auto agent_sm = sizeof(key_t) * 128 * 7; + constexpr auto fallback_sm = + sizeof(key_t) * cub::detail::merge::fallback_BLOCK_THREADS * cub::detail::merge::fallback_ITEMS_PER_THREAD; + static_assert(agent_sm > cub::detail::max_smem_per_block, + "key_t is not big enough to exceed SM and trigger fallback policy"); + static_assert( + ::cuda::std::is_same::value == (fallback_sm <= cub::detail::max_smem_per_block), + "SM consumption by fallback policy should fit into max_smem_per_block"); + + test_keys( + 3623, + 6346, + ::cuda::std::less{}, + [](const key_t* k1, offset_t s1, const key_t* k2, offset_t s2, key_t* r, ::cuda::std::less co) { + using dispatch_t = cub::detail::merge::dispatch_t< + const key_t*, + const cub::NullType*, + const key_t*, + const cub::NullType*, + key_t*, + cub::NullType*, + offset_t, + ::cuda::std::less, + fallback_test_policy_hub>; // use a fixed policy for this test so the needed shared memory is deterministic + + std::size_t temp_storage_bytes = 0; + dispatch_t::dispatch( + nullptr, temp_storage_bytes, k1, nullptr, s1, k2, nullptr, s2, r, nullptr, co, cudaStream_t{0}); + + c2h::device_vector temp_storage(temp_storage_bytes); + dispatch_t::dispatch( + thrust::raw_pointer_cast(temp_storage.data()), + temp_storage_bytes, + k1, + nullptr, + s1, + k2, + nullptr, + s2, + r, + nullptr, + co, + cudaStream_t{0}); + }); +} + +CUB_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]", offset_types) +{ + using key_t = int; + using offset_t = c2h::get<0, TestType>; + test_keys(3623, 6346, ::cuda::std::less<>{}, merge_keys_custom_offset_type); +} + +CUB_TEST("DeviceMerge::MergeKeys input sizes", "[merge][device]") +{ + using key_t = int; + using offset_t = int; + // TODO(bgruber): maybe less combinations + const auto size1 = offset_t{GENERATE(0, 1, 23, 123, 3234)}; + const auto size2 = offset_t{GENERATE(0, 1, 52, 556, 56767)}; + test_keys(size1, size2); +} + +// cannot put those in an anon namespace, or nvcc complains that the kernels have internal linkage +using unordered_t = c2h::custom_type_t; +struct order +{ + _CCCL_HOST_DEVICE auto operator()(const unordered_t& a, const unordered_t& b) const -> bool + { + return a.key < b.key; + } +}; + +CUB_TEST("DeviceMerge::MergeKeys no operator<", "[merge][device]") +{ + using key_t = unordered_t; + using offset_t = int; + test_keys(); +} + +namespace +{ +template +auto zip(Its... 
its) -> decltype(thrust::make_zip_iterator(its...)) +{ + return thrust::make_zip_iterator(its...); +} + +template +struct key_to_value +{ + template + _CCCL_HOST_DEVICE auto operator()(const Key& k) const -> Value + { + Value v{}; + convert(k, v, 0); + return v; + } + + template + _CCCL_HOST_DEVICE static void convert(const Key& k, Value& v, ...) + { + v = static_cast(k); + } + + template
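Editorial aside, not part of the patch: below is a hedged usage sketch of the new `cub::DeviceMerge::MergeKeys` entry point added by this PR, following the usual two-phase CUB pattern (size query, then run). Storage via Thrust device vectors; all names are local to the example, and CompareOp and the stream keep their documented defaults (less-than, stream 0).

```cpp
#include <cub/device/device_merge.cuh>

#include <thrust/device_vector.h>

#include <cstddef>
#include <vector>

void merge_keys_example()
{
  const std::vector<int> h_keys1{0, 2, 4, 6}; // already sorted
  const std::vector<int> h_keys2{1, 3, 5};    // already sorted
  thrust::device_vector<int> keys1(h_keys1.begin(), h_keys1.end());
  thrust::device_vector<int> keys2(h_keys2.begin(), h_keys2.end());
  thrust::device_vector<int> result(keys1.size() + keys2.size());

  // First call: d_temp_storage == nullptr, so only temp_storage_bytes is written.
  std::size_t temp_storage_bytes = 0;
  cub::DeviceMerge::MergeKeys(
    nullptr,
    temp_storage_bytes,
    thrust::raw_pointer_cast(keys1.data()),
    static_cast<int>(keys1.size()),
    thrust::raw_pointer_cast(keys2.data()),
    static_cast<int>(keys2.size()),
    thrust::raw_pointer_cast(result.data()));

  // Second call: same arguments plus the allocated temporary storage.
  thrust::device_vector<char> temp_storage(temp_storage_bytes);
  cub::DeviceMerge::MergeKeys(
    thrust::raw_pointer_cast(temp_storage.data()),
    temp_storage_bytes,
    thrust::raw_pointer_cast(keys1.data()),
    static_cast<int>(keys1.size()),
    thrust::raw_pointer_cast(keys2.data()),
    static_cast<int>(keys2.size()),
    thrust::raw_pointer_cast(result.data()));
  // result now holds {0, 1, 2, 3, 4, 5, 6}
}
```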