diff --git a/.github/workflows/build_and_test.yaml b/.github/workflows/build_and_test.yaml
new file mode 100644
index 0000000..8c75af5
--- /dev/null
+++ b/.github/workflows/build_and_test.yaml
@@ -0,0 +1,163 @@
+# Reusable workflow (workflow_call): computes the build/test matrices, builds
+# the wheels, tests them on GPU runners, and publishes 'branch' builds.
+name: build_and_test
+
+on:
+  workflow_call:
+    inputs:
+      build_type:
+        required: true
+        type: string
+
+permissions:
+  actions: read
+  checks: none
+  contents: read
+  deployments: none
+  discussions: none
+  id-token: write
+  issues: none
+  packages: read
+  pages: none
+  pull-requests: read
+  repository-projects: none
+  security-events: none
+  statuses: none
+
+jobs:
+  compute-matrices:
+    runs-on: ubuntu-latest
+    outputs:
+      BUILD_MATRIX: ${{ steps.compute-matrix.outputs.BUILD_MATRIX }}
+      TEST_MATRIX: ${{ steps.compute-matrix.outputs.TEST_MATRIX }}
+    steps:
+      - name: Compute Build Matrix
+        id: compute-matrix
+        run: |
+          set -eo pipefail
+
+          # please keep the matrices sorted in ascending order by the following:
+          #
+          #     [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER]
+          #
+          export BUILD_MATRIX="
+            # amd64
+            - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' }
+            - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.2.2', LINUX_VER: 'rockylinux8' }
+            # arm64
+            - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' }
+            - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.2.2', LINUX_VER: 'rockylinux8' }
+          "
+
+          BUILD_MATRIX="$(
+            yq -n -o json 'env(BUILD_MATRIX)' | \
+            jq -c '{include: .}'
+          )"
+
+          echo "BUILD_MATRIX=${BUILD_MATRIX}" | tee --append "${GITHUB_OUTPUT}"
+
+          export TEST_MATRIX="
+            # amd64
+            - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu20.04', gpu: 'a100', driver: 'latest' }
+            - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu22.04', gpu: 'a100', driver: 'latest' }
+            - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.2.2', LINUX_VER: 'ubuntu20.04', gpu: 'a100', driver: 'latest' }
+            # arm64
+            - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '11.8.0', LINUX_VER: 'ubuntu22.04', gpu: 'a100', driver: 'latest' }
+            - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu20.04', gpu: 'a100', driver: 'latest' }
+            - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.2.2', LINUX_VER: 'ubuntu22.04', gpu: 'a100', driver: 'latest' }
+          "
+
+          TEST_MATRIX="$(
+            yq -n -o json 'env(TEST_MATRIX)' | \
+            jq -c '{include: .}'
+          )"
+
+          echo "TEST_MATRIX=${TEST_MATRIX}" | tee --append "${GITHUB_OUTPUT}"
+  build:
+    name: build-${{ matrix.CUDA_VER }}, ${{ matrix.ARCH }}, ${{ matrix.LINUX_VER }}
+    needs: compute-matrices
+    strategy:
+      matrix: ${{ fromJSON(needs.compute-matrices.outputs.BUILD_MATRIX) }}
+    runs-on: "linux-${{ matrix.ARCH }}-cpu16"
+    container:
+      image: "rapidsai/ci-wheel:cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}"
+      env:
+        RAPIDS_BUILD_TYPE: ${{ inputs.build_type }}
+    steps:
+      - uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ vars.AWS_ROLE_ARN }}
+          aws-region: ${{ vars.AWS_REGION }}
+          role-duration-seconds: 900
+      - name: checkout code repo
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+      - name: Standardize repository information
+        uses: rapidsai/shared-actions/rapids-github-info@main
+      - name: Build and repair the wheel
+        run: ci/build_wheel.sh
+        env:
+          GH_TOKEN: ${{ github.token }}
+          RAPIDS_BUILD_TYPE: ${{ inputs.build_type }}
+        # Use a shell that loads the rc file so that we get the compiler settings
+        shell: bash -leo pipefail {0}
+  test:
+    name: test-${{ matrix.CUDA_VER }}, ${{ matrix.ARCH }}, ${{ matrix.LINUX_VER }}, ${{ matrix.gpu }}
+    needs: [compute-matrices, build]
+    strategy:
+      matrix: ${{ fromJSON(needs.compute-matrices.outputs.TEST_MATRIX) }}
+    runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.gpu }}-${{ matrix.driver }}-1"
+    container:
+      image: "rapidsai/citestwheel:cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}"
+      env:
+        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} # GPU jobs must set this container env variable
+        RAPIDS_BUILD_TYPE: ${{ inputs.build_type }}
+    steps:
+      - uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ vars.AWS_ROLE_ARN }}
+          aws-region: ${{ vars.AWS_REGION }}
+          role-duration-seconds: 900
+      - name: Run nvidia-smi to make sure GPU is working
+        run: nvidia-smi
+      - name: checkout code repo
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+      - name: Standardize repository information
+        uses: rapidsai/shared-actions/rapids-github-info@main
+      - name: Run tests
+        run: ci/test_wheel.sh
+        env:
+          GH_TOKEN: ${{ github.token }}
+          RAPIDS_BUILD_TYPE: ${{ inputs.build_type }}
+  publish:
+    if: ${{ inputs.build_type == 'branch' }}
+    needs: test
+    runs-on: linux-amd64-cpu4
+    container:
+      image: "rapidsai/ci-wheel:latest"
+      env:
+        RAPIDS_BUILD_TYPE: ${{ inputs.build_type }}
+    steps:
+      - uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ vars.AWS_ROLE_ARN }}
+          aws-region: ${{ vars.AWS_REGION }}
+          role-duration-seconds: 900
+      - name: checkout code repo
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+      - name: Get current date
+        id: date
+        run: |
+          echo "CURRENT_DATE=$(date --rfc-3339=date)" >> "${GITHUB_ENV}"
+      - name: Standardize repository information
+        uses: rapidsai/shared-actions/rapids-github-info@main
+      - name: Download wheels from downloads.rapids.ai and publish to anaconda repository
+        env:
+          RAPIDS_CONDA_TOKEN: ${{ secrets.CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN }}
+        run: |
+          rapids-wheels-anaconda ucx cpp
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index e3b9ba8..4c7871e 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -9,27 +9,10 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
   cancel-in-progress: true
 
-# TODO: I would love to not need RAPIDS shared workflows for these builds, but
-# for getting things stood up quickly that's the fastest route.
+
 jobs:
-  pr-builder:
-    needs:
-      - wheel-build-ucx
-      - wheel-tests-ucx
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06
-  wheel-build-ucx:
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
-    with:
-      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
-      build_type: pull-request
-      script: ci/build_wheel.sh
-  wheel-tests-ucx:
-    needs: wheel-build-ucx
+  build_and_test:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
+    uses: ./.github/workflows/build_and_test.yaml
     with:
-      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: pull-request
-      script: ci/test_wheel.sh
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index d4f5df3..dcc0ee1 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -17,4 +17,4 @@ sed -i -E "s/^name = \"${package_name}(.*)?\"$/name = \"${package_name}${PACKAGE
 python -m pip wheel "${package_dir}"/ -w "${package_dir}"/dist -vvv --no-deps --disable-pip-version-check
 
 python -m auditwheel repair -w ${package_dir}/final_dist --exclude "libcuda.so.1" --exclude "libnvidia-ml.so.1" --exclude "libucm.so.0" --exclude "libuct.so.0" --exclude "libucs.so.0" --exclude "libucp.so.0" ${package_dir}/dist/*
-RAPIDS_PY_WHEEL_NAME="${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
+RAPIDS_PY_WHEEL_NAME="ucx_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh
index 4299c12..a7f3ddf 100755
--- a/ci/test_wheel.sh
+++ b/ci/test_wheel.sh
@@ -7,6 +7,6 @@ package_name="libucx"
 
 WHEELHOUSE="${PWD}/dist/"
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp "${WHEELHOUSE}"
+RAPIDS_PY_WHEEL_NAME="ucx_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp "${WHEELHOUSE}"
 python -m pip install "${package_name}-${RAPIDS_PY_CUDA_SUFFIX}" --find-links "${WHEELHOUSE}"
-python -c "import libucx; libucx.load_library()"
+python -c "import libucx; libucx.load_library(); print('Loaded libucx libraries successfully!')"