From 6d558fc8843e08ad2875af16b56d42f4dd0d6ffb Mon Sep 17 00:00:00 2001
From: Maksim Levental
Date: Wed, 22 Jan 2025 11:00:05 -0500
Subject: [PATCH] Build Zoom wheel

Add a GitHub Actions workflow (build_zoom_backend.yml) that creates a
venv, runs build.sh, repairs the resulting wheel with auditwheel, runs
the smoke test and the device tests, and uploads dist/torch*.whl both
as a workflow artifact and to the "latest" release.

build.sh no longer runs `git clean` or the tests itself; it sets
ONNX_ML and PYTORCH_ROCM_ARCH, activates the venv, and runs
`setup.py develop` followed by `setup.py bdist_wheel`.

test.sh runs test/test_torch.py for TestTorchDeviceTypePRIVATEUSEONE,
writes the output to test.log, and summarizes unimplemented aten
operators and error messages (with counts) into separate log files.

---
Review notes (ignored by `git am`):
  * The "Test" step now activates the venv created by the build step,
    matching the "Audit" step; each `run:` block is a separate shell.
  * The "Audit" step installs patchelf non-interactively.
  * test.sh strips the "...Error: " prefix with `sed -E`; the previous
    expression used ERE grouping in BRE mode and never matched.
  * `index` blob ids below are those of the original commit.

 .github/workflows/build_zoom_backend.yml | 136 +++++++++++++++++++++++
 build.sh                                 |  18 ++-
 test.sh                                  |  59 ++++++++++
 zoom_extension/examples/test.py          |  10 ++
 zoom_extension/test/pytorch_test_base.py |  35 ++++++
 5 files changed, 247 insertions(+), 11 deletions(-)
 create mode 100644 .github/workflows/build_zoom_backend.yml
 mode change 100644 => 100755 build.sh
 create mode 100755 test.sh
 create mode 100644 zoom_extension/examples/test.py
 create mode 100644 zoom_extension/test/pytorch_test_base.py

diff --git a/.github/workflows/build_zoom_backend.yml b/.github/workflows/build_zoom_backend.yml
new file mode 100644
index 00000000000000..66cfb12e3e0160
--- /dev/null
+++ b/.github/workflows/build_zoom_backend.yml
@@ -0,0 +1,136 @@
+name: "Build PyTorch"
+
+on:
+  workflow_dispatch:
+    inputs:
+      force_debug_with_tmate:
+        type: boolean
+        description: 'Run the build with tmate session'
+        required: false
+        default: false
+      debug_with_tmate:
+        type: boolean
+        description: 'Run the build with a tmate session ONLY in case of failure'
+        required: false
+        default: false
+  pull_request:
+  push:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: "ubuntu-22.04"
+            runs-on: "mi300"
+            # container: "rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0"
+            # runs-on: "nod-ai-shared-cpubuilder-manylinux-x86_64"
+
+    runs-on: ${{ matrix.runs-on }}
+
+    name: ${{ matrix.name }}
+
+    env:
+      CACHE_DIR: ${{ github.workspace }}/.container-cache
+      # either the PR number or `branch-N` where N always increments
+      CACHE_KEY: linux-build-test-cpp-asserts-manylinux-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }}
+
+    defaults:
+      run:
+        shell: bash
+
+    permissions:
+      id-token: write
+      contents: write
+
+    container:
+      image: ${{ matrix.container }}
+
+    steps:
+      - name: "Check out repository"
+        uses: actions/checkout@v4.2.2
+        with:
+          submodules: true
+
+      - name: Enable cache
+        uses: actions/cache/restore@v3
+        with:
+          path: ${{ env.CACHE_DIR }}
+          key: ${{ env.CACHE_KEY }}
+          restore-keys: linux-build-test-cpp-
+
+      - name: "Build PyTorch"
+        id: build
+        run: |
+
+          export CCACHE_DIR="${{ env.CACHE_DIR }}"
+          export CMAKE_C_COMPILER_LAUNCHER=ccache
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export CCACHE_SLOPPINESS=include_file_ctime,include_file_mtime,time_macros
+
+          python -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+          ./build.sh
+
+      - name: "Audit"
+        id: audit
+        run: |
+
+          sudo apt-get install -y patchelf
+          source venv/bin/activate
+          pip install auditwheel
+          auditwheel repair -w dist --plat manylinux_2_39_x86_64 dist/torch*
+
+      - name: "Test"
+        id: test
+        run: |
+
+          source venv/bin/activate
+          # smoke test
+          python zoom_extension/examples/test.py
+          # device tests
+          PYTORCH_TEST_WITH_SLOW=1 TORCH_TEST_DEVICES=zoom_extension/test/pytorch_test_base.py ./test.sh
+
+      - name: Save cache
+        uses: actions/cache/save@v3
+        if: ${{ !cancelled() }}
+        with:
+          path: ${{ env.CACHE_DIR }}
+          key: ${{ env.CACHE_KEY }}
+
+      - name: Upload artifacts
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.name }}_artifact
+          path: dist
+          if-no-files-found: warn
+
+      - name: Release current commit
+        if: ${{ !cancelled() }}
+        uses: ncipollo/release-action@v1.12.0
+        with:
+          artifacts: "dist/torch*.whl"
+          token: "${{ secrets.GITHUB_TOKEN }}"
+          tag: "latest"
+          name: "latest"
+          removeArtifacts: false
+          allowUpdates: true
+          replacesArtifacts: true
+          makeLatest: true
+
+      - name: "Setup tmate session"
+        if: ${{ (failure() && inputs.debug_with_tmate) || inputs.force_debug_with_tmate }}
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          limit-access-to-actor: true
+          install-dependencies: ${{ startsWith(matrix.runs-on, 'macos') || startsWith(matrix.runs-on, 'windows') }}
diff --git a/build.sh b/build.sh
old mode 100644
new mode 100755
index 74897f8830e56a..b658f848f7b4ca
--- a/build.sh
+++ b/build.sh
@@ -1,9 +1,6 @@
 #!/bin/bash
 
 rm -rf build
-git clean -fdx -e .idea
-git clean -fdX -e .idea
-
 export USE_ZOOM=1
 export USE_ROCM=0
 
@@ -118,13 +115,12 @@ export USE_VULKAN_FP16_INFERENCE=0
 export USE_VULKAN_RELAXED_PRECISI0=0
 export USE_XNNPACK=0
 export USE_XPU=0
+export ONNX_ML=0
 
-# for the ligerllama example we need distributed and tensorpipe, only because
-# huggingface model.generate insists on querying torch.distributed and distributed relies on tensorpipe
-# this could be a factor of nod-pytorch being out of date with upstream:
-# https://github.com/pytorch/pytorch/issues/97397
-
+export PYTORCH_ROCM_ARCH="gfx90a;gfx940;gfx941;gfx942;gfx1100;"
+source venv/bin/activate
 python setup.py develop
-python zoom_extension/examples/test.py
-PYTORCH_TEST_WITH_SLOW=1 TORCH_TEST_DEVICES=zoom_extension/test/pytorch_test_base.py ./test.sh
-python setup.py bdist_wheel
\ No newline at end of file
+python setup.py bdist_wheel
+
+#python zoom_extension/examples/test.py
+#PYTORCH_TEST_WITH_SLOW=1 TORCH_TEST_DEVICES=zoom_extension/test/pytorch_test_base.py ./test.sh
diff --git a/test.sh b/test.sh
new file mode 100755
index 00000000000000..d67d06a332de94
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+log_file="test.log"
+bak_file="test.log.bak"
+output_file="zoom_unimplemented_operators.log"
+bak_out="zoom_unimplemented_operators.log.bak"
+error_file="zoom_test_errors.log"
+bak_err="zoom_test_errors.log.bak"
+
+# backup logs
+[ -f $log_file ] && cp $log_file $bak_file
+[ -f $output_file ] && cp $output_file $bak_out
+[ -f $error_file ] && cp $error_file $bak_err
+
+python test/test_torch.py --run-parallel 0 -k TestTorchDeviceTypePRIVATEUSEONE --verbose &> $log_file
+#python test/test_ops.py -k TestCommonPRIVATEUSEONE
+#python test/test_ops.py -k TestCommonPRIVATEUSEONE.test_compare_cpu --verbose &> $log_file
+#python test/test_ops.py -k TestCommonPRIVATEUSEONE.test_numpy_ref --verbose &> $log_file
+
+## Find Unimplemented Operator Errors from failing tests
+# Pattern to search for
+pattern="Could not run 'aten::[^']*' with arguments from the 'zoom' backend"
+
+# Extract aten operators, count frequencies, sort by frequency (descending), and save to output file
+grep -oP "$pattern" "$log_file" |
+sed -n "s/.*'aten::\([^']*\)'.*/\1/p" |
+sort |
+uniq -c |
+sort -rn |
+sed 's/^ *//; s/ /\t/' > "$output_file"
+
+# Count total matches
+total_matches=$(grep -cP "$pattern" "$log_file")
+
+# Append total matches to the output file
+echo -e "\nTotal unimplemented operator failures: $total_matches" >> "$output_file"
+echo "A list of unimplemented operators has been saved to $output_file"
+
+## Find errors from failing tests
+# Extract error messages, count frequencies, sort by frequency (descending), and save to output file
+# Pattern to search for
+pattern="^.*Error: (?!test)(.+?)(?=\n|$)"
+
+grep -oP "$pattern" "$log_file" |
+sed -E 's/^(.*Error): //' |
+awk '{print substr($0, 1, 100)}' | # Limit to first 100 characters
+sort |
+uniq -c |
+sort -rn |
+sed 's/^ *//; s/ /\t/' > "$error_file"
+
+# Count total matches
+total_matches=$(grep -cP "$pattern" "$log_file")
+
+# Append total matches to the output file
+echo -e "\nTotal test errors failures: $total_matches" >> "$error_file"
+echo "A list of test errors has been saved to $error_file"
+
+echo "Test logs have been saved to $log_file"
\ No newline at end of file
diff --git a/zoom_extension/examples/test.py b/zoom_extension/examples/test.py
new file mode 100644
index 00000000000000..e27c0c39620041
--- /dev/null
+++ b/zoom_extension/examples/test.py
@@ -0,0 +1,10 @@
+import torch.zoom
+
+torch.utils.rename_privateuse1_backend("zoom")
+# TODO: figure this out
+unsupported_dtypes = None
+torch.utils.generate_methods_for_privateuse1_backend(
+    unsupported_dtype=unsupported_dtypes
+)
+x = torch.empty(5, device="zoom:0", dtype=torch.int64)
+print(x)
diff --git a/zoom_extension/test/pytorch_test_base.py b/zoom_extension/test/pytorch_test_base.py
new file mode 100644
index 00000000000000..d4358f56e261d4
--- /dev/null
+++ b/zoom_extension/test/pytorch_test_base.py
@@ -0,0 +1,35 @@
+import torch
+import torch.zoom
+from typing import ClassVar
+
+torch.utils.rename_privateuse1_backend('zoom')
+unsupported_dtypes = None
+torch.utils.generate_methods_for_privateuse1_backend(unsupported_dtype=unsupported_dtypes)
+
+class ZoomTestBase(DeviceTypeTestBase):
+    device_type = 'privateuseone'
+    primary_device: ClassVar[str]
+
+    @classmethod
+    def get_primary_device(cls):
+        return cls.primary_device
+
+
+    @classmethod
+    def get_all_devices(cls):
+        primary_device_idx = int(cls.get_primary_device().split(':')[1])
+        num_devices = torch.zoom.device_count()
+
+        prim_device = cls.get_primary_device()
+        zoom_str = 'zoom:{0}'
+        non_primary_devices = [zoom_str.format(idx) for idx in range(num_devices) if idx != primary_device_idx]
+        return [prim_device] + non_primary_devices
+
+    @classmethod
+    def setUpClass(cls):
+        # Force Zoom Init
+        t = torch.ones(1, device='zoom')
+        # Acquires the current device as the primary (test) device
+        cls.primary_device = f'zoom:{torch.zoom.current_device()}'
+
+TEST_CLASS = ZoomTestBase
\ No newline at end of file