[CI] Split XPU distributed UT into another job #5
name: Linux UT Test

on:
  workflow_call:
    inputs:
      pytorch:
        required: false
        type: string
        default: 'main'
        description: Pytorch branch/commit
      keep_torch_xpu_ops:
        required: false
        type: string
        default: 'false'
        description: Keep torch-xpu-ops pin. `true` means use the pinned commit
      triton:
        required: false
        type: string
        default: ''
        description: Triton commit. Uses the pytorch pinned commit by default
      ut:
        required: true
        type: string
        default: ''
        description: UT scope, comma-delimited. Valid values are `op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu,xpu_distributed`
      abi:
        required: false
        type: string
        default: '1'
        description: ABI version. Defaults to 1
      python:
        required: false
        type: string
        default: '3.10'
        description: Python version
      runner:
        required: true
        type: string
        default: 'linux.idc.xpu'
        description: Runner label
      driver:
        required: false
        type: string
        default: 'lts'
        description: Driver lts/rolling

permissions: read-all
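
# The two jobs below are mutually exclusive: `ut_test` runs unless the requested
# UT scope contains `xpu_distributed`, which is handled by the dedicated
# `distributed_ut_test` job instead.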
jobs:
  ut_test:
    runs-on: ${{ inputs.runner }}
    if: ${{ !contains(inputs.ut, 'xpu_distributed') }}
    timeout-minutes: 900
    env:
      NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
      DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Prepare Stock Pytorch
        run: |
          pwd
          which conda && conda clean -ay
          conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \
            rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
          conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ../ && rm -rf pytorch
          pip install requests
          git clone https://github.com/pytorch/pytorch pytorch
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
            # apply PRs for stock pytorch
            python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
            git status && git show -s
            git submodule sync && git submodule update --init --recursive
            if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
              echo "Don't replace torch-xpu-ops!"
            else
              rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
              # Workaround for torch-xpu-ops ci test
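              # The sed below swaps the pinned-commit checkout in caffe2/CMakeLists.txt
              # for a no-op (`git log -n 1`) so the build uses the torch-xpu-ops source
              # copied in above rather than re-fetching the pinned commit.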
              sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
            fi
          fi
      - name: Triton Installation
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ../pytorch
          TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
          if [ -z "${{ inputs.triton }}" ]; then
            TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)"
          else
            TRITON_COMMIT_ID="${{ inputs.triton }}"
          fi
          echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
          fi
      - name: Download Pytorch wheel
        if: ${{ inputs.pytorch != 'nightly_wheel' }}
        uses: actions/download-artifact@v4
        with:
          name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
          path: ${{ github.workspace }}
      - name: Install Pytorch XPU
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          pip install mkl-static==2025.0.1 mkl-include==2025.0.1
          if [[ ${{ inputs.abi }} == '0' ]]; then
            export _GLIBCXX_USE_CXX11_ABI=0
          else
            export _GLIBCXX_USE_CXX11_ABI=1
          fi
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            cd ../pytorch
            export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
            pip install -r requirements.txt
            pip install --force-reinstall ${{ github.workspace }}/torch*.whl
            git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
          else
            pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
            TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
            cd ../pytorch
            git reset --hard && git checkout ${TORCH_COMMIT_ID}
            TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
            rm -rf third_party/torch-xpu-ops
            git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
            cd third_party/torch-xpu-ops
            git checkout ${TORCH_XPU_OPS_COMMIT}
            cd ../..
            python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py
          fi
          pip install -r .ci/docker/requirements-ci.txt
      - name: Torch Config
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          python -c "import torch; print(torch.__config__.show())"
          python -c "import torch; print(torch.__config__.parallel_info())"
          python -c "import torch; print(torch.xpu.device_count())"
          python -c "import triton; print(triton.__version__)"
          cd ..
          python pytorch/torch/utils/collect_env.py
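          # Clear Inductor and Triton caches left by previous runs so nothing stale is reused.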
          rm -rf /tmp/torchinductor_*
          rm -rf ~/.triton/cache
      - name: Run XPU OP Regression
        if: contains(inputs.ut, 'op_regression') || github.event_name == 'schedule'
        run: |
          cd ${{ github.workspace }}
          mkdir -p ut_log/op_regression
          xpu-smi discovery
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ${{ github.workspace }}
          cd test/regressions
          pip install pytest
          timeout 8000 pytest -v 2>${{ github.workspace }}/ut_log/op_regression/op_regression_test_error.log | tee ${{ github.workspace }}/ut_log/op_regression/op_regression_test.log
      - name: Run XPU OP Regressions test on device 1
        if: contains(inputs.ut, 'op_regression_dev1') || github.event_name == 'schedule'
        run: |
          cd ${{ github.workspace }}
          mkdir -p ut_log/op_regression_dev1
          xpu-smi discovery
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
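          # Temporarily lift the device affinity mask so device 1 is visible to this test;
          # the original mask is restored after the run.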
          export ZE_AFFINITY_MASK_OLD=${ZE_AFFINITY_MASK}
          unset ZE_AFFINITY_MASK
          cd ${{ github.workspace }}
          cd test/regressions
          pip install pytest
          timeout 8000 pytest -v test_operation_on_device_1.py 2>${{ github.workspace }}/ut_log/op_regression_dev1/op_regression_dev1_test_error.log | tee ${{ github.workspace }}/ut_log/op_regression_dev1/op_regression_dev1_test.log
          export ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK_OLD}
      - name: Run XPU OP Extended UT
        if: contains(inputs.ut, 'op_extended') || github.event_name == 'schedule'
        run: |
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
          export PYTORCH_TEST_WITH_SLOW=1
          cd ${{ github.workspace }}
          mkdir -p ut_log/op_extended
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/
          timeout 10000 python run_test_with_skip.py 2>${{ github.workspace }}/ut_log/op_extended/op_extended_test_error.log | tee ${{ github.workspace }}/ut_log/op_extended/op_extended_test.log
      - name: Run XPU OP UT
        if: contains(inputs.ut, 'op_ut') || github.event_name == 'schedule'
        run: |
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
          export PYTORCH_ENABLE_XPU_FALLBACK=1
          export PYTORCH_TEST_WITH_SLOW=1
          cd ${{ github.workspace }}
          mkdir -p ut_log/op_ut
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu
          timeout 10000 python run_test_with_skip.py 2>${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test_error.log | tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log
          # Cases run with an on-demand white list, since some suites are too
          # slow to go through all operators on CPU. So add cases on demand
          # when the XPU implementation is done.
          # test_foreach, test_decomp
          timeout 10000 python run_test_with_only.py 2>${{ github.workspace }}/ut_log/op_ut/op_ut_with_only_test_error.log | tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_only_test.log
      - name: Run Torch XPU UT
        if: contains(inputs.ut, 'torch_xpu') || github.event_name == 'schedule'
        run: |
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ${{ github.workspace }}
          mkdir -p ut_log/torch_xpu
          cd ../pytorch
          TEST_REPORTS_DIR=$(pwd)/test/test-reports
          rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR"
          # Run Pytorch XPU binary UT
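          # Skip unexpanded glob patterns and shared/static libraries; everything else
          # under build/bin matching *xpu*/*sycl* is treated as a gtest binary.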
          for xpu_case in build/bin/*{xpu,sycl}*; do
            if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then
              case_name=$(basename "$xpu_case")
              echo "Testing ${case_name} ..."
              "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml 2>${{ github.workspace }}/ut_log/torch_xpu/binary_ut_torch_xpu_${case_name}_test_error.log | tee ${{ github.workspace }}/ut_log/torch_xpu/binary_ut_torch_xpu_${case_name}_test.log
            fi
          done
          # Run Pytorch XPU python UT
          export PYTORCH_TEST_WITH_SLOW=1
          export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
          test_cmd="python test/run_test.py --include "
          # All Inductor UT under test/inductor
          for test in $(ls test/inductor | grep test);
          do
            test_cmd="${test_cmd} inductor/$test";
          done
          # All xpu ut under test/xpu
          for test in $(ls test/xpu | grep test);
          do
            test_cmd="${test_cmd} xpu/$test";
          done
          if [ -f "test/test_xpu.py" ]; then
            test_cmd="${test_cmd} test_xpu.py"
          fi
          eval $test_cmd 2>${{ github.workspace }}/ut_log/torch_xpu/torch_xpu_test_error.log | tee ${{ github.workspace }}/ut_log/torch_xpu/torch_xpu_test.log
      - name: UT Test Results Check
        shell: bash
        run: |
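          # `contains` sets $contains_status to either an informational echo (recognized
          # suite) or `continue` (unknown suite); the loop below executes it verbatim.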
          function contains() {
            contains_status="echo 'Start $2 ...'"
            {
              [[ $1 =~ (^|,)$2($|,) ]]
            } || {
              echo "[Warning] $2 is not a supported type! Skipped!"
              contains_status="continue"
            }
          }
          set -xe
          echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          for ut_suite in $(echo ${{ inputs.ut }} |sed 's/,/ /g')
          do
            contains "op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu,xpu_distributed" $ut_suite
            $contains_status
            cd ${{ github.workspace }}/ut_log/${ut_suite}
            cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./
            bash ut_result_check.sh ${ut_suite}
          done
      - name: Upload Inductor XPU UT Log
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-${{ env.UT_NAME }}
          path: ${{ github.workspace }}/ut_log
  distributed_ut_test:
    runs-on: ${{ inputs.runner }}
    if: contains(inputs.ut, 'xpu_distributed')
    timeout-minutes: 900
    env:
      NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
      DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Prepare Stock Pytorch
        run: |
          pwd
          which conda && conda clean -ay
          conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \
            rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
          conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ../ && rm -rf pytorch
          pip install requests
          git clone https://github.com/pytorch/pytorch pytorch
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
            # apply PRs for stock pytorch
            python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
            git status && git show -s
            git submodule sync && git submodule update --init --recursive
            if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
              echo "Don't replace torch-xpu-ops!"
            else
              rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
              # Workaround for torch-xpu-ops ci test
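              # Same workaround as in `ut_test`: replace the pinned-commit checkout in
              # caffe2/CMakeLists.txt with a no-op so the copied-in torch-xpu-ops source is used.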
              sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
            fi
          fi
      - name: Triton Installation
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ../pytorch
          TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
          if [ -z "${{ inputs.triton }}" ]; then
            TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)"
          else
            TRITON_COMMIT_ID="${{ inputs.triton }}"
          fi
          echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
          fi
      - name: Download Pytorch wheel
        if: ${{ inputs.pytorch != 'nightly_wheel' }}
        uses: actions/download-artifact@v4
        with:
          name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
          path: ${{ github.workspace }}
      - name: Install Pytorch XPU
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          pip install mkl-static==2025.0.1 mkl-include==2025.0.1
          if [[ ${{ inputs.abi }} == '0' ]]; then
            export _GLIBCXX_USE_CXX11_ABI=0
          else
            export _GLIBCXX_USE_CXX11_ABI=1
          fi
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            cd ../pytorch
            export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
            pip install -r requirements.txt
            pip install --force-reinstall ${{ github.workspace }}/torch*.whl
            git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
          else
            pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
            TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
            cd ../pytorch
            git reset --hard && git checkout ${TORCH_COMMIT_ID}
            TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
            rm -rf third_party/torch-xpu-ops
            git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
            cd third_party/torch-xpu-ops
            git checkout ${TORCH_XPU_OPS_COMMIT}
            cd ../..
            python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py
          fi
          pip install -r .ci/docker/requirements-ci.txt
      - name: Torch Config
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          python -c "import torch; print(torch.__config__.show())"
          python -c "import torch; print(torch.__config__.parallel_info())"
          python -c "import torch; print(torch.xpu.device_count())"
          python -c "import triton; print(triton.__version__)"
          cd ..
          python pytorch/torch/utils/collect_env.py
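          # Clear Inductor and Triton caches left by previous runs so nothing stale is reused.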
          rm -rf /tmp/torchinductor_*
          rm -rf ~/.triton/cache
      - name: Run Torch XPU Distributed UT
        run: |
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
          pip install pytest
          cd ${{ github.workspace }}
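          # Relax Yama ptrace restrictions for the duration of the distributed run
          # (e.g. so tooling can attach to hung worker processes); the original value
          # is saved and restored below.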
          sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk
          echo "0" | sudo tee /proc/sys/kernel/yama/ptrace_scope
          mkdir -p ut_log/xpu_distributed
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu
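          # Sanity-check that the XCCL distributed backend is available before launching the suite.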
          python -c "import torch;print(torch.distributed.is_xccl_available())"
          timeout 10000 python run_distributed.py 2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
          cd ${{ github.workspace }}
          sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope
      - name: UT Test Results Check
        shell: bash
        run: |
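          # `contains` sets $contains_status to either an informational echo (recognized
          # suite) or `continue` (unknown suite); the loop below executes it verbatim.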
          function contains() {
            contains_status="echo 'Start $2 ...'"
            {
              [[ $1 =~ (^|,)$2($|,) ]]
            } || {
              echo "[Warning] $2 is not a supported type! Skipped!"
              contains_status="continue"
            }
          }
          set -xe
          echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          for ut_suite in $(echo ${{ inputs.ut }} |sed 's/,/ /g')
          do
            contains "op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu,xpu_distributed" $ut_suite
            $contains_status
            cd ${{ github.workspace }}/ut_log/${ut_suite}
            cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./
            bash ut_result_check.sh ${ut_suite}
          done
      - name: Upload Inductor XPU UT Log
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-${{ env.UT_NAME }}
          path: ${{ github.workspace }}/ut_log