[CI] Split XPU distributed UT into another job #5
name: Linux UT Test

on:
  workflow_call:
    inputs:
      pytorch:
        required: false
        type: string
        default: 'main'
        description: Pytorch branch/commit
      keep_torch_xpu_ops:
        required: false
        type: string
        default: 'false'
        description: Keep torch-xpu-ops pin. `true` means use the pinned commit
      triton:
        required: false
        type: string
        default: ''
        description: Triton commit. Uses the pytorch pinned commit by default
      ut:
        required: true
        type: string
        default: ''
        description: UT scope, comma-delimited. Valid values are `op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu,xpu_distributed`
      abi:
        required: false
        type: string
        default: '1'
        description: ABI version. Defaults to 1
      python:
        required: false
        type: string
        default: '3.10'
        description: Python version
      runner:
        required: true
        type: string
        default: 'linux.idc.xpu'
        description: Runner label
      driver:
        required: false
        type: string
        default: 'lts'
        description: Driver lts/rolling

permissions: read-all
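
# The two jobs below are mutually exclusive: `ut_test` runs unless the requested
# UT scope contains `xpu_distributed`, which is handled by the dedicated
# `distributed_ut_test` job instead.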
jobs:
  ut_test:
    runs-on: ${{ inputs.runner }}
    if: ${{ !contains(inputs.ut, 'xpu_distributed') }}
    timeout-minutes: 900
    env:
      NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
      DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Prepare Stock Pytorch
        run: |
          pwd
          which conda && conda clean -ay
          conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \
            rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
          conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ../ && rm -rf pytorch
          pip install requests
          git clone https://github.com/pytorch/pytorch pytorch
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
            # apply PRs for stock pytorch
            python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
            git status && git show -s
            git submodule sync && git submodule update --init --recursive
            if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
              echo "Don't replace torch-xpu-ops!"
            else
              rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
              # Workaround for torch-xpu-ops ci test
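              # The sed below swaps the pinned-commit checkout in caffe2/CMakeLists.txt
              # for a no-op (`git log -n 1`) so the build uses the torch-xpu-ops source
              # copied in above rather than re-fetching the pinned commit.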
              sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
            fi
          fi
      - name: Triton Installation
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ../pytorch
          TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
          if [ -z "${{ inputs.triton }}" ]; then
            TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)"
          else
            TRITON_COMMIT_ID="${{ inputs.triton }}"
          fi
          echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
          fi
      - name: Download Pytorch wheel
        if: ${{ inputs.pytorch != 'nightly_wheel' }}
        uses: actions/download-artifact@v4
        with:
          name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
          path: ${{ github.workspace }}
      - name: Install Pytorch XPU
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          pip install mkl-static==2025.0.1 mkl-include==2025.0.1
          if [[ ${{ inputs.abi }} == '0' ]]; then
            export _GLIBCXX_USE_CXX11_ABI=0
          else
            export _GLIBCXX_USE_CXX11_ABI=1
          fi
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            cd ../pytorch
            export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
            pip install -r requirements.txt
            pip install --force-reinstall ${{ github.workspace }}/torch*.whl
            git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
          else
            pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
            TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
            cd ../pytorch
            git reset --hard && git checkout ${TORCH_COMMIT_ID}
            TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
            rm -rf third_party/torch-xpu-ops
            git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
            cd third_party/torch-xpu-ops
            git checkout ${TORCH_XPU_OPS_COMMIT}
            cd ../..
            python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py
          fi
          pip install -r .ci/docker/requirements-ci.txt
      - name: Torch Config
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          python -c "import torch; print(torch.__config__.show())"
          python -c "import torch; print(torch.__config__.parallel_info())"
          python -c "import torch; print(torch.xpu.device_count())"
          python -c "import triton; print(triton.__version__)"
          cd ..
          python pytorch/torch/utils/collect_env.py
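          # Clear Inductor and Triton caches left by previous runs so nothing stale is reused.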
          rm -rf /tmp/torchinductor_*
          rm -rf ~/.triton/cache
      - name: Run XPU OP Regression
        if: contains(inputs.ut, 'op_regression') || github.event_name == 'schedule'
        run: |
          cd ${{ github.workspace }}
          mkdir -p ut_log/op_regression
          xpu-smi discovery
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ${{ github.workspace }}
          cd test/regressions
          pip install pytest
          timeout 8000 pytest -v 2>${{ github.workspace }}/ut_log/op_regression/op_regression_test_error.log | tee ${{ github.workspace }}/ut_log/op_regression/op_regression_test.log
      - name: Run XPU OP Regressions test on device 1
        if: contains(inputs.ut, 'op_regression_dev1') || github.event_name == 'schedule'
        run: |
          cd ${{ github.workspace }}
          mkdir -p ut_log/op_regression_dev1
          xpu-smi discovery
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
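          # Temporarily lift the device affinity mask so device 1 is visible to this test;
          # the original mask is restored after the run.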
          export ZE_AFFINITY_MASK_OLD=${ZE_AFFINITY_MASK}
          unset ZE_AFFINITY_MASK
          cd ${{ github.workspace }}
          cd test/regressions
          pip install pytest
          timeout 8000 pytest -v test_operation_on_device_1.py 2>${{ github.workspace }}/ut_log/op_regression_dev1/op_regression_dev1_test_error.log | tee ${{ github.workspace }}/ut_log/op_regression_dev1/op_regression_dev1_test.log
          export ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK_OLD}
      - name: Run XPU OP Extended UT
        if: contains(inputs.ut, 'op_extended') || github.event_name == 'schedule'
        run: |
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
          export PYTORCH_TEST_WITH_SLOW=1
          cd ${{ github.workspace }}
          mkdir -p ut_log/op_extended
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/
          timeout 10000 python run_test_with_skip.py 2>${{ github.workspace }}/ut_log/op_extended/op_extended_test_error.log | tee ${{ github.workspace }}/ut_log/op_extended/op_extended_test.log
      - name: Run XPU OP UT
        if: contains(inputs.ut, 'op_ut') || github.event_name == 'schedule'
        run: |
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
          export PYTORCH_ENABLE_XPU_FALLBACK=1
          export PYTORCH_TEST_WITH_SLOW=1
          cd ${{ github.workspace }}
          mkdir -p ut_log/op_ut
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu
          timeout 10000 python run_test_with_skip.py 2>${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test_error.log | tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log
          # Cases run with an on-demand white list, since some suites are too
          # slow to go through all operators on CPU. So add cases on demand
          # when the XPU implementation is done.
          # test_foreach, test_decomp
          timeout 10000 python run_test_with_only.py 2>${{ github.workspace }}/ut_log/op_ut/op_ut_with_only_test_error.log | tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_only_test.log
      - name: Run Torch XPU UT
        if: contains(inputs.ut, 'torch_xpu') || github.event_name == 'schedule'
        run: |
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ${{ github.workspace }}
          mkdir -p ut_log/torch_xpu
          cd ../pytorch
          TEST_REPORTS_DIR=$(pwd)/test/test-reports
          rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR"
          # Run Pytorch XPU binary UT
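          # Skip unexpanded glob patterns and shared/static libraries; everything else
          # under build/bin matching *xpu*/*sycl* is treated as a gtest binary.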
          for xpu_case in build/bin/*{xpu,sycl}*; do
            if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then
              case_name=$(basename "$xpu_case")
              echo "Testing ${case_name} ..."
              "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml 2>${{ github.workspace }}/ut_log/torch_xpu/binary_ut_torch_xpu_${case_name}_test_error.log | tee ${{ github.workspace }}/ut_log/torch_xpu/binary_ut_torch_xpu_${case_name}_test.log
            fi
          done
          # Run Pytorch XPU python UT
          export PYTORCH_TEST_WITH_SLOW=1
          export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
          test_cmd="python test/run_test.py --include "
          # All Inductor UT under test/inductor
          for test in $(ls test/inductor | grep test);
          do
            test_cmd="${test_cmd} inductor/$test";
          done
          # All xpu ut under test/xpu
          for test in $(ls test/xpu | grep test);
          do
            test_cmd="${test_cmd} xpu/$test";
          done
          if [ -f "test/test_xpu.py" ]; then
            test_cmd="${test_cmd} test_xpu.py"
          fi
          eval $test_cmd 2>${{ github.workspace }}/ut_log/torch_xpu/torch_xpu_test_error.log | tee ${{ github.workspace }}/ut_log/torch_xpu/torch_xpu_test.log
      - name: UT Test Results Check
        shell: bash
        run: |
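          # `contains` sets $contains_status to either an informational echo (recognized
          # suite) or `continue` (unknown suite); the loop below executes it verbatim.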
          function contains() {
            contains_status="echo 'Start $2 ...'"
            {
              [[ $1 =~ (^|,)$2($|,) ]]
            } || {
              echo "[Warning] $2 is not a supported type! Skipped!"
              contains_status="continue"
            }
          }
          set -xe
          echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          for ut_suite in $(echo ${{ inputs.ut }} |sed 's/,/ /g')
          do
            contains "op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu,xpu_distributed" $ut_suite
            $contains_status
            cd ${{ github.workspace }}/ut_log/${ut_suite}
            cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./
            bash ut_result_check.sh ${ut_suite}
          done
      - name: Upload Inductor XPU UT Log
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-${{ env.UT_NAME }}
          path: ${{ github.workspace }}/ut_log
  distributed_ut_test:
    runs-on: ${{ inputs.runner }}
    if: contains(inputs.ut, 'xpu_distributed')
    timeout-minutes: 900
    env:
      NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
      DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Prepare Stock Pytorch
        run: |
          pwd
          which conda && conda clean -ay
          conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \
            rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
          conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ../ && rm -rf pytorch
          pip install requests
          git clone https://github.com/pytorch/pytorch pytorch
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
            # apply PRs for stock pytorch
            python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
            git status && git show -s
            git submodule sync && git submodule update --init --recursive
            if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
              echo "Don't replace torch-xpu-ops!"
            else
              rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
              # Workaround for torch-xpu-ops ci test
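              # Same workaround as in `ut_test`: replace the pinned-commit checkout in
              # caffe2/CMakeLists.txt with a no-op so the copied-in torch-xpu-ops source is used.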
              sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
            fi
          fi
      - name: Triton Installation
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ../pytorch
          TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
          if [ -z "${{ inputs.triton }}" ]; then
            TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)"
          else
            TRITON_COMMIT_ID="${{ inputs.triton }}"
          fi
          echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
          fi
      - name: Download Pytorch wheel
        if: ${{ inputs.pytorch != 'nightly_wheel' }}
        uses: actions/download-artifact@v4
        with:
          name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
          path: ${{ github.workspace }}
      - name: Install Pytorch XPU
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          pip install mkl-static==2025.0.1 mkl-include==2025.0.1
          if [[ ${{ inputs.abi }} == '0' ]]; then
            export _GLIBCXX_USE_CXX11_ABI=0
          else
            export _GLIBCXX_USE_CXX11_ABI=1
          fi
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            cd ../pytorch
            export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
            pip install -r requirements.txt
            pip install --force-reinstall ${{ github.workspace }}/torch*.whl
            git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
          else
            pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
            TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
            cd ../pytorch
            git reset --hard && git checkout ${TORCH_COMMIT_ID}
            TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
            rm -rf third_party/torch-xpu-ops
            git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
            cd third_party/torch-xpu-ops
            git checkout ${TORCH_XPU_OPS_COMMIT}
            cd ../..
            python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py
          fi
          pip install -r .ci/docker/requirements-ci.txt
      - name: Torch Config
        run: |
          source activate xpu_op_${ZE_AFFINITY_MASK}
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          python -c "import torch; print(torch.__config__.show())"
          python -c "import torch; print(torch.__config__.parallel_info())"
          python -c "import torch; print(torch.xpu.device_count())"
          python -c "import triton; print(triton.__version__)"
          cd ..
          python pytorch/torch/utils/collect_env.py
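          # Clear Inductor and Triton caches left by previous runs so nothing stale is reused.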
          rm -rf /tmp/torchinductor_*
          rm -rf ~/.triton/cache
      - name: Run Torch XPU Distributed UT
        run: |
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          source activate xpu_op_${ZE_AFFINITY_MASK}
          pip install pytest
          cd ${{ github.workspace }}
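          # Relax Yama ptrace restrictions for the duration of the distributed run
          # (e.g. so tooling can attach to hung worker processes); the original value
          # is saved and restored below.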
          sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk
          echo "0" | sudo tee /proc/sys/kernel/yama/ptrace_scope
          mkdir -p ut_log/xpu_distributed
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu
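          # Sanity-check that the XCCL distributed backend is available before launching the suite.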
          python -c "import torch;print(torch.distributed.is_xccl_available())"
          timeout 10000 python run_distributed.py 2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
          cd ${{ github.workspace }}
          sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope
      - name: UT Test Results Check
        shell: bash
        run: |
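          # `contains` sets $contains_status to either an informational echo (recognized
          # suite) or `continue` (unknown suite); the loop below executes it verbatim.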
          function contains() {
            contains_status="echo 'Start $2 ...'"
            {
              [[ $1 =~ (^|,)$2($|,) ]]
            } || {
              echo "[Warning] $2 is not a supported type! Skipped!"
              contains_status="continue"
            }
          }
          set -xe
          echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          for ut_suite in $(echo ${{ inputs.ut }} |sed 's/,/ /g')
          do
            contains "op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu,xpu_distributed" $ut_suite
            $contains_status
            cd ${{ github.workspace }}/ut_log/${ut_suite}
            cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./
            bash ut_result_check.sh ${ut_suite}
          done
      - name: Upload Inductor XPU UT Log
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-${{ env.UT_NAME }}
          path: ${{ github.workspace }}/ut_log