Skip to content

Commit

Permalink
Add number of gpus check and distributed pjrt tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
hsharsha committed Jul 3, 2024
1 parent 2ce6ed3 commit af476e8
Showing 1 changed file with 9 additions and 3 deletions.
12 changes: 9 additions & 3 deletions build_tools/rocm/run_xla_multi_gpu.sh
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,9 +23,13 @@ N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
# out how many GPUs we have to test with.
rocm-smi -i
STATUS=$?
- if [ $STATUS -ne 0 ]; then TF_GPU_COUNT=4; else
+ if [ $STATUS -ne 0 ]; then TF_GPU_COUNT=1; else
TF_GPU_COUNT=$(rocm-smi -i|grep 'Device ID' |grep 'GPU' |wc -l)
fi
+ if [[ $TF_GPU_COUNT -lt 4 ]]; then
+ echo "Found only ${TF_GPU_COUNT} gpus, multi-gpu tests need atleast 4 gpus."
+ fi
+
TF_TESTS_PER_GPU=1
N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})

Expand All @@ -47,7 +51,7 @@ fi
export PYTHON_BIN_PATH=`which python3`
export TF_NEED_ROCM=1
export ROCM_PATH=$ROCM_INSTALL_DIR
- TAGS_FILTER="gpu,requires-gpu-nvidia,-oss_excluded,-oss_serial"
+ TAGS_FILTER="-oss_excluded,-oss_serial"
UNSUPPORTED_GPU_TAGS="$(echo -requires-gpu-sm{60,70,80,86,89,90}{,-only})"
TAGS_FILTER="${TAGS_FILTER},${UNSUPPORTED_GPU_TAGS// /,}"

Expand All @@ -69,4 +73,6 @@ bazel \
-- //xla/tests:collective_ops_test_e2e_gpu \
//xla/tests:collective_ops_test_gpu \
//xla/tests:replicated_io_feed_test_gpu \
- //xla/tools/multihost_hlo_runner:functional_hlo_runner_test_gpu
+ //xla/tools/multihost_hlo_runner:functional_hlo_runner_test_gpu \
+ //xla/pjrt/distributed:topology_util_test \
+ //xla/pjrt/distributed:client_server_test

0 comments on commit af476e8

Please sign in to comment.