Skip to content

Commit

Permalink
maybe?
Browse files (browse the repository at this point in the history)
  • Loading branch information
dlwh committed May 20, 2024
1 parent 30c8d75 commit 61b0180
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 2 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/tpu_unit_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,5 @@ jobs:
run: |
export TPU_NAME=ci-run-${{ github.run_id }}
eval "$(ssh-agent -s)"
ssh-add ~/.ssh/google_compute_engine
infra/babysit-tpu-vm.sh $TPU_NAME -z ${{ matrix.tpu-zone }} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${{ github.sha }} -- \
infra/babysit-tpu-vm.sh $TPU_NAME -z ${{ matrix.tpu-zone }} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${{ github.sha }} --retries 3 -- \
bash levanter/infra/run.sh pytest tests
9 changes: 9 additions & 0 deletions infra/babysit-tpu-vm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ CMD_ARGS_STR=$(printf ' %s' "${CMD_ARGS[@]}")
CMD_ARGS_STR=${CMD_ARGS_STR:1}
CMD_ARGS_STR="RUN_ID=${RUN_ID} ${CMD_ARGS_STR}"

TRIES=0

# check if the VM is running
# if not, spin it up
# if it is, just run the command
Expand All @@ -82,6 +84,13 @@ while true; do
break
else
echo "Command failed"
TRIES=$((TRIES+1))
if [ $RETRIES -ge 0 ]; then
if [ $TRIES -ge $RETRIES ]; then
echo "Command failed $TRIES times, exiting"
break
fi
fi
fi
fi
else
Expand Down
6 changes: 6 additions & 0 deletions infra/helpers/parse-tpu-creation-args.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ AUTODELETE=true
SETUP_SCRIPT="$SCRIPT_DIR/helpers/setup-tpu-vm.sh"
SUBNETWORK="default"
USE_ALPHA=false
RETRIES=-1 # maximum number of failed attempts babysit-tpu-vm.sh allows before giving up; any negative value (the default, -1) means keep retrying indefinitely

if [ -z "$GIT_BRANCH" ]; then
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
Expand Down Expand Up @@ -86,6 +87,11 @@ while [[ $# -gt 0 ]]; do
USE_ALPHA="true"
shift # past argument
;;
--retries)
RETRIES="$2"
shift # past argument
shift # past value
;;
*) # unknown option, assume it's the vm name if it doesn't start with a dash
if [[ $1 == -* ]]; then
echo "Error: unknown option $1" >&2
Expand Down

0 comments on commit 61b0180

Please sign in to comment.