Skip to content

Commit

Permalink
follow infra tweaks pr
Browse files Browse the repository at this point in the history
  • Loading branch information
blahBlahhhJ committed Oct 23, 2024
1 parent 6a90ca6 commit 4cc74de
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/levanter/infra/cli_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def make_docker_run_command(image_id, command, *, foreground, env, name="levante

# optionally add multislice env vars (if set by ray runtime env vars)
for v in ["MEGASCALE_COORDINATOR_ADDRESS", "MEGASCALE_NUM_SLICES", "MEGASCALE_PORT", "MEGASCALE_SLICE_ID"]:
+ v = shlex.quote(str(v))
docker_command.extend(["-e", v])

for k, v in env.items():
Expand Down
11 changes: 9 additions & 2 deletions src/levanter/infra/ray_tpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,13 @@ def do_run(self, remote_fn, coordinator_ip, slice_id, num_slices) -> _TpuRunResu
except Exception:
logger.exception("Failed to kill job after primary failure")
return _handle_ray_error(info, e)
+ except Exception as e:
+ for f in futures:
+ try:
+ ray.cancel(f)
+ except Exception:
+ logger.exception("Failed to kill job after primary failure")
+ return TpuFailed(info, e)

actors = [MultisliceActor.remote() for _ in range(num_slices)] # type: ignore
info = _TpuInfo("get_slice_info", "ACTIVE", "TPU")
Expand Down Expand Up @@ -296,12 +303,12 @@ def run_on_pod_multislice_resumable(
outs = ray.get(run_on_pod_multislice(remote_fn, tpu_type, num_slices))
except ray.exceptions.RayTaskError as e:
problem = e
- if "preempted" in str(e):
+ if "preempted" in str(e).lower():
num_preemptions += 1
logger.warning(f"Preempted {num_preemptions} times, {e}")
else:
num_failures += 1
- logger.warning(f"Failed {num_failures} times")
+ logger.warning(f"Failed {num_failures} times", exc_info=e)
continue
except Exception as e:
problem = e
Expand Down

0 comments on commit 4cc74de

Please sign in to comment.