From 8af01f28d08ce68d5f745f0d9184ca312a0b7557 Mon Sep 17 00:00:00 2001 From: "Fabio M. Graetz, Ph.D" Date: Thu, 21 Dec 2023 20:19:23 +0100 Subject: [PATCH] Fix: Handle SIGTERM in kubeflow pytorch elastic training plugin (#2064) Signed-off-by: Fabio Graetz --- plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py index cceb0c1cc7..ffe21d90cd 100644 --- a/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py +++ b/plugins/flytekit-kf-pytorch/flytekitplugins/kfpytorch/task.py @@ -386,6 +386,7 @@ def fn_partial(): else: raise Exception("Bad start method") + from torch.distributed.elastic.multiprocessing.api import SignalException from torch.distributed.elastic.multiprocessing.errors import ChildFailedError try: @@ -399,6 +400,9 @@ def fn_partial(): raise FlyteRecoverableException(e.format_msg()) else: raise RuntimeError(e.format_msg()) + except SignalException as e: + logger.exception(f"Elastic launch agent process terminating: {e}") + raise IgnoreOutputs() # `out` is a dictionary of rank (not local rank) -> result # Rank 0 returns the result of the task function