From 9b9873cbe70372a45b7b2a8463ee6a0124cefbb6 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 26 Jan 2024 15:11:21 +0100 Subject: [PATCH] Timeout added to ps call. Handling of unknown cpuconsumptiontime --- pilot/util/monitoring.py | 2 ++ pilot/util/processes.py | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 5adced08..03b39ce4 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -118,6 +118,8 @@ def job_monitor_tasks(job, mt, args): # noqa: C901 job.cpuconsumptiontime = int(round(cpuconsumptiontime)) job.cpuconversionfactor = 1.0 logger.info(f'(instant) CPU consumption time for pid={job.pid}: {cpuconsumptiontime} (rounded to {job.cpuconsumptiontime})') + elif _cpuconsumptiontime == -1: + logger.warning('could not get CPU consumption time') else: logger.warning(f'process {job.pid} is no longer using CPU - aborting') return 0, "" diff --git a/pilot/util/processes.py b/pilot/util/processes.py index fc3c3ced..7c382392 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -568,8 +568,12 @@ def get_current_cpu_consumption_time(pid): # get all the child processes children = [] - _, ps_cache, _ = execute("ps -eo pid,ppid -m", mute=True) - find_processes_in_group(children, pid, ps_cache) + _, ps_cache, _ = execute("ps -eo pid,ppid -m", mute=True, timeout=60) + if ps_cache: + find_processes_in_group(children, pid, ps_cache) + else: + logger.warning('failed to get ps_cache') + return -1 cpuconsumptiontime = 0 for _pid in children: