diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 5adced08..03b39ce4 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -118,6 +118,8 @@ def job_monitor_tasks(job, mt, args): # noqa: C901 job.cpuconsumptiontime = int(round(cpuconsumptiontime)) job.cpuconversionfactor = 1.0 logger.info(f'(instant) CPU consumption time for pid={job.pid}: {cpuconsumptiontime} (rounded to {job.cpuconsumptiontime})') + elif _cpuconsumptiontime == -1: + logger.warning('could not get CPU consumption time') else: logger.warning(f'process {job.pid} is no longer using CPU - aborting') return 0, "" diff --git a/pilot/util/processes.py b/pilot/util/processes.py index fc3c3ced..7c382392 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -568,8 +568,12 @@ def get_current_cpu_consumption_time(pid): # get all the child processes children = [] - _, ps_cache, _ = execute("ps -eo pid,ppid -m", mute=True) - find_processes_in_group(children, pid, ps_cache) + _, ps_cache, _ = execute("ps -eo pid,ppid -m", mute=True, timeout=60) + if ps_cache: + find_processes_in_group(children, pid, ps_cache) + else: + logger.warning('failed to get ps_cache') + return -1 cpuconsumptiontime = 0 for _pid in children: