From 17fb9e0afe23dea475b0b51f600d7f0ce03c8da4 Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 13:29:51 -0700 Subject: [PATCH 01/14] ruff auto fix --- bin/validate-raythena-job.py | 5 +- example/standalone_ray_test_hello_world.py | 5 +- src/raythena/actors/esworker.py | 39 ++- src/raythena/actors/payloads/basePayload.py | 2 +- .../actors/payloads/eventservice/esPayload.py | 3 +- .../actors/payloads/eventservice/pilothttp.py | 16 +- .../drivers/communicators/baseCommunicator.py | 14 +- .../communicators/harvesterFileMessenger.py | 24 +- .../drivers/communicators/harvesterMock.py | 161 ++++----- .../communicators/harvesterMock2205.py | 154 ++++----- src/raythena/drivers/esdriver.py | 67 ++-- src/raythena/scripts/raythena.py | 5 +- src/raythena/utils/bookkeeper.py | 45 ++- src/raythena/utils/config.py | 4 +- src/raythena/utils/eventservice.py | 33 +- src/raythena/utils/exception.py | 4 +- src/raythena/utils/logging.py | 2 +- src/raythena/utils/ray.py | 3 +- src/raythena/utils/timing.py | 8 +- tests/conftest.py | 313 +++++++++--------- tests/harvester/conftest.py | 5 +- .../harvester/test_harvesterFileMessenger.py | 6 +- tests/test_bookkeeper.py | 1 - tests/test_eventservice.py | 17 +- tests/test_importutils.py | 1 - tests/test_pilothttp.py | 3 +- tests/test_ray_utils.py | 7 +- 27 files changed, 514 insertions(+), 433 deletions(-) diff --git a/bin/validate-raythena-job.py b/bin/validate-raythena-job.py index b6ec34d..df10d0a 100644 --- a/bin/validate-raythena-job.py +++ b/bin/validate-raythena-job.py @@ -1,10 +1,9 @@ #!/usr/bin/env python -from __future__ import print_function -from array import array import argparse import json import os.path as path +from array import array import ROOT @@ -23,7 +22,7 @@ def get_event_numbers(filename): def validate_job(job_dir, job_state_file): - with open(job_state_file, 'r') as f: + with open(job_state_file) as f: job_state = json.load(f) merged_input_files = job_state["merged"] merged_output_files = set([list(x.keys())[0] for x in merged_input_files.values()]) diff --git a/example/standalone_ray_test_hello_world.py b/example/standalone_ray_test_hello_world.py index afac388..68a2a46 100755 --- a/example/standalone_ray_test_hello_world.py +++ b/example/standalone_ray_test_hello_world.py @@ -9,9 +9,10 @@ import argparse import os import platform +import time from pprint import pprint + import ray -import time def build_nodes_resource_list(redis_ip: str): @@ -27,7 +28,7 @@ def build_nodes_resource_list(redis_ip: str): @ray.remote -class actor(): +class actor: def __init__(self) -> None: self.pid = os.getpid() self.hostname = platform.node() diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py index 4361f36..4b55955 100644 --- a/src/raythena/actors/esworker.py +++ b/src/raythena/actors/esworker.py @@ -1,36 +1,49 @@ +import datetime import json import os import re import shutil -import time -from typing import Union, Tuple, Sequence, Any, Mapping, Optional - -import datetime import threading - +import time +from collections.abc import Mapping, Sequence from socket import gethostname from time import sleep +from typing import Any, Optional, Tuple, Union import ray -from raythena.utils.logging import disable_stdout_logging, make_logger, log_to_file -from raythena.utils.config import Config -from raythena.utils.eventservice import EventRangeRequest, Messages, EventRangeUpdate, PandaJob, EventRange -from raythena.utils.exception import IllegalWorkerState, StageInFailed, StageOutFailed, WrappedException, 
BaseRaythenaException -from raythena.utils.ray import get_node_ip # from raythena.utils.timing import CPUMonitor from raythena.actors.payloads.basePayload import BasePayload from raythena.actors.payloads.eventservice.esPayload import ESPayload - from raythena.actors.payloads.eventservice.pilothttp import PilotHttpPayload - +from raythena.utils.config import Config +from raythena.utils.eventservice import ( + EventRange, + EventRangeRequest, + EventRangeUpdate, + Messages, + PandaJob, +) +from raythena.utils.exception import ( + BaseRaythenaException, + IllegalWorkerState, + StageInFailed, + StageOutFailed, + WrappedException, +) +from raythena.utils.logging import ( + disable_stdout_logging, + log_to_file, + make_logger, +) +from raythena.utils.ray import get_node_ip # Type returned by the worker methods to the driver WorkerResponse = Tuple[str, int, Any] @ray.remote(num_cpus=1, max_restarts=1, max_task_retries=3) -class ESWorker(object): +class ESWorker: """ Actor running on HPC compute node. Each actor will start a payload plugin which handle the job processing as well as the communication with the job processing framework, Athena or any intermediary layer such as pilot 2. diff --git a/src/raythena/actors/payloads/basePayload.py b/src/raythena/actors/payloads/basePayload.py index 08af4c4..7355167 100644 --- a/src/raythena/actors/payloads/basePayload.py +++ b/src/raythena/actors/payloads/basePayload.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Dict, Optional, Any +from typing import Any, Dict, Optional from raythena.utils.config import Config from raythena.utils.eventservice import PandaJob diff --git a/src/raythena/actors/payloads/eventservice/esPayload.py b/src/raythena/actors/payloads/eventservice/esPayload.py index fe31dfa..742ad2b 100644 --- a/src/raythena/actors/payloads/eventservice/esPayload.py +++ b/src/raythena/actors/payloads/eventservice/esPayload.py @@ -1,5 +1,6 @@ from abc import abstractmethod -from typing import Dict, Optional, Sequence +from collections.abc import Sequence +from typing import Dict, Optional from raythena.actors.payloads.basePayload import BasePayload from raythena.utils.config import Config diff --git a/src/raythena/actors/payloads/eventservice/pilothttp.py b/src/raythena/actors/payloads/eventservice/pilothttp.py index 8b42d39..6ecc4c3 100644 --- a/src/raythena/actors/payloads/eventservice/pilothttp.py +++ b/src/raythena/actors/payloads/eventservice/pilothttp.py @@ -4,23 +4,23 @@ import os import shlex import stat -from asyncio import Queue, QueueEmpty, Event +from asyncio import Event, Queue, QueueEmpty +from collections.abc import Iterable, Mapping from subprocess import DEVNULL, Popen -from typing import Dict, List, Callable, Optional, Iterable, Mapping +from typing import Callable, Dict, List, Optional from urllib.parse import parse_qs import uvloop from aiohttp import web -from raythena.utils.logging import make_logger from raythena.actors.payloads.eventservice.esPayload import ESPayload from raythena.utils.config import Config -from raythena.utils.eventservice import ESEncoder -from raythena.utils.eventservice import PandaJob, EventRange -from raythena.utils.exception import FailedPayload, ExThread +from raythena.utils.eventservice import ESEncoder, EventRange, PandaJob +from raythena.utils.exception import ExThread, FailedPayload +from raythena.utils.logging import make_logger -class AsyncRouter(object): +class AsyncRouter: """ Very simple router mapping HTTP endpoint to a handler. 
Only supports with asynchronous handler compatible with the asyncio Framework. @@ -157,7 +157,7 @@ def _build_pilot_command(self) -> str: Raises: FailedPayload: if source code to be executed cannot be retrieved from CVMFS """ - cmd = str() + cmd = "" extra_setup = self.config.payload.get('extrasetup', None) if extra_setup is not None: diff --git a/src/raythena/drivers/communicators/baseCommunicator.py b/src/raythena/drivers/communicators/baseCommunicator.py index b766e1b..abfcf82 100644 --- a/src/raythena/drivers/communicators/baseCommunicator.py +++ b/src/raythena/drivers/communicators/baseCommunicator.py @@ -1,10 +1,18 @@ from abc import ABC, abstractmethod +from collections.abc import Mapping, Sequence from queue import Queue -from typing import Mapping, Sequence, Union +from typing import Union from raythena.utils.config import Config -from raythena.utils.eventservice import EventRangeRequest, PandaJobRequest, PandaJobUpdate, EventRangeUpdate, \ - JobReport, EventRangeDef, JobDef +from raythena.utils.eventservice import ( + EventRangeDef, + EventRangeRequest, + EventRangeUpdate, + JobDef, + JobReport, + PandaJobRequest, + PandaJobUpdate, +) RequestData = Union[PandaJobRequest, EventRangeUpdate, JobReport, EventRangeRequest, PandaJobUpdate] diff --git a/src/raythena/drivers/communicators/harvesterFileMessenger.py b/src/raythena/drivers/communicators/harvesterFileMessenger.py index ae3952e..f154936 100644 --- a/src/raythena/drivers/communicators/harvesterFileMessenger.py +++ b/src/raythena/drivers/communicators/harvesterFileMessenger.py @@ -7,9 +7,15 @@ from raythena.drivers.communicators.baseCommunicator import BaseCommunicator from raythena.utils.config import Config -from raythena.utils.logging import make_logger -from raythena.utils.eventservice import EventRangeRequest, PandaJobRequest, PandaJobUpdate, EventRangeUpdate, JobReport +from raythena.utils.eventservice import ( + EventRangeRequest, + EventRangeUpdate, + JobReport, + PandaJobRequest, + PandaJobUpdate, +) from raythena.utils.exception import ExThread +from raythena.utils.logging import make_logger class HarvesterFileCommunicator(BaseCommunicator): @@ -71,17 +77,17 @@ def _parse_harvester_config(self) -> None: os.path.join(self.harvester_workdir, self.harvester_conf['payload_interaction'][k])) if not hasattr(self, "jobspecfile"): - self.jobspecfile = str() + self.jobspecfile = '' if not hasattr(self, "jobspecfile"): - self.jobrequestfile = str() + self.jobrequestfile = '' if not hasattr(self, "eventrangesfile"): - self.eventrangesfile = str() + self.eventrangesfile = '' if not hasattr(self, "eventrequestfile"): - self.eventrequestfile = str() + self.eventrequestfile = '' if not hasattr(self, "eventstatusdumpjsonfile"): - self.eventstatusdumpjsonfile = str() + self.eventstatusdumpjsonfile = '' if not hasattr(self, "jobreportfile"): - self.jobreportfile = str() + self.jobreportfile = '' def request_job(self, request: PandaJobRequest) -> None: """ @@ -152,7 +158,7 @@ def request_event_ranges(self, request: EventRangeRequest) -> None: self._logger.debug(f"request_event_ranges: found a {self.eventrangesfile} file") while os.path.isfile(self.eventrangesfile): try: - with open(self.eventrangesfile, 'r') as f: + with open(self.eventrangesfile) as f: ranges = json.load(f) if os.path.isfile(self.eventrangesfile): shutil.move( diff --git a/src/raythena/drivers/communicators/harvesterMock.py b/src/raythena/drivers/communicators/harvesterMock.py index e3844ac..b94b378 100644 --- a/src/raythena/drivers/communicators/harvesterMock.py +++ 
b/src/raythena/drivers/communicators/harvesterMock.py @@ -6,7 +6,12 @@ from raythena.drivers.communicators.baseCommunicator import BaseCommunicator from raythena.utils.config import Config -from raythena.utils.eventservice import EventRangeRequest, PandaJobRequest, PandaJobUpdate, EventRangeUpdate +from raythena.utils.eventservice import ( + EventRangeRequest, + EventRangeUpdate, + PandaJobRequest, + PandaJobUpdate, +) from raythena.utils.exception import ExThread @@ -184,73 +189,73 @@ def request_job(self, job_request: PandaJobRequest) -> None: self.job_queue.put({ str(self.pandaID): { - u'jobsetID': + 'jobsetID': self.jobsetId, - u'logGUID': + 'logGUID': log_guid, - u'cmtConfig': - u'x86_64-slc6-gcc49-opt', - u'prodDBlocks': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'dispatchDBlockTokenForOut': - u'NULL,NULL', - u'destinationDBlockToken': - u'NULL,NULL', - u'destinationSE': + 'cmtConfig': + 'x86_64-slc6-gcc49-opt', + 'prodDBlocks': + 'user.mlassnig:user.mlassnig.pilot.test.single.hits', + 'dispatchDBlockTokenForOut': + 'NULL,NULL', + 'destinationDBlockToken': + 'NULL,NULL', + 'destinationSE': self.get_panda_queue_name(), - u'realDatasets': + 'realDatasets': job_name, - u'prodUserID': - u'no_one', - u'GUID': + 'prodUserID': + 'no_one', + 'GUID': self.guid, - u'realDatasetsIn': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'nSent': + 'realDatasetsIn': + 'user.mlassnig:user.mlassnig.pilot.test.single.hits', + 'nSent': 0, - u'eventService': + 'eventService': 'true', - u'cloud': - u'US', - u'StatusCode': + 'cloud': + 'US', + 'StatusCode': 0, - u'homepackage': - u'AtlasOffline/21.0.15', - u'inFiles': + 'homepackage': + 'AtlasOffline/21.0.15', + 'inFiles': self.inFiles, - u'processingType': - u'pilot-ptest', - u'ddmEndPointOut': - u'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - u'fsize': - u'118612262', - u'fileDestinationSE': + 'processingType': + 'pilot-ptest', + 'ddmEndPointOut': + 'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', + 'fsize': + '118612262', + 'fileDestinationSE': f"{self.get_panda_queue_name()},{self.get_panda_queue_name()}", - u'scopeOut': - u'panda', - u'minRamCount': + 'scopeOut': + 'panda', + 'minRamCount': 0, - u'jobDefinitionID': + 'jobDefinitionID': 7932, - u'maxWalltime': - u'NULL', - u'scopeLog': - u'panda', - u'transformation': - u'Sim_tf.py', - u'maxDiskCount': + 'maxWalltime': + 'NULL', + 'scopeLog': + 'panda', + 'transformation': + 'Sim_tf.py', + 'maxDiskCount': 0, - u'coreCount': + 'coreCount': self.ncores, - u'prodDBlockToken': - u'NULL', - u'transferType': - u'NULL', - u'destinationDblock': + 'prodDBlockToken': + 'NULL', + 'transferType': + 'NULL', + 'destinationDblock': job_name, - u'dispatchDBlockToken': - u'NULL', - u'jobPars': ( + 'dispatchDBlockToken': + 'NULL', + 'jobPars': ( '--eventService=True --skipEvents=0 --firstEvent=1 --preExec \'from AthenaCommon.DetFlags ' 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff()\' ' @@ -261,37 +266,37 @@ def request_job(self, job_request: PandaJobRequest) -> None: '--randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 ' '--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root' % (self.inFiles, job_name)), - u'attemptNr': + 'attemptNr': 0, - u'swRelease': - u'Atlas-21.0.15', - u'nucleus': - u'NULL', - u'maxCpuCount': + 'swRelease': + 'Atlas-21.0.15', + 'nucleus': + 'NULL', + 'maxCpuCount': 0, - u'outFiles': - u'HITS_%s.pool.root,%s.job.log.tgz' % (job_name, job_name), - u'currentPriority': + 'outFiles': + 
'HITS_%s.pool.root,%s.job.log.tgz' % (job_name, job_name), + 'currentPriority': 1000, - u'scopeIn': + 'scopeIn': self.scope, - u'PandaID': + 'PandaID': self.pandaID, - u'sourceSite': - u'NULL', - u'dispatchDblock': - u'NULL', - u'prodSourceLabel': - u'ptest', - u'checksum': - u'ad:5d000974', - u'jobName': + 'sourceSite': + 'NULL', + 'dispatchDblock': + 'NULL', + 'prodSourceLabel': + 'ptest', + 'checksum': + 'ad:5d000974', + 'jobName': job_name, - u'ddmEndPointIn': - u'UTA_SWT2_DATADISK', - u'taskID': + 'ddmEndPointIn': + 'UTA_SWT2_DATADISK', + 'taskID': self.taskId, - u'logFile': - u'%s.job.log.tgz' % job_name + 'logFile': + '%s.job.log.tgz' % job_name } }) diff --git a/src/raythena/drivers/communicators/harvesterMock2205.py b/src/raythena/drivers/communicators/harvesterMock2205.py index 775a5a0..fe21803 100644 --- a/src/raythena/drivers/communicators/harvesterMock2205.py +++ b/src/raythena/drivers/communicators/harvesterMock2205.py @@ -65,73 +65,73 @@ def request_job(self, job_request: PandaJobRequest) -> None: self.job_queue.put({ str(self.pandaID): { - u'jobsetID': + 'jobsetID': self.jobsetId, - u'logGUID': + 'logGUID': log_guid, - u'cmtConfig': - u'x86_64-centos7-gcc8-opt', - u'prodDBlocks': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'dispatchDBlockTokenForOut': - u'NULL,NULL', - u'destinationDBlockToken': - u'NULL,NULL', - u'destinationSE': + 'cmtConfig': + 'x86_64-centos7-gcc8-opt', + 'prodDBlocks': + 'user.mlassnig:user.mlassnig.pilot.test.single.hits', + 'dispatchDBlockTokenForOut': + 'NULL,NULL', + 'destinationDBlockToken': + 'NULL,NULL', + 'destinationSE': self.get_panda_queue_name(), - u'realDatasets': + 'realDatasets': job_name, - u'prodUserID': - u'no_one', - u'GUID': + 'prodUserID': + 'no_one', + 'GUID': self.guid, - u'realDatasetsIn': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'nSent': + 'realDatasetsIn': + 'user.mlassnig:user.mlassnig.pilot.test.single.hits', + 'nSent': 0, - u'eventService': + 'eventService': 'true', - u'cloud': - u'US', - u'StatusCode': + 'cloud': + 'US', + 'StatusCode': 0, - u'homepackage': - u'Athena/22.0.5', - u'inFiles': + 'homepackage': + 'Athena/22.0.5', + 'inFiles': self.inFiles, - u'processingType': - u'pilot-ptest', - u'ddmEndPointOut': - u'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - u'fsize': - u'118612262', - u'fileDestinationSE': + 'processingType': + 'pilot-ptest', + 'ddmEndPointOut': + 'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', + 'fsize': + '118612262', + 'fileDestinationSE': f"{self.get_panda_queue_name()},{self.get_panda_queue_name()}", - u'scopeOut': - u'panda', - u'minRamCount': + 'scopeOut': + 'panda', + 'minRamCount': 0, - u'jobDefinitionID': + 'jobDefinitionID': 7932, - u'maxWalltime': - u'NULL', - u'scopeLog': - u'panda', - u'transformation': - u'Sim_tf.py', - u'maxDiskCount': + 'maxWalltime': + 'NULL', + 'scopeLog': + 'panda', + 'transformation': + 'Sim_tf.py', + 'maxDiskCount': 0, - u'coreCount': + 'coreCount': self.ncores, - u'prodDBlockToken': - u'NULL', - u'transferType': - u'NULL', - u'destinationDblock': + 'prodDBlockToken': + 'NULL', + 'transferType': + 'NULL', + 'destinationDblock': job_name, - u'dispatchDBlockToken': - u'NULL', - u'jobPars': ( + 'dispatchDBlockToken': + 'NULL', + 'jobPars': ( '--multiprocess --eventService=True --skipEvents=0 --firstEvent=1 ' '--preExec \'from AthenaCommon.DetFlags ' 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' @@ -144,37 +144,37 @@ def request_job(self, job_request: PandaJobRequest) -> None: '--conditionsTag default:OFLCOND-MC16-SDR-14 ' 
'--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root' % (self.inFiles, job_name)), - u'attemptNr': + 'attemptNr': 0, - u'swRelease': - u'Atlas-22.0.5', - u'nucleus': - u'NULL', - u'maxCpuCount': + 'swRelease': + 'Atlas-22.0.5', + 'nucleus': + 'NULL', + 'maxCpuCount': 0, - u'outFiles': - u'HITS_%s.pool.root,%s.job.log.tgz' % (job_name, job_name), - u'currentPriority': + 'outFiles': + 'HITS_%s.pool.root,%s.job.log.tgz' % (job_name, job_name), + 'currentPriority': 1000, - u'scopeIn': + 'scopeIn': self.scope, - u'PandaID': + 'PandaID': self.pandaID, - u'sourceSite': - u'NULL', - u'dispatchDblock': - u'NULL', - u'prodSourceLabel': - u'ptest', - u'checksum': - u'ad:5d000974', - u'jobName': + 'sourceSite': + 'NULL', + 'dispatchDblock': + 'NULL', + 'prodSourceLabel': + 'ptest', + 'checksum': + 'ad:5d000974', + 'jobName': job_name, - u'ddmEndPointIn': - u'UTA_SWT2_DATADISK', - u'taskID': + 'ddmEndPointIn': + 'UTA_SWT2_DATADISK', + 'taskID': self.taskId, - u'logFile': - u'%s.job.log.tgz' % job_name + 'logFile': + '%s.job.log.tgz' % job_name } }) diff --git a/src/raythena/drivers/esdriver.py b/src/raythena/drivers/esdriver.py index 4e01657..c70d653 100644 --- a/src/raythena/drivers/esdriver.py +++ b/src/raythena/drivers/esdriver.py @@ -1,39 +1,56 @@ import configparser +import json import os import re -import json import shutil import stat import tempfile import time import traceback +from collections.abc import Iterable, Iterator, Mapping, Sequence from math import ceil from queue import Empty, Queue from socket import gethostname -from typing import (Any, Dict, Iterator, List, Mapping, Optional, Sequence, Iterable, - Tuple) from subprocess import DEVNULL, Popen +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) import ray from ray.exceptions import RayActorError from ray.types import ObjectRef + from raythena import __version__ from raythena.actors.esworker import ESWorker, WorkerResponse from raythena.drivers.baseDriver import BaseDriver -from raythena.drivers.communicators.baseCommunicator import (BaseCommunicator, - RequestData) -from raythena.drivers.communicators.harvesterFileMessenger import \ - HarvesterFileCommunicator +from raythena.drivers.communicators.baseCommunicator import ( + BaseCommunicator, + RequestData, +) +from raythena.drivers.communicators.harvesterFileMessenger import ( + HarvesterFileCommunicator, +) from raythena.utils.bookkeeper import BookKeeper, TaskStatus from raythena.utils.config import Config -from raythena.utils.eventservice import (EventRange, EventRangeDef, - EventRangeRequest, EventRangeUpdate, - JobDef, Messages, - PandaJobRequest - ) +from raythena.utils.eventservice import ( + EventRange, + EventRangeDef, + EventRangeRequest, + EventRangeUpdate, + JobDef, + Messages, + PandaJobRequest, +) from raythena.utils.exception import BaseRaythenaException -from raythena.utils.logging import (disable_stdout_logging, log_to_file, - make_logger) +from raythena.utils.logging import ( + disable_stdout_logging, + log_to_file, + make_logger, +) from raythena.utils.ray import build_nodes_resource_list @@ -77,8 +94,8 @@ def __init__(self, config: Config, session_dir: str) -> None: workdir = os.getcwd() self.config.ray['workdir'] = workdir self.workdir = workdir - self.output_dir = str() - self.merged_files_dir = str() + self.output_dir = "" + self.merged_files_dir = "" logfile = self.config.logging.get("driverlogfile", None) if logfile: log_to_file(self.config.logging.get("level", None), logfile) @@ -117,10 +134,10 @@ def __init__(self, 
config: Config, session_dir: str) -> None: self.pandaqueue = self.config.payload['pandaqueue'] parser = configparser.ConfigParser() harvester_config = self.config.harvester['harvesterconf'] - self.queuedata_file = str() - self.container_options = str() - self.container_type = str() - self.jobreport_name = str() + self.queuedata_file = "" + self.container_options = "" + self.container_type = "" + self.jobreport_name = "" if not os.path.isfile(harvester_config): self._logger.warning(f"Couldn't find harvester config file {harvester_config}") else: @@ -133,7 +150,7 @@ def __init__(self, config: Config, session_dir: str) -> None: self._logger.warning(f"cached queudata file not found: {queuedata_config[0]}") else: self.queuedata_file = queuedata_config[0] - with open(self.queuedata_file, 'r') as f: + with open(self.queuedata_file) as f: queuedata = json.load(f) self.container_options = queuedata["container_options"] self.container_type = queuedata["container_type"].split(":")[0] @@ -648,7 +665,7 @@ def produce_final_report(self, output_map: Dict[str, str]): if not files: return - with open(os.path.join(self.job_reports_dir, files[0]), 'r') as f: + with open(os.path.join(self.job_reports_dir, files[0])) as f: final_report = json.load(f) final_report_files = final_report["files"] @@ -666,7 +683,7 @@ def produce_final_report(self, output_map: Dict[str, str]): for file in files[1:]: current_file = os.path.join(self.job_reports_dir, file) - with open(current_file, 'r') as f: + with open(current_file) as f: current_report = json.load(f) final_report_files["input"].append(current_report["files"]["input"][0]) output_file_entry = current_report["files"]["output"][0]["subFiles"][0] @@ -731,7 +748,7 @@ def get_output_file_guid(self, job_report_file) -> Optional[str]: """ Extract the GUID from the jobReport of HITSMerge_tf """ - with open(job_report_file, 'r') as f: + with open(job_report_file) as f: job_report = json.load(f) try: guid = job_report["files"]["output"][0]["subFiles"][0]["file_guid"] @@ -832,7 +849,7 @@ def hits_merge_transform(self, input_files: Iterable[str], output_file: str) -> f.write(setup_script) os.chmod(setup_script_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH) - cmd = str() + cmd = "" cmd += "export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase;" cmd += f"export thePlatform=\"{self.the_platform}\";" diff --git a/src/raythena/scripts/raythena.py b/src/raythena/scripts/raythena.py index 6971a4f..a75a1cb 100755 --- a/src/raythena/scripts/raythena.py +++ b/src/raythena/scripts/raythena.py @@ -1,17 +1,16 @@ #!/usr/bin/env python import functools import signal -import types import traceback +import types import click from raythena.drivers.baseDriver import BaseDriver +from raythena.drivers.esdriver import ESDriver from raythena.utils.config import Config from raythena.utils.ray import setup_ray, shutdown_ray -from raythena.drivers.esdriver import ESDriver - @click.command() @click.option( diff --git a/src/raythena/utils/bookkeeper.py b/src/raythena/utils/bookkeeper.py index ce06ad4..337f55a 100644 --- a/src/raythena/utils/bookkeeper.py +++ b/src/raythena/utils/bookkeeper.py @@ -1,17 +1,34 @@ import collections -from functools import reduce import json +import os import threading +import time +from collections.abc import Mapping, Sequence +from functools import reduce +from typing import ( + Any, + Deque, + Dict, + List, + Optional, + Set, + Tuple, + Union, +) + from raythena.utils.config import Config 
-from raythena.utils.eventservice import PandaJobQueue, EventRange, PandaJob, EventRangeUpdate, EventRangeDef, JobDef, PilotEventRangeUpdateDef +from raythena.utils.eventservice import ( + EventRange, + EventRangeDef, + EventRangeUpdate, + JobDef, + PandaJob, + PandaJobQueue, + PilotEventRangeUpdateDef, +) from raythena.utils.exception import ExThread from raythena.utils.logging import make_logger -from typing import Deque, Dict, Set, Optional, List, Mapping, Sequence, Union, Tuple, Any - -import time -import os - class TaskStatus: """ @@ -83,13 +100,13 @@ def _restore_status(self): filename = self.tmpfilepath try: - with open(filename, 'r') as f: + with open(filename) as f: self._status = json.load(f) except OSError as e: # failed to load status, try to read from a possible tmp file if it exists and not already done if filename != self.tmpfilepath and os.path.isfile(self.tmpfilepath): try: - with open(self.tmpfilepath, 'r') as f: + with open(self.tmpfilepath) as f: self._status = json.load(f) except OSError as ee: self._logger.error(e.strerror) @@ -303,7 +320,7 @@ def total_events(self) -> int: return self._nevents -class BookKeeper(object): +class BookKeeper: """ Performs bookkeeping of jobs and event ranges distributed to workers """ @@ -311,9 +328,9 @@ class BookKeeper(object): def __init__(self, config: Config) -> None: self.jobs: PandaJobQueue = PandaJobQueue() self.config: Config = config - self.output_dir = str() - self.merged_files_dir = str() - self.commitlog = str() + self.output_dir = "" + self.merged_files_dir = "" + self.commitlog = "" self._logger = make_logger(self.config, "BookKeeper") self.actors: Dict[str, Optional[str]] = dict() self.rangesID_by_actor: Dict[str, Set[str]] = dict() @@ -535,7 +552,7 @@ def recover_outputfile_name(self, filename: str) -> str: """ Read the commitlog change history of filename and return the current filename """ - with open(self.commitlog, 'r') as f: + with open(self.commitlog) as f: for line in f: op, *args = line.rstrip().split(" ") if op != "rename_output": diff --git a/src/raythena/utils/config.py b/src/raythena/utils/config.py index c73ccee..00a9231 100644 --- a/src/raythena/utils/config.py +++ b/src/raythena/utils/config.py @@ -3,7 +3,7 @@ import yaml -class Config(object): +class Config: """Class storing app configuration. 
This class will store configuration by prioritizing in the following order: @@ -147,7 +147,7 @@ def _validate_section(self, template_section_name: str, Exception: Invalid configuration file """ for name, value in template_params.items(): - if name not in section_params.keys(): + if name not in section_params: raise Exception( f"Param '{name}' not found in conf section '{template_section_name}'" ) diff --git a/src/raythena/utils/eventservice.py b/src/raythena/utils/eventservice.py index d5e5aaf..cd1a858 100644 --- a/src/raythena/utils/eventservice.py +++ b/src/raythena/utils/eventservice.py @@ -1,7 +1,14 @@ import json import os - -from typing import Set, Union, Dict, List, Mapping, Iterable, Any, Optional, Sequence, MutableMapping +from collections.abc import Iterable, Mapping, MutableMapping, Sequence +from typing import ( + Any, + Dict, + List, + Optional, + Set, + Union, +) # Types aliases Builtin = Union[int, float, str] @@ -25,7 +32,7 @@ # Messages sent by ray actor to the driver -class Messages(object): +class Messages: """ Defines messages exchanged between ray actors and the driver """ @@ -79,7 +86,7 @@ def default(self, o: Any) -> Any: return super().default(o) -class PandaJobQueue(object): +class PandaJobQueue: """ Build from the reply to a job request. Harvester will provide the following JSON as a reply: Provides utility methods to manage the job queue such as retrieving a spcific job, assigning jobs to workers. @@ -314,7 +321,7 @@ def pop(self): return obj -class EventRangeQueue(object): +class EventRangeQueue: """ Each PandaJob has an eventRangeQueue that should be filled from a reply to an event ranges request: @@ -424,7 +431,7 @@ def assign_ready_ranges(self, n_ranges=1) -> List['EventRange']: n_ranges = min(self.nranges_available(), n_ranges) if not n_ranges: return list() - res: List[Optional['EventRange']] = [None] * n_ranges + res: List[Optional[EventRange]] = [None] * n_ranges res_idx = 0 ready = self.rangesID_by_state[EventRange.READY] assigned = self.rangesID_by_state[EventRange.ASSIGNED] @@ -574,7 +581,7 @@ def get_next_ranges(self, nranges: int) -> List['EventRange']: return self.assign_ready_ranges(n_ranges=nranges) -class PandaJobUpdate(object): +class PandaJobUpdate: """ Wrapper for jobUpdate @@ -608,7 +615,7 @@ def to_dict(self) -> Dict[str, Builtin]: return self.__dict__ -class EventRangeUpdate(object): +class EventRangeUpdate: """ Event ranges update sent by pilot 2 using JSON schema: [ @@ -787,7 +794,7 @@ def build_from_dict(panda_id: str, return EventRangeUpdate(update_dict) -class PandaJobRequest(object): +class PandaJobRequest: """ Wrapper for a job request. Pilot2 requests job using the following JSON schema: @@ -836,7 +843,7 @@ def to_dict(self) -> Dict[str, Builtin]: return self.__dict__ -class EventRangeRequest(object): +class EventRangeRequest: """ Send event request to harvester. Event ranges for multiple jobs can be requested in a singled request. Harvester expects the following JSON schema: @@ -902,7 +909,7 @@ def build_from_dict(request_dict: Mapping[str, Dict[str, Builtin]]) -> 'EventRan return request -class PandaJob(object): +class PandaJob: """ Wrapper for a panda jobspec. 
Usually contains the following fields: { @@ -1039,7 +1046,7 @@ def __contains__(self, k: str) -> bool: return k in self.job -class EventRange(object): +class EventRange: """ Hold an event range: { @@ -1171,7 +1178,7 @@ def build_from_dict(event_ranges_dict: EventRangeDef) -> 'EventRange': event_ranges_dict['GUID'], event_ranges_dict['scope']) -class JobReport(object): +class JobReport: """ Wrapper for a job report. Raythena creates a job report after the job has finished: diff --git a/src/raythena/utils/exception.py b/src/raythena/utils/exception.py index 4e8b321..7845869 100644 --- a/src/raythena/utils/exception.py +++ b/src/raythena/utils/exception.py @@ -1,8 +1,8 @@ import threading -from queue import Queue, Empty +from queue import Empty, Queue -class ErrorCodes(object): +class ErrorCodes: """ Defines error codes constants and associated default error message for each error code """ diff --git a/src/raythena/utils/logging.py b/src/raythena/utils/logging.py index d89ad20..90e0858 100644 --- a/src/raythena/utils/logging.py +++ b/src/raythena/utils/logging.py @@ -29,7 +29,7 @@ def disable_stdout_logging(): def get_fmt(log_level): - if logging.DEBUG == logging.getLevelName(log_level): + if logging.getLevelName(log_level) == logging.DEBUG: fmt = "{asctime} | {levelname:8} | {name}:{funcName} | {message}" else: fmt = "{asctime} | {levelname:8} | {name} | {message}" diff --git a/src/raythena/utils/ray.py b/src/raythena/utils/ray.py index ae0b37e..e1d9fe9 100644 --- a/src/raythena/utils/ray.py +++ b/src/raythena/utils/ray.py @@ -1,4 +1,5 @@ -from typing import List, Mapping, Any +from collections.abc import Mapping +from typing import Any, List import ray diff --git a/src/raythena/utils/timing.py b/src/raythena/utils/timing.py index d8e2d7a..1e44268 100644 --- a/src/raythena/utils/timing.py +++ b/src/raythena/utils/timing.py @@ -1,10 +1,10 @@ -import psutil -import time import json - +import time from threading import Event from typing import Any, Dict, List, Union +import psutil + from raythena.utils.exception import ExThread @@ -102,7 +102,7 @@ def monitor_cpu(self) -> None: process_usage.append(self.process.cpu_percent()) process_cpu_times = self.process.cpu_times() - for k in process_times.keys(): + for k in process_times: process_times[k].append(getattr(process_cpu_times, k)) if time.time() >= last_write + self.write_interval: diff --git a/tests/conftest.py b/tests/conftest.py index 8debc51..e227244 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,6 @@ import time import pytest - from raythena.utils.config import Config from raythena.utils.ray import setup_ray, shutdown_ray @@ -170,79 +169,79 @@ def sample_multijobs(request, input_output_file_list, is_eventservice, pandaids, outFiles = ",".join(output_files) outFilesShort = f"[{','.join([str(i) for i in range(len(outFiles))])}]" res[pandaID] = { - u'jobsetID': + 'jobsetID': jobsetId, - u'nEventsPerInputFile': nevents_per_file, - u'esmergeSpec': { + 'nEventsPerInputFile': nevents_per_file, + 'esmergeSpec': { "transPath": "", "jobParameters": "", "nEventsPerOutputFile": nhits_per_file }, - u'logGUID': + 'logGUID': log_guid, - u'cmtConfig': - u'x86_64-slc6-gcc49-opt', - u'prodDBlocks': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'dispatchDBlockTokenForOut': - u'NULL,NULL', - u'destinationDBlockToken': - u'NULL,NULL', - u'destinationSE': + 'cmtConfig': + 'x86_64-slc6-gcc49-opt', + 'prodDBlocks': + 'user.mlassnig:user.mlassnig.pilot.test.single.hits', + 'dispatchDBlockTokenForOut': + 'NULL,NULL', + 
'destinationDBlockToken': + 'NULL,NULL', + 'destinationSE': panda_queue_name, - u'realDatasets': + 'realDatasets': job_name, - u'prodUserID': - u'no_one', - u'GUID': + 'prodUserID': + 'no_one', + 'GUID': ",".join([f"{guid}{i}" for i in range(len(input_files))]), - u'realDatasetsIn': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'nSent': + 'realDatasetsIn': + 'user.mlassnig:user.mlassnig.pilot.test.single.hits', + 'nSent': 0, - u'eventService': + 'eventService': str(is_eventservice), - u'cloud': - u'US', - u'StatusCode': + 'cloud': + 'US', + 'StatusCode': 0, - u'homepackage': - u'AtlasOffline/21.0.15', - u'inFiles': + 'homepackage': + 'AtlasOffline/21.0.15', + 'inFiles': inFiles, - u'processingType': - u'pilot-ptest', - u'ddmEndPointOut': - u'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - u'fsize': - u'118612262', - u'fileDestinationSE': + 'processingType': + 'pilot-ptest', + 'ddmEndPointOut': + 'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', + 'fsize': + '118612262', + 'fileDestinationSE': f"{panda_queue_name},{panda_queue_name}", - u'scopeOut': - u'panda', - u'minRamCount': + 'scopeOut': + 'panda', + 'minRamCount': 0, - u'jobDefinitionID': + 'jobDefinitionID': 7932, - u'maxWalltime': - u'NULL', - u'scopeLog': - u'panda', - u'transformation': - u'Sim_tf.py', - u'maxDiskCount': + 'maxWalltime': + 'NULL', + 'scopeLog': + 'panda', + 'transformation': + 'Sim_tf.py', + 'maxDiskCount': 0, - u'coreCount': + 'coreCount': ncores, - u'prodDBlockToken': - u'NULL', - u'transferType': - u'NULL', - u'destinationDblock': + 'prodDBlockToken': + 'NULL', + 'transferType': + 'NULL', + 'destinationDblock': job_name, - u'dispatchDBlockToken': - u'NULL', - u'jobPars': ( + 'dispatchDBlockToken': + 'NULL', + 'jobPars': ( '--eventService=%s --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' @@ -251,38 +250,38 @@ def sample_multijobs(request, input_output_file_list, is_eventservice, pandaids, '--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 ' '--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root)' % (str(is_eventservice), inFiles, outFilesShort)), - u'attemptNr': + 'attemptNr': 0, - u'swRelease': - u'Atlas-21.0.15', - u'nucleus': - u'NULL', - u'maxCpuCount': + 'swRelease': + 'Atlas-21.0.15', + 'nucleus': + 'NULL', + 'maxCpuCount': 0, - u'outFiles': + 'outFiles': outFiles, - u'currentPriority': + 'currentPriority': 1000, - u'scopeIn': + 'scopeIn': scope, - u'PandaID': + 'PandaID': pandaID, - u'sourceSite': - u'NULL', - u'dispatchDblock': - u'NULL', - u'prodSourceLabel': - u'ptest', - u'checksum': - u'ad:5d000974', - u'jobName': + 'sourceSite': + 'NULL', + 'dispatchDblock': + 'NULL', + 'prodSourceLabel': + 'ptest', + 'checksum': + 'ad:5d000974', + 'jobName': job_name, - u'ddmEndPointIn': - u'UTA_SWT2_DATADISK', - u'taskID': + 'ddmEndPointIn': + 'UTA_SWT2_DATADISK', + 'taskID': taskId, - u'logFile': - u'%s.job.log.tgz' % job_name + 'logFile': + '%s.job.log.tgz' % job_name } return res @@ -308,79 +307,79 @@ def sample_job(is_eventservice, input_output_file_list, nhits_per_file, nevents_ outFilesShort = f"[{','.join([str(i) for i in range(len(outFiles))])}]" return { pandaID: { - u'jobsetID': + 'jobsetID': jobsetId, - u'logGUID': + 'logGUID': log_guid, - u'nEventsPerInputFile': nevents_per_file, - u'esmergeSpec': { + 'nEventsPerInputFile': nevents_per_file, + 
'esmergeSpec': { "transPath": "", "jobParameters": "", "nEventsPerOutputFile": nhits_per_file }, - u'cmtConfig': - u'x86_64-slc6-gcc49-opt', - u'prodDBlocks': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'dispatchDBlockTokenForOut': - u'NULL,NULL', - u'destinationDBlockToken': - u'NULL,NULL', - u'destinationSE': + 'cmtConfig': + 'x86_64-slc6-gcc49-opt', + 'prodDBlocks': + 'user.mlassnig:user.mlassnig.pilot.test.single.hits', + 'dispatchDBlockTokenForOut': + 'NULL,NULL', + 'destinationDBlockToken': + 'NULL,NULL', + 'destinationSE': panda_queue_name, - u'realDatasets': + 'realDatasets': job_name, - u'prodUserID': - u'no_one', - u'GUID': + 'prodUserID': + 'no_one', + 'GUID': guid, - u'realDatasetsIn': - u'user.mlassnig:user.mlassnig.pilot.test.single.hits', - u'nSent': + 'realDatasetsIn': + 'user.mlassnig:user.mlassnig.pilot.test.single.hits', + 'nSent': 0, - u'eventService': + 'eventService': str(is_eventservice), - u'cloud': - u'US', - u'StatusCode': + 'cloud': + 'US', + 'StatusCode': 0, - u'homepackage': - u'AtlasOffline/21.0.15', - u'inFiles': + 'homepackage': + 'AtlasOffline/21.0.15', + 'inFiles': inFiles, - u'processingType': - u'pilot-ptest', - u'ddmEndPointOut': - u'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - u'fsize': - u'118612262', - u'fileDestinationSE': + 'processingType': + 'pilot-ptest', + 'ddmEndPointOut': + 'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', + 'fsize': + '118612262', + 'fileDestinationSE': f"{panda_queue_name},{panda_queue_name}", - u'scopeOut': - u'panda', - u'minRamCount': + 'scopeOut': + 'panda', + 'minRamCount': 0, - u'jobDefinitionID': + 'jobDefinitionID': 7932, - u'maxWalltime': - u'NULL', - u'scopeLog': - u'panda', - u'transformation': - u'Sim_tf.py', - u'maxDiskCount': + 'maxWalltime': + 'NULL', + 'scopeLog': + 'panda', + 'transformation': + 'Sim_tf.py', + 'maxDiskCount': 0, - u'coreCount': + 'coreCount': ncores, - u'prodDBlockToken': - u'NULL', - u'transferType': - u'NULL', - u'destinationDblock': + 'prodDBlockToken': + 'NULL', + 'transferType': + 'NULL', + 'destinationDblock': job_name, - u'dispatchDBlockToken': - u'NULL', - u'jobPars': ( + 'dispatchDBlockToken': + 'NULL', + 'jobPars': ( '--eventService=%s --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' @@ -389,37 +388,37 @@ def sample_job(is_eventservice, input_output_file_list, nhits_per_file, nevents_ '--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 ' '--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root)' % (str(is_eventservice), inFiles, outFilesShort)), - u'attemptNr': + 'attemptNr': 0, - u'swRelease': - u'Atlas-21.0.15', - u'nucleus': - u'NULL', - u'maxCpuCount': + 'swRelease': + 'Atlas-21.0.15', + 'nucleus': + 'NULL', + 'maxCpuCount': 0, - u'outFiles': + 'outFiles': outFiles, - u'currentPriority': + 'currentPriority': 1000, - u'scopeIn': + 'scopeIn': scope, - u'PandaID': + 'PandaID': pandaID, - u'sourceSite': - u'NULL', - u'dispatchDblock': - u'NULL', - u'prodSourceLabel': - u'ptest', - u'checksum': - u'ad:5d000974', - u'jobName': + 'sourceSite': + 'NULL', + 'dispatchDblock': + 'NULL', + 'prodSourceLabel': + 'ptest', + 'checksum': + 'ad:5d000974', + 'jobName': job_name, - u'ddmEndPointIn': - u'UTA_SWT2_DATADISK', - u'taskID': + 'ddmEndPointIn': + 'UTA_SWT2_DATADISK', + 'taskID': taskId, - u'logFile': - u'%s.job.log.tgz' % job_name + 
'logFile': + '%s.job.log.tgz' % job_name } } diff --git a/tests/harvester/conftest.py b/tests/harvester/conftest.py index 52ac4c8..ea8f1a6 100644 --- a/tests/harvester/conftest.py +++ b/tests/harvester/conftest.py @@ -2,8 +2,9 @@ import queue import pytest - -from raythena.drivers.communicators.harvesterFileMessenger import HarvesterFileCommunicator +from raythena.drivers.communicators.harvesterFileMessenger import ( + HarvesterFileCommunicator, +) from raythena.drivers.communicators.harvesterMock import HarvesterMock from raythena.drivers.communicators.harvesterMock2205 import HarvesterMock2205 diff --git a/tests/harvester/test_harvesterFileMessenger.py b/tests/harvester/test_harvesterFileMessenger.py index f4f0033..515c9c3 100644 --- a/tests/harvester/test_harvesterFileMessenger.py +++ b/tests/harvester/test_harvesterFileMessenger.py @@ -2,7 +2,7 @@ import os import time -from raythena.utils.eventservice import PandaJobRequest, EventRangeRequest +from raythena.utils.eventservice import EventRangeRequest, PandaJobRequest class TestHarvesterFileMessenger: @@ -47,7 +47,7 @@ def test_restart(self, harvester_file_communicator): assert ref_thread == harvester_file_communicator.communicator_thread harvester_file_communicator.stop() assert not harvester_file_communicator.communicator_thread.is_alive() - assert not ref_thread == harvester_file_communicator.communicator_thread + assert ref_thread != harvester_file_communicator.communicator_thread harvester_file_communicator.start() ref_thread = harvester_file_communicator.communicator_thread assert harvester_file_communicator.communicator_thread.is_alive() @@ -70,7 +70,7 @@ def test_get_event_ranges(self, config, harvester_file_communicator, time.sleep(0.01) ranges_res = {} - with open(harvester_file_communicator.eventrequestfile, 'r') as f: + with open(harvester_file_communicator.eventrequestfile) as f: communicator_request = json.load(f) for pandaIDSent, pandaIDCom in zip(evnt_request, communicator_request): diff --git a/tests/test_bookkeeper.py b/tests/test_bookkeeper.py index 705089e..3ad9cf8 100644 --- a/tests/test_bookkeeper.py +++ b/tests/test_bookkeeper.py @@ -1,5 +1,4 @@ import pytest - from raythena.utils.bookkeeper import BookKeeper diff --git a/tests/test_eventservice.py b/tests/test_eventservice.py index 383896d..39565f9 100644 --- a/tests/test_eventservice.py +++ b/tests/test_eventservice.py @@ -1,7 +1,14 @@ import pytest - -from raythena.utils.eventservice import EventRange, EventRangeQueue, EventRangeRequest, EventRangeUpdate -from raythena.utils.eventservice import PandaJob, PandaJobQueue, PandaJobRequest, PandaJobUpdate +from raythena.utils.eventservice import ( + EventRange, + EventRangeQueue, + EventRangeRequest, + EventRangeUpdate, + PandaJob, + PandaJobQueue, + PandaJobRequest, + PandaJobUpdate, +) class TestEventRangeRequest: @@ -186,8 +193,8 @@ def test_build_from_dict(self): "scope": scope } range_from_dict = EventRange.build_from_dict(r_dict) - assert range_from_dict.PFN == pfn and range_from_dict.eventRangeID == id and range_from_dict.startEvent == start \ - and range_from_dict.lastEvent == last and range_from_dict.GUID == guid and range_from_dict.scope == scope + assert pfn == range_from_dict.PFN and range_from_dict.eventRangeID == id and range_from_dict.startEvent == start \ + and range_from_dict.lastEvent == last and guid == range_from_dict.GUID and range_from_dict.scope == scope assert range_from_dict.status == EventRange.READY diff --git a/tests/test_importutils.py b/tests/test_importutils.py index 8c614dc..7318d88 
100644 --- a/tests/test_importutils.py +++ b/tests/test_importutils.py @@ -1,5 +1,4 @@ import pytest - from raythena.utils.importUtils import import_from_string diff --git a/tests/test_pilothttp.py b/tests/test_pilothttp.py index 4fddde6..d91c2e3 100644 --- a/tests/test_pilothttp.py +++ b/tests/test_pilothttp.py @@ -3,9 +3,8 @@ import pytest import requests - from raythena.actors.payloads.eventservice.pilothttp import PilotHttpPayload -from raythena.utils.eventservice import PandaJob, EventRange +from raythena.utils.eventservice import EventRange, PandaJob class MockPopen: diff --git a/tests/test_ray_utils.py b/tests/test_ray_utils.py index 8c72048..c9de77a 100644 --- a/tests/test_ray_utils.py +++ b/tests/test_ray_utils.py @@ -1,8 +1,11 @@ import socket import pytest - -from raythena.utils.ray import cluster_size, build_nodes_resource_list, get_node_ip +from raythena.utils.ray import ( + build_nodes_resource_list, + cluster_size, + get_node_ip, +) @pytest.mark.usefixtures("requires_ray") From 8e78e017a09aa605db6d301dd5b4791174d9bd57 Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 13:31:51 -0700 Subject: [PATCH 02/14] format --- bin/validate-raythena-job.py | 43 +- example/standalone_ray_test_hello_world.py | 29 +- src/raythena/__init__.py | 1 + src/raythena/actors/esworker.py | 230 ++++++--- src/raythena/actors/payloads/basePayload.py | 3 +- .../actors/payloads/eventservice/esPayload.py | 4 +- .../actors/payloads/eventservice/pilothttp.py | 168 +++--- .../drivers/communicators/baseCommunicator.py | 21 +- .../communicators/harvesterFileMessenger.py | 112 ++-- .../drivers/communicators/harvesterMock.py | 235 ++++----- .../communicators/harvesterMock2205.py | 209 +++----- src/raythena/drivers/esdriver.py | 482 +++++++++++++----- src/raythena/scripts/raythena.py | 74 ++- src/raythena/utils/bookkeeper.py | 369 ++++++++++---- src/raythena/utils/config.py | 103 ++-- src/raythena/utils/eventservice.py | 242 +++++---- src/raythena/utils/exception.py | 28 +- src/raythena/utils/importUtils.py | 4 +- src/raythena/utils/logging.py | 15 +- src/raythena/utils/ray.py | 26 +- src/raythena/utils/timing.py | 18 +- tests/conftest.py | 458 +++++++---------- tests/harvester/conftest.py | 24 +- .../harvester/test_harvesterFileMessenger.py | 78 +-- tests/harvester/test_harvesterMock.py | 11 +- tests/test_bookkeeper.py | 120 +++-- tests/test_config.py | 1 - tests/test_driver.py | 1 - tests/test_eventservice.py | 234 +++++---- tests/test_importutils.py | 7 +- tests/test_pilothttp.py | 97 ++-- tests/test_ray_utils.py | 4 +- tests/test_taskstatus.py | 57 ++- 33 files changed, 2125 insertions(+), 1383 deletions(-) diff --git a/bin/validate-raythena-job.py b/bin/validate-raythena-job.py index df10d0a..5e5a295 100644 --- a/bin/validate-raythena-job.py +++ b/bin/validate-raythena-job.py @@ -11,9 +11,9 @@ def get_event_numbers(filename): f = ROOT.TFile.Open(filename) tree = f.Get("POOLCollectionTree") - event_number = array('Q', [0]) + event_number = array("Q", [0]) n_entries = tree.GetEntries() - tree.SetBranchAddress('EventNumber', event_number) + tree.SetBranchAddress("EventNumber", event_number) event_numbers = list() for n in range(n_entries): tree.GetEntry(n) @@ -25,25 +25,52 @@ def validate_job(job_dir, job_state_file): with open(job_state_file) as f: job_state = json.load(f) merged_input_files = job_state["merged"] - merged_output_files = set([list(x.keys())[0] for x in merged_input_files.values()]) + merged_output_files = set( + [list(x.keys())[0] for x in merged_input_files.values()] + ) 
event_numbers = set() for output_file in merged_output_files: output_file_abs = path.join(job_dir, "final", output_file) if not path.isfile(output_file_abs): - print("Expected file " + output_file_abs + " to be present in the job directory") + print( + "Expected file " + + output_file_abs + + " to be present in the job directory" + ) exit(1) current_event_numbers = get_event_numbers(output_file_abs) unique_current_event_numbers = set(current_event_numbers) if len(unique_current_event_numbers) != len(current_event_numbers): - print("Duplicate events in file " + output_file + "(" + str(len(current_event_numbers) - len(unique_current_event_numbers)) + "): ") + print( + "Duplicate events in file " + + output_file + + "(" + + str( + len(current_event_numbers) + - len(unique_current_event_numbers) + ) + + "): " + ) exit(1) - print(str(len(current_event_numbers)) + " events in file " + output_file) + print( + str(len(current_event_numbers)) + " events in file " + output_file + ) if not unique_current_event_numbers.isdisjoint(event_numbers): - print("Found duplicate events in file " + output_file + ": " + str(unique_current_event_numbers & event_numbers)) + print( + "Found duplicate events in file " + + output_file + + ": " + + str(unique_current_event_numbers & event_numbers) + ) exit(1) event_numbers |= unique_current_event_numbers - print("No duplicate found. # events merged: " + str(len(event_numbers)) + ", # of files: " + str(len(merged_output_files))) + print( + "No duplicate found. # events merged: " + + str(len(event_numbers)) + + ", # of files: " + + str(len(merged_output_files)) + ) def main(): diff --git a/example/standalone_ray_test_hello_world.py b/example/standalone_ray_test_hello_world.py index 68a2a46..5b1924e 100755 --- a/example/standalone_ray_test_hello_world.py +++ b/example/standalone_ray_test_hello_world.py @@ -19,7 +19,7 @@ def build_nodes_resource_list(redis_ip: str): nodes = ray.nodes() resource_list = list() for node in nodes: - naddr = node['NodeManagerAddress'] + naddr = node["NodeManagerAddress"] if naddr == redis_ip: continue else: @@ -33,7 +33,9 @@ def __init__(self) -> None: self.pid = os.getpid() self.hostname = platform.node() self.ip = ray._private.services.get_node_ip_address() - print(f"Initial message from PID - {self.pid} Running on host - {self.hostname} {self.ip}") + print( + f"Initial message from PID - {self.pid} Running on host - {self.hostname} {self.ip}" + ) def ping(self): print(f"{self.pid} {self.hostname} {self.ip} - ping") @@ -43,8 +45,11 @@ def ping(self): def main(redis_ip: str, redis_port: str, redis_password: str): redis_address = f"{redis_ip}:{redis_port}" - ray.init(ignore_reinit_error=True, - address="%s" % redis_address, _redis_password="%s" % redis_password) + ray.init( + ignore_reinit_error=True, + address="%s" % redis_address, + _redis_password="%s" % redis_password, + ) # show the ray cluster print(f"Ray Cluster resources : {ray.cluster_resources()}") @@ -74,10 +79,18 @@ def main(redis_ip: str, redis_port: str, redis_password: str): if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Wait on ray head node or workers to connect') - parser.add_argument('--redis-ip', default="%s" % (os.environ["RAYTHENA_RAY_HEAD_IP"])) - parser.add_argument('--redis-port', default="%s" % (os.environ["RAYTHENA_RAY_REDIS_PORT"])) - parser.add_argument('--redis-password', default=os.environ["RAYTHENA_RAY_REDIS_PASSWORD"]) + parser = argparse.ArgumentParser( + description="Wait on ray head node or workers to connect" + ) + 
parser.add_argument( + "--redis-ip", default="%s" % (os.environ["RAYTHENA_RAY_HEAD_IP"]) + ) + parser.add_argument( + "--redis-port", default="%s" % (os.environ["RAYTHENA_RAY_REDIS_PORT"]) + ) + parser.add_argument( + "--redis-password", default=os.environ["RAYTHENA_RAY_REDIS_PASSWORD"] + ) args = parser.parse_args() print(f"args : {args}") main(args.redis_ip, args.redis_port, args.redis_password) diff --git a/src/raythena/__init__.py b/src/raythena/__init__.py index 70edd66..63620c5 100644 --- a/src/raythena/__init__.py +++ b/src/raythena/__init__.py @@ -1,5 +1,6 @@ try: from . import _version + __version__ = _version.__version__ except: # noqa: E722 __version__ = "0.0.0" diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py index 4b55955..aa7bce1 100644 --- a/src/raythena/actors/esworker.py +++ b/src/raythena/actors/esworker.py @@ -58,12 +58,18 @@ class ESWorker: """ READY_FOR_JOB = 0 # initial state, before the first job request - JOB_REQUESTED = 1 # job has been requested to the driver, waiting for result + JOB_REQUESTED = ( + 1 # job has been requested to the driver, waiting for result + ) READY_FOR_EVENTS = 2 # ready to request new events for the current job - EVENT_RANGES_REQUESTED = 3 # event ranges have been requested to the driver, waiting for result + EVENT_RANGES_REQUESTED = ( + 3 # event ranges have been requested to the driver, waiting for result + ) FINISHING_LOCAL_RANGES = 4 # do not request additional ranges, will move to STAGE_OUT once local cache is empty PROCESSING = 5 # currently processing event ranges - FINISHING = 6 # Performing cleanup of resources, preparing final server update + FINISHING = ( + 6 # Performing cleanup of resources, preparing final server update + ) DONE = 7 # Actor has finished processing job STAGE_IN = 8 # Staging-in data. 
STAGE_OUT = 9 # Staging-out data @@ -78,7 +84,7 @@ class ESWorker: FINISHING: "FINISHING", DONE: "DONE", STAGE_IN: "STAGE_IN", - STAGE_OUT: "STAGE_OUT" + STAGE_OUT: "STAGE_OUT", } # authorize state transition from x to y if y in TRANSITION[X] @@ -92,11 +98,19 @@ class ESWorker: PROCESSING: [READY_FOR_EVENTS, STAGE_OUT], STAGE_OUT: [FINISHING], FINISHING: [DONE], - DONE: [READY_FOR_JOB] + DONE: [READY_FOR_JOB], } - def __init__(self, actor_id: str, config: Config, - session_log_dir: str, actor_no: int, actor_count: int, job: PandaJob = None, event_ranges: Sequence[EventRange] = None) -> None: + def __init__( + self, + actor_id: str, + config: Config, + session_log_dir: str, + actor_no: int, + actor_count: int, + job: PandaJob = None, + event_ranges: Sequence[EventRange] = None, + ) -> None: """ Initialize attributes, instantiate a payload and setup the workdir @@ -123,14 +137,23 @@ def __init__(self, actor_id: str, config: Config, self.actor_ray_logs_dir = None self.cpu_monitor = None self.workdir = os.path.expandvars( - self.config.ray.get('workdir', os.getcwd())) + self.config.ray.get("workdir", os.getcwd()) + ) if not os.path.isdir(self.workdir): self.workdir = os.getcwd() self.output_dir = self.config.ray.get("outputdir") - self.pilot_kill_file = os.path.expandvars(self.config.payload.get('pilotkillfile', 'pilot_kill_payload')) - self.pilot_kill_time = self.config.payload.get('pilotkilltime', 600) - self.time_monitor_file = os.path.expandvars(self.config.payload.get('timemonitorfile', 'RaythenaTimeMonitor.txt')) - self.payload: Union[BasePayload, ESPayload] = PilotHttpPayload(self.id, self.config) + self.pilot_kill_file = os.path.expandvars( + self.config.payload.get("pilotkillfile", "pilot_kill_payload") + ) + self.pilot_kill_time = self.config.payload.get("pilotkilltime", 600) + self.time_monitor_file = os.path.expandvars( + self.config.payload.get( + "timemonitorfile", "RaythenaTimeMonitor.txt" + ) + ) + self.payload: Union[BasePayload, ESPayload] = PilotHttpPayload( + self.id, self.config + ) self.start_time = -1 self.time_limit = -1 self.elapsed = 1 @@ -149,20 +172,29 @@ def check_time(self) -> None: """ while True: curtime = datetime.datetime.now() - time_elapsed = curtime.hour * 3600 + curtime.minute * 60 + curtime.second - self.start_time + time_elapsed = ( + curtime.hour * 3600 + + curtime.minute * 60 + + curtime.second + - self.start_time + ) if time_elapsed <= 0: time_elapsed = 24 * 3600 + time_elapsed if time_elapsed // 300 >= self.elapsed: self.elapsed += 1 try: - if self.config.logging.get('copyraylogs', False): + if self.config.logging.get("copyraylogs", False): if os.path.isdir(self.actor_ray_logs_dir): shutil.rmtree(self.actor_ray_logs_dir) - shutil.copytree(self.session_log_dir, self.actor_ray_logs_dir) + shutil.copytree( + self.session_log_dir, self.actor_ray_logs_dir + ) except Exception as e: - self._logger.warning(f"Failed to copy ray logs to actor directory: {e}") + self._logger.warning( + f"Failed to copy ray logs to actor directory: {e}" + ) if time_elapsed > self.time_limit - self.pilot_kill_time: - killsignal = open(self.pilot_kill_file, 'w') + killsignal = open(self.pilot_kill_file, "w") killsignal.close() self._logger.info("killsignal sent to payload") break @@ -186,27 +218,47 @@ def modify_job(self, job: PandaJob) -> PandaJob: input_evnt_file = re.findall(r"\-\-inputEVNTFile=([\w\.\,]*) \-", cmd) if len(input_evnt_file) != 1: return job - in_files = [os.path.join(os.path.expandvars(self.config.harvester['endpoint']), x) - for x in 
input_evnt_file[0].split(",")] + in_files = [ + os.path.join( + os.path.expandvars(self.config.harvester["endpoint"]), x + ) + for x in input_evnt_file[0].split(",") + ] in_files = ",".join(in_files[0:1]) - cmd = re.sub(r"\-\-inputEVNTFile=([\w\.\,]*) \-", f"--inputEVNTFile={in_files} -", cmd) + cmd = re.sub( + r"\-\-inputEVNTFile=([\w\.\,]*) \-", + f"--inputEVNTFile={in_files} -", + cmd, + ) # convert args of the form --outputHITSFile=HITS.30737678._[011001,...].pool.root to --outputHITSFile=HITS.30737678._011001.pool.root - match = re.findall(r"--outputHITSFile=([0-9A-Z._]+)\[([0-9,]+)\](.pool.root)", cmd) + match = re.findall( + r"--outputHITSFile=([0-9A-Z._]+)\[([0-9,]+)\](.pool.root)", cmd + ) if match: match_tuple = match[0] prefix = match_tuple[0] suffix = match_tuple[2] nums = match_tuple[1].split(",") dummy_name = f"{prefix}{nums[0]}{suffix}" - cmd = re.sub(r"--outputHITSFile=[0-9A-Z._]+\[[0-9,]+\].pool.root", f"--outputHITSFile={dummy_name}", cmd) - - job_number = max(int(job["attemptNr"]) - 1, 0) * self.actor_count + self.actor_no + 1 + cmd = re.sub( + r"--outputHITSFile=[0-9A-Z._]+\[[0-9,]+\].pool.root", + f"--outputHITSFile={dummy_name}", + cmd, + ) + + job_number = ( + max(int(job["attemptNr"]) - 1, 0) * self.actor_count + + self.actor_no + + 1 + ) if "--jobNumber=" in cmd: - cmd = re.sub(r"--jobNumber=[0-9]+", f"--jobNumber={job_number}", cmd) + cmd = re.sub( + r"--jobNumber=[0-9]+", f"--jobNumber={job_number}", cmd + ) else: cmd = f"{cmd} --jobNumber={job_number} " - maxEvents = min(500, job['nEventsPerInputFile']) + maxEvents = min(500, job["nEventsPerInputFile"]) if "--maxEvents=" in cmd: cmd = re.sub(r"--maxEvents=[0-9]+", f"--maxEvents={maxEvents}", cmd) else: @@ -227,24 +279,44 @@ def stagein(self) -> None: Raises: StageInFailed: If creating / moving to the work directory fails or the call to the payload stage-in raises an exception. """ - self.payload_job_dir = os.path.join(self.workdir, self.job['PandaID']) + self.payload_job_dir = os.path.join(self.workdir, self.job["PandaID"]) if not os.path.isdir(self.payload_job_dir): - self._logger.warning(f"Specified path {self.payload_job_dir} does not exist. Using cwd {os.getcwd()}") + self._logger.warning( + f"Specified path {self.payload_job_dir} does not exist. 
Using cwd {os.getcwd()}" + ) self.payload_job_dir = self.workdir subdir = f"{self.id}" - self.payload_actor_process_dir = os.path.join(self.payload_job_dir, subdir) - self.payload_actor_output_dir = os.path.join(self.payload_job_dir, subdir, "esOutput") - self.actor_ray_logs_dir = os.path.join(self.payload_actor_process_dir, "ray_logs") + self.payload_actor_process_dir = os.path.join( + self.payload_job_dir, subdir + ) + self.payload_actor_output_dir = os.path.join( + self.payload_job_dir, subdir, "esOutput" + ) + self.actor_ray_logs_dir = os.path.join( + self.payload_actor_process_dir, "ray_logs" + ) try: - time_limit_monitor = open(os.path.join(self.workdir, self.time_monitor_file)) - start_time = time_limit_monitor.readline().split(':') - self.start_time = int(start_time[0]) * 3600 + int(start_time[1]) * 60 + int(start_time[2]) - time_limit = time_limit_monitor.readline().split(':') + time_limit_monitor = open( + os.path.join(self.workdir, self.time_monitor_file) + ) + start_time = time_limit_monitor.readline().split(":") + self.start_time = ( + int(start_time[0]) * 3600 + + int(start_time[1]) * 60 + + int(start_time[2]) + ) + time_limit = time_limit_monitor.readline().split(":") if len(time_limit) < 3: - time_limit = ['0'] + time_limit - self.time_limit = int(time_limit[0]) * 3600 + int(time_limit[1]) * 60 + int(time_limit[2]) - timer_thread = threading.Thread(name='timer', target=self.check_time, daemon=True) + time_limit = ["0"] + time_limit + self.time_limit = ( + int(time_limit[0]) * 3600 + + int(time_limit[1]) * 60 + + int(time_limit[2]) + ) + timer_thread = threading.Thread( + name="timer", target=self.check_time, daemon=True + ) timer_thread.start() except Exception as e: self._logger.warning(f"Failed to setup timer thread: {e}") @@ -252,9 +324,15 @@ def stagein(self) -> None: try: os.mkdir(self.payload_actor_process_dir) os.chdir(self.payload_actor_process_dir) - worker_logfile = self.config.logging.get('workerlogfile', None) + worker_logfile = self.config.logging.get("workerlogfile", None) if worker_logfile: - log_to_file(self.config.logging.get('level', 'warning').upper(), os.path.join(self.payload_actor_process_dir, os.path.basename(worker_logfile))) + log_to_file( + self.config.logging.get("level", "warning").upper(), + os.path.join( + self.payload_actor_process_dir, + os.path.basename(worker_logfile), + ), + ) disable_stdout_logging() self._logger.info(f"Ray worker started on node {gethostname()}") @@ -272,8 +350,11 @@ def stagein(self) -> None: except Exception as e: self._logger.warning(f"Failed to stagein payload: {e}") raise StageInFailed(self.id) - self.transition_state(ESWorker.READY_FOR_EVENTS if self. 
- is_event_service_job() else ESWorker.PROCESSING) + self.transition_state( + ESWorker.READY_FOR_EVENTS + if self.is_event_service_job() + else ESWorker.PROCESSING + ) def stageout(self) -> None: """ @@ -301,10 +382,14 @@ def transition_state(self, dest: int) -> None: IllegalWorkerState if the transition isn't allowed """ if dest not in self.transitions[self.state]: - self._logger.error(f"Illegal transition from {ESWorker.STATES_NAME[self.state]} to {ESWorker.STATES_NAME[dest]}") - raise IllegalWorkerState(worker_id=self.id, - src_state=ESWorker.STATES_NAME[self.state], - dst_state=ESWorker.STATES_NAME[dest]) + self._logger.error( + f"Illegal transition from {ESWorker.STATES_NAME[self.state]} to {ESWorker.STATES_NAME[dest]}" + ) + raise IllegalWorkerState( + worker_id=self.id, + src_state=ESWorker.STATES_NAME[self.state], + dst_state=ESWorker.STATES_NAME[dest], + ) self.state = dest def is_event_service_job(self) -> bool: @@ -365,8 +450,8 @@ def mark_new_job(self) -> WorkerResponse: return self.return_message(Messages.REQUEST_NEW_JOB) def receive_event_ranges( - self, reply: int, - event_ranges: Sequence[EventRange]) -> WorkerResponse: + self, reply: int, event_ranges: Sequence[EventRange] + ) -> WorkerResponse: """ Sends event ranges to be processed by the worker. Update the PFN of event ranges to an absolute path if it is a relative path. If no ranges are provided, the worker will not expect any more ranges in the future and @@ -395,16 +480,15 @@ def receive_event_ranges( for crange in event_ranges: if not os.path.isabs(crange.PFN): crange.PFN = os.path.join( - os.path.expandvars(self.config.harvester['endpoint']), - crange.PFN) + os.path.expandvars(self.config.harvester["endpoint"]), + crange.PFN, + ) self.payload.submit_new_ranges(event_ranges) self.transition_state(ESWorker.PROCESSING) return self.return_message(Messages.REPLY_OK) - def return_message(self, - message: int, - data: Any = None) -> WorkerResponse: + def return_message(self, message: int, data: Any = None) -> WorkerResponse: """ Utility function to build a tuple response for to the driver @@ -459,8 +543,8 @@ def should_request_ranges(self) -> bool: return res def stageout_event_service_files( - self, - ranges_update: Mapping[str, str]) -> Optional[EventRangeUpdate]: + self, ranges_update: Mapping[str, str] + ) -> Optional[EventRangeUpdate]: """ Move the HITS files reported by the pilot payload. Files are moved from the Athena work directory to the worker-specific output directory. 
@@ -471,7 +555,7 @@ def stageout_event_service_files( Returns: Updated event ranges update referencing the moved output files """ - ranges = json.loads(ranges_update['eventRanges'][0]) + ranges = json.loads(ranges_update["eventRanges"][0]) ranges = EventRangeUpdate.build_from_dict(self.job.get_id(), ranges) # stage-out finished event ranges for range_update in ranges[self.job.get_id()]: @@ -488,16 +572,21 @@ def stageout_event_service_files( if cfile: dst = os.path.join( self.output_dir, - os.path.basename(cfile) if os.path.isabs(cfile) else cfile) + os.path.basename(cfile) if os.path.isabs(cfile) else cfile, + ) if os.path.isfile(cfile): try: os.replace(cfile, dst) except OSError as e: - self._logger.error(f"Failed to move file {cfile} to {dst}: errno {e.errno}: {e.strerror}") + self._logger.error( + f"Failed to move file {cfile} to {dst}: errno {e.errno}: {e.strerror}" + ) raise StageOutFailed(self.id) range_update[cfile_key] = dst else: - self._logger.warning(f"Couldn't stageout file {cfile} as it doesn't exist") + self._logger.warning( + f"Couldn't stageout file {cfile} as it doesn't exist" + ) raise StageOutFailed(self.id) return ranges @@ -512,8 +601,9 @@ def get_payload_message(self) -> Optional[WorkerResponse]: ranges_update = self.payload.fetch_ranges_update() if ranges_update: ranges_update = self.stageout_event_service_files(ranges_update) - return self.return_message(Messages.UPDATE_EVENT_RANGES, - ranges_update) + return self.return_message( + Messages.UPDATE_EVENT_RANGES, ranges_update + ) job_update = self.payload.fetch_job_update() if job_update: @@ -556,14 +646,20 @@ def get_message(self) -> WorkerResponse: self.stageout() return self.return_message(Messages.PROCESS_DONE) elif self.is_event_service_job() and ( - self.state == ESWorker.READY_FOR_EVENTS or - self.should_request_ranges()): + self.state == ESWorker.READY_FOR_EVENTS + or self.should_request_ranges() + ): req = EventRangeRequest() - req.add_event_request(self.job['PandaID'], - self.config.resources.get('corepernode', 64), - self.job['taskID'], self.job['jobsetID']) + req.add_event_request( + self.job["PandaID"], + self.config.resources.get("corepernode", 64), + self.job["taskID"], + self.job["jobsetID"], + ) self.transition_state(ESWorker.EVENT_RANGES_REQUESTED) - return self.return_message(Messages.REQUEST_EVENT_RANGES, req) + return self.return_message( + Messages.REQUEST_EVENT_RANGES, req + ) elif self.state == ESWorker.DONE: return self.return_message(Messages.PROCESS_DONE) else: diff --git a/src/raythena/actors/payloads/basePayload.py b/src/raythena/actors/payloads/basePayload.py index 7355167..f527e08 100644 --- a/src/raythena/actors/payloads/basePayload.py +++ b/src/raythena/actors/payloads/basePayload.py @@ -11,8 +11,7 @@ class BasePayload(ABC): panda job specification and are responsible to handle the execution of the """ - def __init__(self, worker_id: str, - config: Config) -> None: + def __init__(self, worker_id: str, config: Config) -> None: """ Setup base payload attributes diff --git a/src/raythena/actors/payloads/eventservice/esPayload.py b/src/raythena/actors/payloads/eventservice/esPayload.py index 742ad2b..0725832 100644 --- a/src/raythena/actors/payloads/eventservice/esPayload.py +++ b/src/raythena/actors/payloads/eventservice/esPayload.py @@ -23,7 +23,9 @@ def __init__(self, worker_id: str, config: Config): super().__init__(worker_id, config) @abstractmethod - def submit_new_ranges(self, event_ranges: Optional[Sequence[EventRange]]) -> None: + def submit_new_ranges( + self, event_ranges: 
Optional[Sequence[EventRange]] + ) -> None: """ Submit a new list of event ranges to the payload. The event ranges should be saved until is can be processed diff --git a/src/raythena/actors/payloads/eventservice/pilothttp.py b/src/raythena/actors/payloads/eventservice/pilothttp.py index 6ecc4c3..6f143e0 100644 --- a/src/raythena/actors/payloads/eventservice/pilothttp.py +++ b/src/raythena/actors/payloads/eventservice/pilothttp.py @@ -96,7 +96,7 @@ def __init__(self, worker_id: str, config: Config) -> None: """ super().__init__(worker_id, config) self._logger = make_logger(self.config, self.worker_id) - self.host = '127.0.0.1' + self.host = "127.0.0.1" self.port = 8080 self.json_encoder = functools.partial(json.dumps, cls=ESEncoder) self.server_thread = None @@ -114,19 +114,25 @@ def __init__(self, worker_id: str, config: Config) -> None: self.ranges_queue = Queue() self.router = AsyncRouter() - self.router.register('/', self.handle_get_job) - self.router.register('/server/panda/getJob', self.handle_get_job) - self.router.register('/server/panda/updateJob', self.handle_update_job) - self.router.register('/server/panda/updateWorkerPilotStatus', self.handle_update_job) - self.router.register('/server/panda/updateJobsInBulk', - self.handle_update_jobs_in_bulk) - self.router.register('/server/panda/getStatus', self.handle_get_status) - self.router.register('/server/panda/getEventRanges', - self.handle_get_event_ranges) - self.router.register('/server/panda/updateEventRanges', - self.handle_update_event_ranges) - self.router.register('/server/panda/getKeyPair', - self.handle_get_key_pair) + self.router.register("/", self.handle_get_job) + self.router.register("/server/panda/getJob", self.handle_get_job) + self.router.register("/server/panda/updateJob", self.handle_update_job) + self.router.register( + "/server/panda/updateWorkerPilotStatus", self.handle_update_job + ) + self.router.register( + "/server/panda/updateJobsInBulk", self.handle_update_jobs_in_bulk + ) + self.router.register("/server/panda/getStatus", self.handle_get_status) + self.router.register( + "/server/panda/getEventRanges", self.handle_get_event_ranges + ) + self.router.register( + "/server/panda/updateEventRanges", self.handle_update_event_ranges + ) + self.router.register( + "/server/panda/getKeyPair", self.handle_get_key_pair + ) def _start_payload(self) -> None: """ @@ -137,13 +143,17 @@ def _start_payload(self) -> None: # we're not reading data using communicate() and the pipe buffer becomes full as pilot2 # generates a lot of data to the stdout pipe # see https://docs.python.org/3.7/library/subprocess.html#subprocess.Popen.wait - self.pilot_process = Popen(command, - stdin=DEVNULL, - stdout=DEVNULL, - stderr=DEVNULL, - shell=True, - close_fds=True) - self._logger.info(f"Pilot payload started with PID {self.pilot_process.pid}") + self.pilot_process = Popen( + command, + stdin=DEVNULL, + stdout=DEVNULL, + stderr=DEVNULL, + shell=True, + close_fds=True, + ) + self._logger.info( + f"Pilot payload started with PID {self.pilot_process.pid}" + ) def _build_pilot_command(self) -> str: """ @@ -159,49 +169,62 @@ def _build_pilot_command(self) -> str: """ cmd = "" - extra_setup = self.config.payload.get('extrasetup', None) + extra_setup = self.config.payload.get("extrasetup", None) if extra_setup is not None: - cmd += f"{extra_setup}{';' if not extra_setup.endswith(';') else ''}" + cmd += ( + f"{extra_setup}{';' if not extra_setup.endswith(';') else ''}" + ) pilot_base = "pilot3" pilot_version = 
self.config.payload.get("pilotversion", "latest") - pilot_src = f"/cvmfs/atlas.cern.ch/repo/sw/PandaPilot/pilot3/{pilot_version}" + pilot_src = ( + f"/cvmfs/atlas.cern.ch/repo/sw/PandaPilot/pilot3/{pilot_version}" + ) if not os.path.isdir(pilot_src): - raise FailedPayload(self.worker_id, f"Pilot release {pilot_src} not found") + raise FailedPayload( + self.worker_id, f"Pilot release {pilot_src} not found" + ) cmd += f"ln -s {pilot_src} {os.path.join(os.getcwd(), pilot_base)};" - prod_source_label = shlex.quote(self.current_job['prodSourceLabel']) + prod_source_label = shlex.quote(self.current_job["prodSourceLabel"]) pilotwrapper_bin = "/cvmfs/atlas.cern.ch/repo/sw/PandaPilotWrapper/latest/runpilot2-wrapper.sh" if not os.path.isfile(pilotwrapper_bin): raise FailedPayload(self.worker_id) - queue_escaped = shlex.quote(self.config.payload['pandaqueue']) + queue_escaped = shlex.quote(self.config.payload["pandaqueue"]) cmd += f"{shlex.quote(pilotwrapper_bin)} --localpy --piloturl local -q {queue_escaped} -r {queue_escaped} -s {queue_escaped} " cmd += "--pilotversion 3 --pythonversion 3 " - cmd += f"-i PR -j {prod_source_label} --container --mute --pilot-user=atlas -t -u --es-executor-type=raythena -v 1 " \ - f"-d --cleanup=False -w generic --use-https False --allow-same-user=False --resource-type MCORE " \ + cmd += ( + f"-i PR -j {prod_source_label} --container --mute --pilot-user=atlas -t -u --es-executor-type=raythena -v 1 " + f"-d --cleanup=False -w generic --use-https False --allow-same-user=False --resource-type MCORE " f"--hpc-resource {shlex.quote(self.config.payload['hpcresource'])};" + ) - extra_script = self.config.payload.get('extrapostpayload', None) + extra_script = self.config.payload.get("extrapostpayload", None) if extra_script is not None: - cmd += f"{extra_script}{';' if not extra_script.endswith(';') else ''}" + cmd += ( + f"{extra_script}{';' if not extra_script.endswith(';') else ''}" + ) cmd_script = os.path.join(os.getcwd(), "payload.sh") - with open(cmd_script, 'w') as f: + with open(cmd_script, "w") as f: f.write(cmd) st = os.stat(cmd_script) os.chmod(cmd_script, st.st_mode | stat.S_IEXEC) payload_log = shlex.quote( - self.config.payload.get('logfilename', 'wrapper')) - return (f"/bin/bash {cmd_script} " - f"> {payload_log} 2> {payload_log}.stderr") + self.config.payload.get("logfilename", "wrapper") + ) + return ( + f"/bin/bash {cmd_script} " + f"> {payload_log} 2> {payload_log}.stderr" + ) def stagein(self) -> None: """ @@ -210,15 +233,25 @@ def stagein(self) -> None: """ cwd = os.getcwd() - ddm_endpoints_file = "/cvmfs/atlas.cern.ch/repo/sw/local/etc/cric_ddmendpoints.json" + ddm_endpoints_file = ( + "/cvmfs/atlas.cern.ch/repo/sw/local/etc/cric_ddmendpoints.json" + ) if os.path.isfile(ddm_endpoints_file): - os.symlink(ddm_endpoints_file, os.path.join(cwd, "cric_ddmendpoints.json")) + os.symlink( + ddm_endpoints_file, os.path.join(cwd, "cric_ddmendpoints.json") + ) - pandaqueues_file = "/cvmfs/atlas.cern.ch/repo/sw/local/etc/cric_pandaqueues.json" + pandaqueues_file = ( + "/cvmfs/atlas.cern.ch/repo/sw/local/etc/cric_pandaqueues.json" + ) if os.path.isfile(pandaqueues_file): - os.symlink(pandaqueues_file, os.path.join(cwd, "cric_pandaqueues.json")) + os.symlink( + pandaqueues_file, os.path.join(cwd, "cric_pandaqueues.json") + ) - queue_escaped = "/cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_schedconf.json" + queue_escaped = ( + "/cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_schedconf.json" + ) if os.path.isfile(queue_escaped): os.symlink(queue_escaped, os.path.join(cwd, 
"queuedata.json")) @@ -236,8 +269,10 @@ def is_complete(self) -> bool: Returns: False if the payload has not finished yet, True otherwise """ - return self.pilot_process is not None and self.pilot_process.poll( - ) is not None + return ( + self.pilot_process is not None + and self.pilot_process.poll() is not None + ) def return_code(self) -> Optional[int]: """ @@ -263,8 +298,7 @@ def start(self, job: PandaJob) -> None: self.current_job = job self.ranges_queue = Queue() self.no_more_ranges = False - self.server_thread = ExThread(target=self.run, - name="http-server") + self.server_thread = ExThread(target=self.run, name="http-server") self.server_thread.start() def stop(self) -> None: @@ -273,27 +307,32 @@ def stop(self) -> None: and wait until it exits then stop the http server """ if self.server_thread and self.server_thread.is_alive(): - pexit = self.pilot_process.poll() if pexit is None: self.pilot_process.terminate() pexit = self.pilot_process.wait() self._logger.debug(f"Payload return code: {pexit}") - asyncio.run_coroutine_threadsafe(self.notify_stop_server_task(), - self.loop) + asyncio.run_coroutine_threadsafe( + self.notify_stop_server_task(), self.loop + ) self.server_thread.join() - def submit_new_range(self, event_range: Optional[EventRange]) -> asyncio.Future: + def submit_new_range( + self, event_range: Optional[EventRange] + ) -> asyncio.Future: """ Submits a new evnet range to the payload thread by adding it to the event ranges queue. Args: event_range: range to forward to pilot """ - return asyncio.run_coroutine_threadsafe(self.ranges_queue.put(event_range), - self.loop) + return asyncio.run_coroutine_threadsafe( + self.ranges_queue.put(event_range), self.loop + ) - def submit_new_ranges(self, event_ranges: Optional[Iterable[EventRange]]) -> None: + def submit_new_ranges( + self, event_ranges: Optional[Iterable[EventRange]] + ) -> None: """ Wrapper for submit_new_range that accepts an iterable of event ranges. @@ -366,8 +405,9 @@ async def http_handler(self, request: web.BaseRequest) -> web.Response: try: return await self.router.route(request.path, request=request) except Exception: - return web.json_response({"StatusCode": 500}, - dumps=self.json_encoder) + return web.json_response( + {"StatusCode": 500}, dumps=self.json_encoder + ) @staticmethod async def parse_qs_body(request: web.BaseRequest) -> Dict[str, List[str]]: @@ -418,8 +458,9 @@ async def handle_update_job(self, request: web.BaseRequest) -> web.Response: # self._logger.debug(f"job update queue size is {self.job_update.qsize()}") return web.json_response(res, dumps=self.json_encoder) - async def handle_get_event_ranges(self, - request: web.BaseRequest) -> web.Response: + async def handle_get_event_ranges( + self, request: web.BaseRequest + ) -> web.Response: """ Handler for getEventRanges call, retrieve event ranges from the queue and returns ranges to pilot. 
If not enough event ranges are available yet, wait until more ranges become available or a message indicating @@ -433,13 +474,13 @@ async def handle_get_event_ranges(self, """ body = await PilotHttpPayload.parse_qs_body(request) status = 0 - panda_id = body['pandaID'][0] + panda_id = body["pandaID"][0] ranges = list() # PandaID does not match the current job, return an error - if panda_id != self.current_job['PandaID']: + if panda_id != self.current_job["PandaID"]: status = -1 else: - n_ranges = int(body['nRanges'][0]) + n_ranges = int(body["nRanges"][0]) if not self.no_more_ranges: for i in range(n_ranges): crange = await self.ranges_queue.get() @@ -452,7 +493,8 @@ async def handle_get_event_ranges(self, return web.json_response(res, dumps=self.json_encoder) async def handle_update_event_ranges( - self, request: web.BaseRequest) -> web.Response: + self, request: web.BaseRequest + ) -> web.Response: """ Handler for updateEventRanges call, adds the event ranges update to a queue to be retrieved by the worker @@ -469,7 +511,8 @@ async def handle_update_event_ranges( return web.json_response(res, dumps=self.json_encoder) async def handle_update_jobs_in_bulk( - self, request: web.BaseRequest) -> web.Response: + self, request: web.BaseRequest + ) -> web.Response: """ Not used by pilot in the current workflow @@ -499,8 +542,9 @@ async def handle_get_status(self, request: web.BaseRequest) -> web.Response: """ raise NotImplementedError(f"{request.path} handler not implemented") - async def handle_get_key_pair(self, - request: web.BaseRequest) -> web.Response: + async def handle_get_key_pair( + self, request: web.BaseRequest + ) -> web.Response: """ Not used by pilot in the current workflow diff --git a/src/raythena/drivers/communicators/baseCommunicator.py b/src/raythena/drivers/communicators/baseCommunicator.py index abfcf82..2ca75d6 100644 --- a/src/raythena/drivers/communicators/baseCommunicator.py +++ b/src/raythena/drivers/communicators/baseCommunicator.py @@ -14,7 +14,13 @@ PandaJobUpdate, ) -RequestData = Union[PandaJobRequest, EventRangeUpdate, JobReport, EventRangeRequest, PandaJobUpdate] +RequestData = Union[ + PandaJobRequest, + EventRangeUpdate, + JobReport, + EventRangeRequest, + PandaJobUpdate, +] class BaseCommunicator(ABC): @@ -23,8 +29,13 @@ class BaseCommunicator(ABC): to be implemented by different communicators as well as setting up queues used to communicate with other threads. 
""" - def __init__(self, requests_queue: 'Queue[RequestData]', job_queue: 'Queue[Mapping[str, JobDef]]', - event_ranges_queue: 'Queue[Mapping[str, Sequence[EventRangeDef]]]', config: Config) -> None: + def __init__( + self, + requests_queue: "Queue[RequestData]", + job_queue: "Queue[Mapping[str, JobDef]]", + event_ranges_queue: "Queue[Mapping[str, Sequence[EventRangeDef]]]", + config: Config, + ) -> None: """ Base constructor setting up queues and application config @@ -36,7 +47,9 @@ def __init__(self, requests_queue: 'Queue[RequestData]', job_queue: 'Queue[Mappi """ self.requests_queue: Queue[RequestData] = requests_queue self.job_queue: Queue[Mapping[str, JobDef]] = job_queue - self.event_ranges_queue: Queue[Mapping[str, Sequence[EventRangeDef]]] = event_ranges_queue + self.event_ranges_queue: Queue[ + Mapping[str, Sequence[EventRangeDef]] + ] = event_ranges_queue self.config = config @abstractmethod diff --git a/src/raythena/drivers/communicators/harvesterFileMessenger.py b/src/raythena/drivers/communicators/harvesterFileMessenger.py index f154936..e3c7cb4 100644 --- a/src/raythena/drivers/communicators/harvesterFileMessenger.py +++ b/src/raythena/drivers/communicators/harvesterFileMessenger.py @@ -27,8 +27,13 @@ class HarvesterFileCommunicator(BaseCommunicator): system is required. """ - def __init__(self, requests_queue: Queue, job_queue: Queue, - event_ranges_queue: Queue, config: Config) -> None: + def __init__( + self, + requests_queue: Queue, + job_queue: Queue, + event_ranges_queue: Queue, + config: Config, + ) -> None: """ Initialize communicator thread and parses the harvester config file @@ -40,15 +45,17 @@ def __init__(self, requests_queue: Queue, job_queue: Queue, """ super().__init__(requests_queue, job_queue, event_ranges_queue, config) self.harvester_workdir = os.path.expandvars( - self.config.harvester['endpoint']) + self.config.harvester["endpoint"] + ) self.ranges_requests_count = 0 self._parse_harvester_config() self.id = "HarvesterCommunicator" self._logger = make_logger(self.config, self.id) self.event_ranges_update_buffer = EventRangeUpdate() self.event_ranges_update_interval = 5 * 60 - self.communicator_thread = ExThread(target=self.run, - name="communicator-thread") + self.communicator_thread = ExThread( + target=self.run, name="communicator-thread" + ) def _parse_harvester_config(self) -> None: """ @@ -66,28 +73,33 @@ def _parse_harvester_config(self) -> None: FileNotFoundError if the harvester config file doesn't exist """ self.harvester_conf_file = os.path.expandvars( - self.config.harvester['harvesterconf']) + self.config.harvester["harvesterconf"] + ) if not os.path.isfile(self.harvester_conf_file): raise FileNotFoundError("Harvester config file not found") self.harvester_conf = configparser.ConfigParser() self.harvester_conf.read(self.harvester_conf_file) - for k in self.harvester_conf['payload_interaction']: + for k in self.harvester_conf["payload_interaction"]: setattr( - self, k, - os.path.join(self.harvester_workdir, - self.harvester_conf['payload_interaction'][k])) + self, + k, + os.path.join( + self.harvester_workdir, + self.harvester_conf["payload_interaction"][k], + ), + ) if not hasattr(self, "jobspecfile"): - self.jobspecfile = '' + self.jobspecfile = "" if not hasattr(self, "jobspecfile"): - self.jobrequestfile = '' + self.jobrequestfile = "" if not hasattr(self, "eventrangesfile"): - self.eventrangesfile = '' + self.eventrangesfile = "" if not hasattr(self, "eventrequestfile"): - self.eventrequestfile = '' + self.eventrequestfile = "" if not 
hasattr(self, "eventstatusdumpjsonfile"): - self.eventstatusdumpjsonfile = '' + self.eventstatusdumpjsonfile = "" if not hasattr(self, "jobreportfile"): - self.jobreportfile = '' + self.jobreportfile = "" def request_job(self, request: PandaJobRequest) -> None: """ @@ -109,7 +121,7 @@ def request_job(self, request: PandaJobRequest) -> None: # create request file if necessary if not os.path.isfile(self.jobrequestfile): request_tmp = f"{self.jobrequestfile}.tmp" - with open(request_tmp, 'w') as f: + with open(request_tmp, "w") as f: json.dump(request.to_dict(), f) shutil.move(request_tmp, self.jobrequestfile) @@ -145,17 +157,22 @@ def request_event_ranges(self, request: EventRangeRequest) -> None: None """ if not os.path.isfile(self.eventrangesfile) and not os.path.exists( - self.eventrequestfile): + self.eventrequestfile + ): event_request_file_tmp = f"{self.eventrequestfile}.tmp" - with open(event_request_file_tmp, 'w') as f: + with open(event_request_file_tmp, "w") as f: json.dump(request.request, f) shutil.move(event_request_file_tmp, self.eventrequestfile) - self._logger.debug(f"request_event_ranges: created new {self.eventrequestfile} file") + self._logger.debug( + f"request_event_ranges: created new {self.eventrequestfile} file" + ) while not os.path.isfile(self.eventrangesfile): time.sleep(1) - self._logger.debug(f"request_event_ranges: found a {self.eventrangesfile} file") + self._logger.debug( + f"request_event_ranges: found a {self.eventrangesfile} file" + ) while os.path.isfile(self.eventrangesfile): try: with open(self.eventrangesfile) as f: @@ -163,10 +180,13 @@ def request_event_ranges(self, request: EventRangeRequest) -> None: if os.path.isfile(self.eventrangesfile): shutil.move( self.eventrangesfile, - f"{self.eventrangesfile}-{self.ranges_requests_count}") + f"{self.eventrangesfile}-{self.ranges_requests_count}", + ) except Exception: time.sleep(5) - if os.path.exists(f"{self.eventrangesfile}-{self.ranges_requests_count}"): + if os.path.exists( + f"{self.eventrangesfile}-{self.ranges_requests_count}" + ): self.ranges_requests_count += 1 try: @@ -212,7 +232,9 @@ def update_events(self, request: EventRangeUpdate) -> None: current_update = json.load(f) os.remove(tmp_status_dump_file) except Exception as e: - self._logger.critical("Failed to read and remove leftover tmp update file. Update will never get reported to harvester.") + self._logger.critical( + "Failed to read and remove leftover tmp update file. Update will never get reported to harvester." 
+ ) self._logger.critical(e) else: request.merge_update(EventRangeUpdate(current_update)) @@ -229,31 +251,43 @@ def update_events(self, request: EventRangeUpdate) -> None: try: shutil.move(tmp_status_dump_file, self.eventstatusdumpjsonfile) except Exception as e: - self._logger.critical(f"Failed to move temporary event status file to harvester dump file: {e}") + self._logger.critical( + f"Failed to move temporary event status file to harvester dump file: {e}" + ) - def merge_write_dump_file(self, request: EventRangeUpdate, tmp_status_dump_file: str) -> None: + def merge_write_dump_file( + self, request: EventRangeUpdate, tmp_status_dump_file: str + ) -> None: if os.path.isfile(self.eventstatusdumpjsonfile): - self._logger.debug("Dump file already exists, merge with upcoming update") + self._logger.debug( + "Dump file already exists, merge with upcoming update" + ) try: shutil.move(self.eventstatusdumpjsonfile, tmp_status_dump_file) with open(tmp_status_dump_file) as f: current_update = json.load(f) except Exception as e: - self._logger.error(f"Failed to move and load existing dump file: {e} ") + self._logger.error( + f"Failed to move and load existing dump file: {e} " + ) else: request.merge_update(EventRangeUpdate(current_update)) self._logger.debug("Writting event ranges update to temporary file") try: - with open(tmp_status_dump_file, 'w') as f: + with open(tmp_status_dump_file, "w") as f: json.dump(request.range_update, f) except Exception as e: - self._logger.error(f"Failed to write event update to temporary file: {e}") + self._logger.error( + f"Failed to write event update to temporary file: {e}" + ) def cleanup_tmp_files(self) -> None: tmp_status_dump_file = f"{self.eventstatusdumpjsonfile}.tmp" if os.path.isfile(tmp_status_dump_file): - self._logger.warning("About to quit with leftover temporary files... Last try to move it") + self._logger.warning( + "About to quit with leftover temporary files... 
Last try to move it"
+            )
             try:
                 with open(tmp_status_dump_file) as f:
                     current_update = json.load(f)
@@ -276,7 +310,7 @@ def create_job_report(self, request: JobReport) -> None:
         """
         job_report_file = f"{self.jobreportfile}"
-        with open(job_report_file, 'w') as f:
+        with open(job_report_file, "w") as f:
             json.dump(request.to_dict(), f)
     def run(self) -> None:
@@ -300,7 +334,10 @@ def run(self) -> None:
                 elif isinstance(request, EventRangeUpdate):
                     self.event_ranges_update_buffer.merge_update(request)
                     now = time.time()
-                    if now - last_event_range_update > self.event_ranges_update_interval:
+                    if (
+                        now - last_event_range_update
+                        > self.event_ranges_update_interval
+                    ):
                         self.update_events(self.event_ranges_update_buffer)
                         last_event_range_update = now
                         self.event_ranges_update_buffer = EventRangeUpdate()
@@ -309,7 +346,9 @@ def run(self) -> None:
                 else: # if any other request is received, stop the thread
                     break
             except Exception as e:
-                self._logger.error(f"Exception occured while handling request: {e}")
+                self._logger.error(
+                    f"Exception occurred while handling request: {e}"
+                )
         if self.event_ranges_update_buffer:
             self.update_events(self.event_ranges_update_buffer)
@@ -336,5 +375,6 @@ def stop(self) -> None:
         if self.communicator_thread.is_alive():
             self.requests_queue.put(None)
             self.communicator_thread.join()
-        self.communicator_thread = ExThread(target=self.run,
-                                            name="communicator-thread")
+        self.communicator_thread = ExThread(
+            target=self.run, name="communicator-thread"
+        )
diff --git a/src/raythena/drivers/communicators/harvesterMock.py b/src/raythena/drivers/communicators/harvesterMock.py
index b94b378..e37d0ee 100644
--- a/src/raythena/drivers/communicators/harvesterMock.py
+++ b/src/raythena/drivers/communicators/harvesterMock.py
@@ -25,24 +25,30 @@ class HarvesterMock(BaseCommunicator):
     Input files specified in the inFiles attribute should exist in the ray workdir before starting ray
     """
-    def __init__(self, requests_queue: Queue, job_queue: Queue,
-                 event_ranges_queue: Queue, config: Config) -> None:
+    def __init__(
+        self,
+        requests_queue: Queue,
+        job_queue: Queue,
+        event_ranges_queue: Queue,
+        config: Config,
+    ) -> None:
         super().__init__(requests_queue, job_queue, event_ranges_queue, config)
         """
         Initialize communicator thread, input files name, job worker_id, number of events to be distributed
         """
-        self.communicator_thread = ExThread(target=self.run,
-                                            name="communicator-thread")
+        self.communicator_thread = ExThread(
+            target=self.run, name="communicator-thread"
+        )
         self.event_ranges = None
         self.pandaID = random.randint(0, 100)
         self.jobsetId = random.randint(0, 100)
         self.taskId = random.randint(0, 100)
         self.config = config
-        self.scope = 'mc16_13TeV'
-        self.guid = '74DFB3ED-DAA7-E011-8954-001E4F3D9CB1,74DFB3ED-DAA7-E011-8954-001E4F3D9CB1'
+        self.scope = "mc16_13TeV"
+        self.guid = "74DFB3ED-DAA7-E011-8954-001E4F3D9CB1,74DFB3ED-DAA7-E011-8954-001E4F3D9CB1"
        self.guids = self.guid.split(",")
         self.inFiles = "EVNT.12458444._000048.pool.root.1,EVNT.12458444._000052.pool.root.1"
-        workdir = os.path.expandvars(self.config.ray['workdir'])
+        workdir = os.path.expandvars(self.config.ray["workdir"])
         self.files = self.inFiles.split(",")
         self.nfiles = len(self.files)
         self.inFilesAbs = list()
@@ -52,7 +58,7 @@ def __init__(self, requests_queue: Queue, job_queue: Queue,
         self.nevents_per_file = 5000
         self.nevents = self.nevents_per_file * self.nfiles
         self.served_events = 0
-        self.ncores = self.config.resources['corepernode']
+        self.ncores = self.config.resources["corepernode"]
     def run(self) -> None:
""" @@ -117,20 +123,24 @@ def request_event_ranges(self, request: EventRangeRequest) -> None: for pandaID in request: range_list = list() request_dict = request[pandaID] - nranges = min(self.nevents - self.served_events, - request_dict['nRanges']) - for i in range(self.served_events + 1, - self.served_events + nranges + 1): + nranges = min( + self.nevents - self.served_events, request_dict["nRanges"] + ) + for i in range( + self.served_events + 1, self.served_events + nranges + 1 + ): file_idx = self.served_events // self.nevents_per_file range_id = f"Range-{i:05}" - range_list.append({ - 'lastEvent': i - file_idx * self.nevents_per_file, - 'eventRangeID': range_id, - 'startEvent': i - file_idx * self.nevents_per_file, - 'scope': self.scope, - 'LFN': self.inFilesAbs[file_idx], - 'GUID': self.guids[file_idx] - }) + range_list.append( + { + "lastEvent": i - file_idx * self.nevents_per_file, + "eventRangeID": range_id, + "startEvent": i - file_idx * self.nevents_per_file, + "scope": self.scope, + "LFN": self.inFilesAbs[file_idx], + "GUID": self.guids[file_idx], + } + ) self.served_events += 1 @@ -166,7 +176,7 @@ def get_panda_queue_name(self) -> str: Returns: The name of the pandaqueue from which jobs are retrieved. """ - return self.config.payload['pandaqueue'] + return self.config.payload["pandaqueue"] def request_job(self, job_request: PandaJobRequest) -> None: """ @@ -181,122 +191,77 @@ def request_job(self, job_request: PandaJobRequest) -> None: """ md5_hash = hashlib.md5() - md5_hash.update(str(time.time()).encode('utf-8')) + md5_hash.update(str(time.time()).encode("utf-8")) log_guid = md5_hash.hexdigest() - md5_hash.update(str(time.time()).encode('utf-8')) + md5_hash.update(str(time.time()).encode("utf-8")) job_name = md5_hash.hexdigest() - self.job_queue.put({ - str(self.pandaID): { - 'jobsetID': - self.jobsetId, - 'logGUID': - log_guid, - 'cmtConfig': - 'x86_64-slc6-gcc49-opt', - 'prodDBlocks': - 'user.mlassnig:user.mlassnig.pilot.test.single.hits', - 'dispatchDBlockTokenForOut': - 'NULL,NULL', - 'destinationDBlockToken': - 'NULL,NULL', - 'destinationSE': - self.get_panda_queue_name(), - 'realDatasets': - job_name, - 'prodUserID': - 'no_one', - 'GUID': - self.guid, - 'realDatasetsIn': - 'user.mlassnig:user.mlassnig.pilot.test.single.hits', - 'nSent': - 0, - 'eventService': - 'true', - 'cloud': - 'US', - 'StatusCode': - 0, - 'homepackage': - 'AtlasOffline/21.0.15', - 'inFiles': - self.inFiles, - 'processingType': - 'pilot-ptest', - 'ddmEndPointOut': - 'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - 'fsize': - '118612262', - 'fileDestinationSE': - f"{self.get_panda_queue_name()},{self.get_panda_queue_name()}", - 'scopeOut': - 'panda', - 'minRamCount': - 0, - 'jobDefinitionID': - 7932, - 'maxWalltime': - 'NULL', - 'scopeLog': - 'panda', - 'transformation': - 'Sim_tf.py', - 'maxDiskCount': - 0, - 'coreCount': - self.ncores, - 'prodDBlockToken': - 'NULL', - 'transferType': - 'NULL', - 'destinationDblock': - job_name, - 'dispatchDBlockToken': - 'NULL', - 'jobPars': ( - '--eventService=True --skipEvents=0 --firstEvent=1 --preExec \'from AthenaCommon.DetFlags ' - 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' - 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff()\' ' - '--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so ' - '--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,' - 'SimulationJobOptions/preInclude.BeamPipeKill.py ' - '--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT ' - '--randomSeed 1234 
--conditionsTag OFLCOND-MC12-SIM-00 ' - '--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root' - % (self.inFiles, job_name)), - 'attemptNr': - 0, - 'swRelease': - 'Atlas-21.0.15', - 'nucleus': - 'NULL', - 'maxCpuCount': - 0, - 'outFiles': - 'HITS_%s.pool.root,%s.job.log.tgz' % (job_name, job_name), - 'currentPriority': - 1000, - 'scopeIn': - self.scope, - 'PandaID': - self.pandaID, - 'sourceSite': - 'NULL', - 'dispatchDblock': - 'NULL', - 'prodSourceLabel': - 'ptest', - 'checksum': - 'ad:5d000974', - 'jobName': - job_name, - 'ddmEndPointIn': - 'UTA_SWT2_DATADISK', - 'taskID': - self.taskId, - 'logFile': - '%s.job.log.tgz' % job_name + self.job_queue.put( + { + str(self.pandaID): { + "jobsetID": self.jobsetId, + "logGUID": log_guid, + "cmtConfig": "x86_64-slc6-gcc49-opt", + "prodDBlocks": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "dispatchDBlockTokenForOut": "NULL,NULL", + "destinationDBlockToken": "NULL,NULL", + "destinationSE": self.get_panda_queue_name(), + "realDatasets": job_name, + "prodUserID": "no_one", + "GUID": self.guid, + "realDatasetsIn": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "nSent": 0, + "eventService": "true", + "cloud": "US", + "StatusCode": 0, + "homepackage": "AtlasOffline/21.0.15", + "inFiles": self.inFiles, + "processingType": "pilot-ptest", + "ddmEndPointOut": "UTA_SWT2_DATADISK,UTA_SWT2_DATADISK", + "fsize": "118612262", + "fileDestinationSE": f"{self.get_panda_queue_name()},{self.get_panda_queue_name()}", + "scopeOut": "panda", + "minRamCount": 0, + "jobDefinitionID": 7932, + "maxWalltime": "NULL", + "scopeLog": "panda", + "transformation": "Sim_tf.py", + "maxDiskCount": 0, + "coreCount": self.ncores, + "prodDBlockToken": "NULL", + "transferType": "NULL", + "destinationDblock": job_name, + "dispatchDBlockToken": "NULL", + "jobPars": ( + "--eventService=True --skipEvents=0 --firstEvent=1 --preExec 'from AthenaCommon.DetFlags " + "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" + "DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff()' " + "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " + "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py," + "SimulationJobOptions/preInclude.BeamPipeKill.py " + "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT " + "--randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " + "--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root" + % (self.inFiles, job_name) + ), + "attemptNr": 0, + "swRelease": "Atlas-21.0.15", + "nucleus": "NULL", + "maxCpuCount": 0, + "outFiles": "HITS_%s.pool.root,%s.job.log.tgz" + % (job_name, job_name), + "currentPriority": 1000, + "scopeIn": self.scope, + "PandaID": self.pandaID, + "sourceSite": "NULL", + "dispatchDblock": "NULL", + "prodSourceLabel": "ptest", + "checksum": "ad:5d000974", + "jobName": job_name, + "ddmEndPointIn": "UTA_SWT2_DATADISK", + "taskID": self.taskId, + "logFile": "%s.job.log.tgz" % job_name, + } } - }) + ) diff --git a/src/raythena/drivers/communicators/harvesterMock2205.py b/src/raythena/drivers/communicators/harvesterMock2205.py index fe21803..58e51f3 100644 --- a/src/raythena/drivers/communicators/harvesterMock2205.py +++ b/src/raythena/drivers/communicators/harvesterMock2205.py @@ -15,24 +15,30 @@ class HarvesterMock2205(HarvesterMock): Same purposes as HarvesterMock except that a job spec for Athena/22.0.5 is provided """ - def __init__(self, requests_queue: Queue, job_queue: Queue, - event_ranges_queue: Queue, config: Config) 
-> None: + def __init__( + self, + requests_queue: Queue, + job_queue: Queue, + event_ranges_queue: Queue, + config: Config, + ) -> None: """ Initialize communicator thread, input files name, job worker_id, number of events to be distributed """ super().__init__(requests_queue, job_queue, event_ranges_queue, config) - self.communicator_thread = ExThread(target=self.run, - name="communicator-thread") + self.communicator_thread = ExThread( + target=self.run, name="communicator-thread" + ) self.event_ranges = None self.pandaID = random.randint(0, 100) self.jobsetId = random.randint(0, 100) self.taskId = random.randint(0, 100) self.config = config - self.scope = 'mc16_13TeV' - self.guid = '74DFB3ED-DAA7-E011-8954-001E4F3D9CB1,74DFB3ED-DAA7-E011-8954-001E4F3D9CB1' + self.scope = "mc16_13TeV" + self.guid = "74DFB3ED-DAA7-E011-8954-001E4F3D9CB1,74DFB3ED-DAA7-E011-8954-001E4F3D9CB1" self.guids = self.guid.split(",") self.inFiles = "EVNT.12458444._000048.pool.root.1,EVNT.12458444._000052.pool.root.1" - workdir = os.path.expandvars(self.config.ray['workdir']) + workdir = os.path.expandvars(self.config.ray["workdir"]) self.files = self.inFiles.split(",") self.nfiles = len(self.files) self.inFilesAbs = list() @@ -42,7 +48,7 @@ def __init__(self, requests_queue: Queue, job_queue: Queue, self.nevents_per_file = 50 self.nevents = self.nevents_per_file * self.nfiles self.served_events = 0 - self.ncores = self.config.resources['corepernode'] + self.ncores = self.config.resources["corepernode"] def request_job(self, job_request: PandaJobRequest) -> None: """ @@ -57,124 +63,79 @@ def request_job(self, job_request: PandaJobRequest) -> None: """ md5_hash = hashlib.md5() - md5_hash.update(str(time.time()).encode('utf-8')) + md5_hash.update(str(time.time()).encode("utf-8")) log_guid = md5_hash.hexdigest() - md5_hash.update(str(time.time()).encode('utf-8')) + md5_hash.update(str(time.time()).encode("utf-8")) job_name = md5_hash.hexdigest() - self.job_queue.put({ - str(self.pandaID): { - 'jobsetID': - self.jobsetId, - 'logGUID': - log_guid, - 'cmtConfig': - 'x86_64-centos7-gcc8-opt', - 'prodDBlocks': - 'user.mlassnig:user.mlassnig.pilot.test.single.hits', - 'dispatchDBlockTokenForOut': - 'NULL,NULL', - 'destinationDBlockToken': - 'NULL,NULL', - 'destinationSE': - self.get_panda_queue_name(), - 'realDatasets': - job_name, - 'prodUserID': - 'no_one', - 'GUID': - self.guid, - 'realDatasetsIn': - 'user.mlassnig:user.mlassnig.pilot.test.single.hits', - 'nSent': - 0, - 'eventService': - 'true', - 'cloud': - 'US', - 'StatusCode': - 0, - 'homepackage': - 'Athena/22.0.5', - 'inFiles': - self.inFiles, - 'processingType': - 'pilot-ptest', - 'ddmEndPointOut': - 'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - 'fsize': - '118612262', - 'fileDestinationSE': - f"{self.get_panda_queue_name()},{self.get_panda_queue_name()}", - 'scopeOut': - 'panda', - 'minRamCount': - 0, - 'jobDefinitionID': - 7932, - 'maxWalltime': - 'NULL', - 'scopeLog': - 'panda', - 'transformation': - 'Sim_tf.py', - 'maxDiskCount': - 0, - 'coreCount': - self.ncores, - 'prodDBlockToken': - 'NULL', - 'transferType': - 'NULL', - 'destinationDblock': - job_name, - 'dispatchDBlockToken': - 'NULL', - 'jobPars': ( - '--multiprocess --eventService=True --skipEvents=0 --firstEvent=1 ' - '--preExec \'from AthenaCommon.DetFlags ' - 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' - 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff()\' ' - '--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so ' - '--preInclude 
sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,' - 'SimulationJobOptions/preInclude.BeamPipeKill.py ' - '--geometryVersion default:ATLAS-R2-2016-01-00-01_VALIDATION ' - '--physicsList FTFP_BERT_ATL_VALIDATION --randomSeed 1234 ' - '--conditionsTag default:OFLCOND-MC16-SDR-14 ' - '--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root' - % (self.inFiles, job_name)), - 'attemptNr': - 0, - 'swRelease': - 'Atlas-22.0.5', - 'nucleus': - 'NULL', - 'maxCpuCount': - 0, - 'outFiles': - 'HITS_%s.pool.root,%s.job.log.tgz' % (job_name, job_name), - 'currentPriority': - 1000, - 'scopeIn': - self.scope, - 'PandaID': - self.pandaID, - 'sourceSite': - 'NULL', - 'dispatchDblock': - 'NULL', - 'prodSourceLabel': - 'ptest', - 'checksum': - 'ad:5d000974', - 'jobName': - job_name, - 'ddmEndPointIn': - 'UTA_SWT2_DATADISK', - 'taskID': - self.taskId, - 'logFile': - '%s.job.log.tgz' % job_name + self.job_queue.put( + { + str(self.pandaID): { + "jobsetID": self.jobsetId, + "logGUID": log_guid, + "cmtConfig": "x86_64-centos7-gcc8-opt", + "prodDBlocks": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "dispatchDBlockTokenForOut": "NULL,NULL", + "destinationDBlockToken": "NULL,NULL", + "destinationSE": self.get_panda_queue_name(), + "realDatasets": job_name, + "prodUserID": "no_one", + "GUID": self.guid, + "realDatasetsIn": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "nSent": 0, + "eventService": "true", + "cloud": "US", + "StatusCode": 0, + "homepackage": "Athena/22.0.5", + "inFiles": self.inFiles, + "processingType": "pilot-ptest", + "ddmEndPointOut": "UTA_SWT2_DATADISK,UTA_SWT2_DATADISK", + "fsize": "118612262", + "fileDestinationSE": f"{self.get_panda_queue_name()},{self.get_panda_queue_name()}", + "scopeOut": "panda", + "minRamCount": 0, + "jobDefinitionID": 7932, + "maxWalltime": "NULL", + "scopeLog": "panda", + "transformation": "Sim_tf.py", + "maxDiskCount": 0, + "coreCount": self.ncores, + "prodDBlockToken": "NULL", + "transferType": "NULL", + "destinationDblock": job_name, + "dispatchDBlockToken": "NULL", + "jobPars": ( + "--multiprocess --eventService=True --skipEvents=0 --firstEvent=1 " + "--preExec 'from AthenaCommon.DetFlags " + "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" + "DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff()' " + "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " + "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py," + "SimulationJobOptions/preInclude.BeamPipeKill.py " + "--geometryVersion default:ATLAS-R2-2016-01-00-01_VALIDATION " + "--physicsList FTFP_BERT_ATL_VALIDATION --randomSeed 1234 " + "--conditionsTag default:OFLCOND-MC16-SDR-14 " + "--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root" + % (self.inFiles, job_name) + ), + "attemptNr": 0, + "swRelease": "Atlas-22.0.5", + "nucleus": "NULL", + "maxCpuCount": 0, + "outFiles": "HITS_%s.pool.root,%s.job.log.tgz" + % (job_name, job_name), + "currentPriority": 1000, + "scopeIn": self.scope, + "PandaID": self.pandaID, + "sourceSite": "NULL", + "dispatchDblock": "NULL", + "prodSourceLabel": "ptest", + "checksum": "ad:5d000974", + "jobName": job_name, + "ddmEndPointIn": "UTA_SWT2_DATADISK", + "taskID": self.taskId, + "logFile": "%s.job.log.tgz" % job_name, + } } - }) + ) diff --git a/src/raythena/drivers/esdriver.py b/src/raythena/drivers/esdriver.py index c70d653..a3d9603 100644 --- a/src/raythena/drivers/esdriver.py +++ b/src/raythena/drivers/esdriver.py @@ -83,16 +83,20 @@ def __init__(self, 
config: Config, session_dir: str) -> None: self.id = "Driver" self._logger = make_logger(self.config, self.id) self.session_log_dir = os.path.join(self.session_dir, "logs") - self.nodes = build_nodes_resource_list(self.config, run_actor_on_head=False) + self.nodes = build_nodes_resource_list( + self.config, run_actor_on_head=False + ) self.requests_queue: Queue[RequestData] = Queue() self.jobs_queue: Queue[Mapping[str, JobDef]] = Queue() - self.event_ranges_queue: Queue[Mapping[str, Sequence[EventRangeDef]]] = Queue() + self.event_ranges_queue: Queue[ + Mapping[str, Sequence[EventRangeDef]] + ] = Queue() - workdir = os.path.expandvars(self.config.ray.get('workdir')) + workdir = os.path.expandvars(self.config.ray.get("workdir")) if not workdir or not os.path.exists(workdir): workdir = os.getcwd() - self.config.ray['workdir'] = workdir + self.config.ray["workdir"] = workdir self.workdir = workdir self.output_dir = "" self.merged_files_dir = "" @@ -102,16 +106,20 @@ def __init__(self, config: Config, session_dir: str) -> None: # TODO removing stdout on the root logger will also disable ray logging and collected stdout from actors disable_stdout_logging() - self._logger.debug(f"Raythena v{__version__} initializing, running Ray {ray.__version__} on {gethostname()}") + self._logger.debug( + f"Raythena v{__version__} initializing, running Ray {ray.__version__} on {gethostname()}" + ) self.task_workdir_path_file = f"{workdir}/task_workdir_path.txt" # self.cpu_monitor = CPUMonitor(os.path.join(workdir, "cpu_monitor_driver.json")) # self.cpu_monitor.start() - self.communicator: BaseCommunicator = HarvesterFileCommunicator(self.requests_queue, - self.jobs_queue, - self.event_ranges_queue, - self.config) + self.communicator: BaseCommunicator = HarvesterFileCommunicator( + self.requests_queue, + self.jobs_queue, + self.event_ranges_queue, + self.config, + ) self.communicator.start() self.requests_queue.put(PandaJobRequest()) self.actors: Dict[str, ESWorker] = dict() @@ -124,42 +132,67 @@ def __init__(self, config: Config, session_dir: str) -> None: self.max_retries_error_failed_tasks = 3 self.first_event_range_request = True self.no_more_events = False - self.cache_size_factor = self.config.ray.get('cachesizefactor', 3) - self.cores_per_node = self.config.resources.get('corepernode', os.cpu_count()) + self.cache_size_factor = self.config.ray.get("cachesizefactor", 3) + self.cores_per_node = self.config.resources.get( + "corepernode", os.cpu_count() + ) self.n_actors = len(self.nodes) - self.events_cache_size = self.cores_per_node * self.n_actors * self.cache_size_factor - self.timeoutinterval = self.config.ray['timeoutinterval'] - self.max_running_merge_transforms = self.config.ray['mergemaxprocesses'] + self.events_cache_size = ( + self.cores_per_node * self.n_actors * self.cache_size_factor + ) + self.timeoutinterval = self.config.ray["timeoutinterval"] + self.max_running_merge_transforms = self.config.ray["mergemaxprocesses"] self.panda_taskid = None - self.pandaqueue = self.config.payload['pandaqueue'] + self.pandaqueue = self.config.payload["pandaqueue"] parser = configparser.ConfigParser() - harvester_config = self.config.harvester['harvesterconf'] + harvester_config = self.config.harvester["harvesterconf"] self.queuedata_file = "" self.container_options = "" self.container_type = "" self.jobreport_name = "" if not os.path.isfile(harvester_config): - self._logger.warning(f"Couldn't find harvester config file {harvester_config}") + self._logger.warning( + f"Couldn't find harvester config file 
{harvester_config}" + ) else: parser.read(harvester_config) - queuedata_config = [queue.split('|')[-1] for queue in parser["cacher"]["data"].splitlines() if queue.startswith(self.pandaqueue)] + queuedata_config = [ + queue.split("|")[-1] + for queue in parser["cacher"]["data"].splitlines() + if queue.startswith(self.pandaqueue) + ] self.jobreport_name = parser["payload_interaction"]["jobReportFile"] if not queuedata_config: - self._logger.warning(f"No queuedata config found for {self.pandaqueue}") + self._logger.warning( + f"No queuedata config found for {self.pandaqueue}" + ) elif not os.path.isfile(queuedata_config[0]): - self._logger.warning(f"cached queudata file not found: {queuedata_config[0]}") + self._logger.warning( + f"cached queudata file not found: {queuedata_config[0]}" + ) else: self.queuedata_file = queuedata_config[0] with open(self.queuedata_file) as f: queuedata = json.load(f) self.container_options = queuedata["container_options"] - self.container_type = queuedata["container_type"].split(":")[0] - if self.container_type != self.config.payload['containerengine']: - self._logger.warning("Mismatch between pandaqueue and raythena container type. Overriding raythena config") - self.config.payload['containerengine'] = self.container_type + self.container_type = queuedata["container_type"].split( + ":" + )[0] + if ( + self.container_type + != self.config.payload["containerengine"] + ): + self._logger.warning( + "Mismatch between pandaqueue and raythena container type. Overriding raythena config" + ) + self.config.payload["containerengine"] = ( + self.container_type + ) # {input_filename, {merged_output_filename, ([(event_range_id, EventRange)], subprocess handle)}} - self.running_merge_transforms: Dict[str, Tuple[List[Tuple[str, EventRange]], Popen, str]] = dict() + self.running_merge_transforms: Dict[ + str, Tuple[List[Tuple[str, EventRange]], Popen, str] + ] = dict() self.total_running_merge_transforms = 0 self.failed_actor_tasks_count = dict() self.available_events_per_actor = 0 @@ -205,32 +238,41 @@ def create_actors(self) -> None: Returns: None """ - events_per_actor = min(self.available_events_per_actor, self.cores_per_node) + events_per_actor = min( + self.available_events_per_actor, self.cores_per_node + ) for i, node in enumerate(self.nodes): - nodeip = node['NodeManagerAddress'] + nodeip = node["NodeManagerAddress"] node_constraint = f"node:{nodeip}" actor_id = f"Actor_{i}" kwargs = { - 'actor_id': actor_id, - 'config': self.config_remote, - 'session_log_dir': self.session_log_dir, - 'actor_no': i, - 'actor_count': self.n_actors, + "actor_id": actor_id, + "config": self.config_remote, + "session_log_dir": self.session_log_dir, + "actor_no": i, + "actor_count": self.n_actors, } job = self.bookKeeper.assign_job_to_actor(actor_id) if job: - job_remote = self.remote_jobdef_byid[job['PandaID']] - kwargs['job'] = job_remote - event_ranges = self.bookKeeper.fetch_event_ranges(actor_id, events_per_actor) + job_remote = self.remote_jobdef_byid[job["PandaID"]] + kwargs["job"] = job_remote + event_ranges = self.bookKeeper.fetch_event_ranges( + actor_id, events_per_actor + ) if event_ranges: - kwargs['event_ranges'] = event_ranges + kwargs["event_ranges"] = event_ranges self._logger.debug( - f"Prefetched job {job['PandaID']} and {len(event_ranges)} event ranges for {actor_id}") + f"Prefetched job {job['PandaID']} and {len(event_ranges)} event ranges for {actor_id}" + ) - actor = ESWorker.options(resources={node_constraint: 1}).remote(**kwargs) + actor = 
ESWorker.options(resources={node_constraint: 1}).remote( + **kwargs + ) self.actors[actor_id] = actor - def retrieve_actors_messages(self, ready: Sequence[ObjectRef]) -> Iterator[WorkerResponse]: + def retrieve_actors_messages( + self, ready: Sequence[ObjectRef] + ) -> Iterator[WorkerResponse]: """ Given a list of ready futures from actors, unwrap them and return an interable over the result of each future. In case one of the futures raised an exception, the exception is handled by this function and not propagated to the caller. @@ -253,7 +295,9 @@ def retrieve_actors_messages(self, ready: Sequence[ObjectRef]) -> Iterator[Worke except RayActorError as e: self._logger.error(f"RayActorError: {e.error_msg}") except Exception as e: - self._logger.error(f"Caught exception while fetching result from {self.pending_objectref_to_actor[r]}: {e}") + self._logger.error( + f"Caught exception while fetching result from {self.pending_objectref_to_actor[r]}: {e}" + ) else: yield actor_id, message, data else: @@ -274,13 +318,19 @@ def handle_actors(self) -> None: new_messages, self.actors_message_queue = self.wait_on_messages() total_sent = 0 while new_messages and self.running: - for actor_id, message, data in self.retrieve_actors_messages(new_messages): + for actor_id, message, data in self.retrieve_actors_messages( + new_messages + ): if message == Messages.IDLE or message == Messages.REPLY_OK: - self.enqueue_actor_call(actor_id, self[actor_id].get_message.remote()) + self.enqueue_actor_call( + actor_id, self[actor_id].get_message.remote() + ) elif message == Messages.REQUEST_NEW_JOB: self.handle_job_request(actor_id) elif message == Messages.REQUEST_EVENT_RANGES: - total_sent = self.handle_request_event_ranges(actor_id, data, total_sent) + total_sent = self.handle_request_event_ranges( + actor_id, data, total_sent + ) elif message == Messages.UPDATE_JOB: self.handle_update_job(actor_id, data) elif message == Messages.UPDATE_EVENT_RANGES: @@ -290,7 +340,9 @@ def handle_actors(self) -> None: self.on_tick() new_messages, self.actors_message_queue = self.wait_on_messages() - self._logger.debug("Finished handling the Actors. Raythena will shutdown now.") + self._logger.debug( + "Finished handling the Actors. Raythena will shutdown now." 
+ ) def wait_on_messages(self) -> Tuple[List[ObjectRef], List[ObjectRef]]: """ @@ -308,13 +360,22 @@ def wait_on_messages(self) -> Tuple[List[ObjectRef], List[ObjectRef]]: timeoutinterval = None messages, queue = ray.wait( - self.actors_message_queue, num_returns=max(1, len(self.actors_message_queue) // 2), timeout=1) + self.actors_message_queue, + num_returns=max(1, len(self.actors_message_queue) // 2), + timeout=1, + ) if not messages: messages, queue = ray.wait( - self.actors_message_queue, num_returns=max(1, len(self.actors_message_queue) // 10), timeout=1) + self.actors_message_queue, + num_returns=max(1, len(self.actors_message_queue) // 10), + timeout=1, + ) if not messages: messages, queue = ray.wait( - self.actors_message_queue, num_returns=1, timeout=timeoutinterval) + self.actors_message_queue, + num_returns=1, + timeout=timeoutinterval, + ) return messages, queue def handle_actor_done(self, actor_id: str) -> bool: @@ -332,7 +393,9 @@ def handle_actor_done(self, actor_id: str) -> bool: # TODO: Temporary hack has_jobs = False if has_jobs: - self.enqueue_actor_call(actor_id, self[actor_id].mark_new_job.remote()) + self.enqueue_actor_call( + actor_id, self[actor_id].mark_new_job.remote() + ) else: self.terminated.append(actor_id) self.bookKeeper.process_actor_end(actor_id) @@ -340,7 +403,9 @@ def handle_actor_done(self, actor_id: str) -> bool: # do not get new messages from this actor return has_jobs - def handle_update_event_ranges(self, actor_id: str, data: EventRangeUpdate) -> None: + def handle_update_event_ranges( + self, actor_id: str, data: EventRangeUpdate + ) -> None: """ Handle worker update event ranges @@ -367,7 +432,9 @@ def handle_update_job(self, actor_id: str, data: Any) -> None: """ self.enqueue_actor_call(actor_id, self[actor_id].get_message.remote()) - def handle_request_event_ranges(self, actor_id: str, data: EventRangeRequest, total_sent: int) -> int: + def handle_request_event_ranges( + self, actor_id: str, data: EventRangeRequest, total_sent: int + ) -> int: """ Handle event ranges request. 
Event ranges are distributed evenly amongst workers, the number of events returned in a single request is capped to the number of local events @@ -388,10 +455,11 @@ def handle_request_event_ranges(self, actor_id: str, data: EventRangeRequest, to panda_id = self.bookKeeper.get_actor_job(actor_id) # get the min between requested ranges and what is available for each actor - n_ranges = min(data[panda_id]['nRanges'], self.available_events_per_actor) + n_ranges = min( + data[panda_id]["nRanges"], self.available_events_per_actor + ) - evt_range = self.bookKeeper.fetch_event_ranges( - actor_id, n_ranges) + evt_range = self.bookKeeper.fetch_event_ranges(actor_id, n_ranges) # did not fetch enough events and harvester might have more, needs to get more events now # while (len(evt_range) < n_ranges and # not self.bookKeeper.is_flagged_no_more_events( @@ -402,9 +470,15 @@ def handle_request_event_ranges(self, actor_id: str, data: EventRangeRequest, to # actor_id, n_ranges) if evt_range: total_sent += len(evt_range) - self.enqueue_actor_call(actor_id, self[actor_id].receive_event_ranges.remote( - Messages.REPLY_OK if evt_range else - Messages.REPLY_NO_MORE_EVENT_RANGES, evt_range)) + self.enqueue_actor_call( + actor_id, + self[actor_id].receive_event_ranges.remote( + Messages.REPLY_OK + if evt_range + else Messages.REPLY_NO_MORE_EVENT_RANGES, + evt_range, + ), + ) self._logger.info(f"Sending {len(evt_range)} events to {actor_id}") return total_sent @@ -424,8 +498,13 @@ def handle_job_request(self, actor_id: str) -> None: # self.request_event_ranges(block=True) # job = self.bookKeeper.assign_job_to_actor(actor_id) - self.enqueue_actor_call(actor_id, self[actor_id].receive_job.remote(Messages.REPLY_OK - if job else Messages.REPLY_NO_MORE_JOBS, self.remote_jobdef_byid[job['PandaID']])) + self.enqueue_actor_call( + actor_id, + self[actor_id].receive_job.remote( + Messages.REPLY_OK if job else Messages.REPLY_NO_MORE_JOBS, + self.remote_jobdef_byid[job["PandaID"]], + ), + ) def request_event_ranges(self, block: bool = False) -> None: """ @@ -451,13 +530,17 @@ def request_event_ranges(self, block: bool = False) -> None: n_available_ranges = self.bookKeeper.n_ready(pandaID) job = self.bookKeeper.jobs[pandaID] if n_available_ranges < self.events_cache_size: - event_request.add_event_request(pandaID, - self.events_cache_size, - job['taskID'], - job['jobsetID']) + event_request.add_event_request( + pandaID, + self.events_cache_size, + job["taskID"], + job["jobsetID"], + ) if len(event_request) > 0: - self._logger.debug(f"Sending event ranges request to harvester for {self.events_cache_size} events") + self._logger.debug( + f"Sending event ranges request to harvester for {self.events_cache_size} events" + ) self.requests_queue.put(event_request) self.n_eventsrequest += 1 @@ -467,13 +550,17 @@ def request_event_ranges(self, block: bool = False) -> None: n_received_events = 0 for pandaID, ranges_list in ranges.items(): n_received_events += len(ranges_list) - self._logger.debug(f"got event ranges for job {pandaID}: {len(ranges_list)}") + self._logger.debug( + f"got event ranges for job {pandaID}: {len(ranges_list)}" + ) if self.first_event_range_request: self.first_event_range_request = False if n_received_events == 0: self.stop() self.bookKeeper.add_event_ranges(ranges) - self.available_events_per_actor = max(1, ceil(self.bookKeeper.n_ready(pandaID) / self.n_actors)) + self.available_events_per_actor = max( + 1, ceil(self.bookKeeper.n_ready(pandaID) / self.n_actors) + ) self.n_eventsrequest -= 1 except Empty: pass 
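The reformatted hunks above leave the even-sharing policy untouched: after each harvester reply the driver recomputes available_events_per_actor as ceil(n_ready / n_actors) (never below 1), and each worker request is then capped to that share. A minimal sketch of that arithmetic, using illustrative stand-in names (even_share, cap_request) rather than the driver's real attributes:

    from math import ceil

    def even_share(n_ready: int, n_actors: int) -> int:
        # Each actor is offered an equal share of the ready events, at least 1.
        return max(1, ceil(n_ready / n_actors))

    def cap_request(requested: int, share: int) -> int:
        # A single reply never carries more ranges than the actor's share,
        # regardless of how many ranges it asked for.
        return min(requested, share)

    # Example: 1000 ready events across 64 actors -> a share of 16 per actor,
    # so an actor asking for 64 ranges is sent only 16 in this reply.
    share = even_share(1000, 64)
    assert share == 16
    assert cap_request(64, share) == 16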
@@ -506,8 +593,11 @@ def cleanup(self) -> None: ray.get(handles) def setup_dirs(self): - self.output_dir = os.path.join(os.path.expandvars(self.config.ray.get("taskprogressbasedir")), str(self.panda_taskid)) - with open(self.task_workdir_path_file, 'w') as f: + self.output_dir = os.path.join( + os.path.expandvars(self.config.ray.get("taskprogressbasedir")), + str(self.panda_taskid), + ) + with open(self.task_workdir_path_file, "w") as f: f.write(self.output_dir) self.config.ray["outputdir"] = self.output_dir @@ -547,7 +637,9 @@ def run(self) -> None: # gets initial jobs and send an eventranges request for each jobs jobs = self.jobs_queue.get() if not jobs: - self._logger.critical("No jobs provided by communicator, stopping...") + self._logger.critical( + "No jobs provided by communicator, stopping..." + ) return if len(jobs) > 1: self._logger.critical("Raythena can only handle one job") @@ -568,7 +660,9 @@ def run(self) -> None: elif self.cmt_config: self.the_platform = self.cmt_config else: - self._logger.warning(f"No container or CmtConfig found, using default platform {self.the_platform}") + self._logger.warning( + f"No container or CmtConfig found, using default platform {self.the_platform}" + ) self.cmt_config = job["cmtConfig"] = self.the_platform self.setup_dirs() self._logger.debug("Adding job and generating event ranges...") @@ -581,13 +675,17 @@ def run(self) -> None: self.bookKeeper.stop_cleaner_thread() self.bookKeeper.stop_saver_thread() self.communicator.stop() - self._logger.critical("Couldn't fetch a job with event ranges, stopping...") + self._logger.critical( + "Couldn't fetch a job with event ranges, stopping..." + ) return job_id = self.bookKeeper.jobs.next_job_id_to_process() total_events = self.bookKeeper.n_ready(job_id) - os.makedirs(os.path.join(self.config.ray['workdir'], job_id)) + os.makedirs(os.path.join(self.config.ray["workdir"], job_id)) if total_events: - self.available_events_per_actor = max(1, ceil(total_events / self.n_actors)) + self.available_events_per_actor = max( + 1, ceil(total_events / self.n_actors) + ) for pandaID in self.bookKeeper.jobs: cjob = self.bookKeeper.jobs[pandaID] self.remote_jobdef_byid[pandaID] = ray.put(cjob) @@ -600,16 +698,22 @@ def run(self) -> None: self.handle_actors() except Exception as e: self._logger.error(f"{traceback.format_exc()}") - self._logger.error(f"Error while handling actors: {e}. stopping...") + self._logger.error( + f"Error while handling actors: {e}. stopping..." + ) - if self.config.logging.get('copyraylogs', False): + if self.config.logging.get("copyraylogs", False): ray_logs = os.path.join(self.workdir, "ray_logs") try: shutil.copytree(self.session_log_dir, ray_logs) except Exception as e: - self._logger.error(f"Failed to copy ray logs to workdir: {e}") + self._logger.error( + f"Failed to copy ray logs to workdir: {e}" + ) else: - self._logger.info("No events to process, check for remaining merge jobs...") + self._logger.info( + "No events to process, check for remaining merge jobs..." 
+ ) self._logger.debug("Waiting on merge transforms") # Workers might have sent event ranges update since last check, create possible merge jobs self.bookKeeper.stop_saver_thread() @@ -631,7 +735,11 @@ def run(self) -> None: # need to explicitely save as we stopped saver_thread self.bookKeeper.save_status() task_status = self.bookKeeper.taskstatus.get(self.panda_taskid, None) - if task_status and task_status.get_nmerged() + task_status.get_nfailed() == task_status.total_events(): + if ( + task_status + and task_status.get_nmerged() + task_status.get_nfailed() + == task_status.total_events() + ): assert job_id output_map = self.bookKeeper.remap_output_files(job_id) self.rename_output_files(output_map) @@ -650,11 +758,18 @@ def rename_output_files(self, output_map: Dict[str, str]): new_filename = output_map[file] except KeyError: # read the commit log to recover the correct name. If we get another KeyError, we can't recover - new_filename = output_map.get(self.bookKeeper.recover_outputfile_name(file)) + new_filename = output_map.get( + self.bookKeeper.recover_outputfile_name(file) + ) if not new_filename: - self._logger.warning(f"Couldn't find new name for {file}, will not be staged out correctly") + self._logger.warning( + f"Couldn't find new name for {file}, will not be staged out correctly" + ) continue - os.rename(os.path.join(self.merged_files_dir, file), os.path.join(self.merged_files_dir, new_filename)) + os.rename( + os.path.join(self.merged_files_dir, file), + os.path.join(self.merged_files_dir, new_filename), + ) def produce_final_report(self, output_map: Dict[str, str]): """ @@ -676,30 +791,40 @@ def produce_final_report(self, output_map: Dict[str, str]): new_filename = output_map[old_filename] except KeyError: # read the commit log to recover the correct name. If we get another KeyError, we can't recover - new_filename = output_map[self.bookKeeper.recover_outputfile_name(old_filename)] + new_filename = output_map[ + self.bookKeeper.recover_outputfile_name(old_filename) + ] output_file_entry["name"] = new_filename - with open(os.path.join(self.job_reports_dir, files[0]), 'w') as f: + with open(os.path.join(self.job_reports_dir, files[0]), "w") as f: json.dump(final_report, f) for file in files[1:]: current_file = os.path.join(self.job_reports_dir, file) with open(current_file) as f: current_report = json.load(f) - final_report_files["input"].append(current_report["files"]["input"][0]) - output_file_entry = current_report["files"]["output"][0]["subFiles"][0] + final_report_files["input"].append( + current_report["files"]["input"][0] + ) + output_file_entry = current_report["files"]["output"][0][ + "subFiles" + ][0] old_filename = output_file_entry["name"] try: new_filename = output_map[old_filename] except KeyError: # read the commit log to recover the correct name. 
If we get another KeyError, we can't recover - new_filename = output_map[self.bookKeeper.recover_outputfile_name(old_filename)] + new_filename = output_map[ + self.bookKeeper.recover_outputfile_name(old_filename) + ] output_file_entry["name"] = new_filename - final_report_files["output"][0]["subFiles"].append(output_file_entry) - with open(current_file, 'w') as f: + final_report_files["output"][0]["subFiles"].append( + output_file_entry + ) + with open(current_file, "w") as f: json.dump(current_report, f) tmp = os.path.join(self.workdir, self.jobreport_name + ".tmp") - with open(tmp, 'w') as f: + with open(tmp, "w") as f: json.dump(final_report, f) shutil.move(tmp, os.path.join(self.workdir, self.jobreport_name)) @@ -736,11 +861,20 @@ def handle_actor_exception(self, actor_id: str, ex: Exception) -> None: self.failed_actor_tasks_count[actor_id] = 0 self.failed_actor_tasks_count[actor_id] += 1 - if self.failed_actor_tasks_count[actor_id] < self.max_retries_error_failed_tasks: - self.enqueue_actor_call(actor_id, self[actor_id].get_message.remote()) - self._logger.warning(f"{actor_id} failed {self.failed_actor_tasks_count[actor_id]} times. Retrying...") + if ( + self.failed_actor_tasks_count[actor_id] + < self.max_retries_error_failed_tasks + ): + self.enqueue_actor_call( + actor_id, self[actor_id].get_message.remote() + ) + self._logger.warning( + f"{actor_id} failed {self.failed_actor_tasks_count[actor_id]} times. Retrying..." + ) else: - self._logger.warning(f"{actor_id} failed too many times. No longer fetching messages from it") + self._logger.warning( + f"{actor_id} failed too many times. No longer fetching messages from it" + ) if actor_id not in self.terminated: self.terminated.append(actor_id) @@ -751,7 +885,9 @@ def get_output_file_guid(self, job_report_file) -> Optional[str]: with open(job_report_file) as f: job_report = json.load(f) try: - guid = job_report["files"]["output"][0]["subFiles"][0]["file_guid"] + guid = job_report["files"]["output"][0]["subFiles"][0][ + "file_guid" + ] except KeyError: guid = None return guid @@ -769,23 +905,41 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: """ new_transforms = False - if self.total_running_merge_transforms < self.max_running_merge_transforms: + if ( + self.total_running_merge_transforms + < self.max_running_merge_transforms + ): self.bookKeeper.check_mergeable_files() merge_files = self.bookKeeper.get_file_to_merge() while merge_files: (output_filename, event_ranges) = merge_files assert len(event_ranges) > 0 - (sub_process, job_report_file) = self.hits_merge_transform([e[0] for e in event_ranges], output_filename) - self._logger.debug(f"Starting merge transform for {output_filename}") - self.running_merge_transforms[output_filename] = (event_ranges, sub_process, job_report_file) + (sub_process, job_report_file) = self.hits_merge_transform( + [e[0] for e in event_ranges], output_filename + ) + self._logger.debug( + f"Starting merge transform for {output_filename}" + ) + self.running_merge_transforms[output_filename] = ( + event_ranges, + sub_process, + job_report_file, + ) self.total_running_merge_transforms += 1 new_transforms = True - if self.total_running_merge_transforms >= self.max_running_merge_transforms: + if ( + self.total_running_merge_transforms + >= self.max_running_merge_transforms + ): break merge_files = self.bookKeeper.get_file_to_merge() to_remove = [] - for output_filename, (event_ranges, sub_process, job_report_file) in self.running_merge_transforms.items(): + for output_filename, ( + 
event_ranges, + sub_process, + job_report_file, + ) in self.running_merge_transforms.items(): if wait_for_completion: while sub_process.poll() is None: time.sleep(5) @@ -793,20 +947,37 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: to_remove.append(output_filename) self.total_running_merge_transforms -= 1 if sub_process.returncode == 0: - self._logger.debug(f"Merge transform for file {output_filename} finished.") + self._logger.debug( + f"Merge transform for file {output_filename} finished." + ) event_ranges_map = {} guid = self.get_output_file_guid(job_report_file) - for (event_range_output, event_range) in event_ranges: - event_ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict(event_range, event_range_output) - self.bookKeeper.report_merged_file(self.panda_taskid, output_filename, event_ranges_map, guid) + for event_range_output, event_range in event_ranges: + event_ranges_map[event_range.eventRangeID] = ( + TaskStatus.build_eventrange_dict( + event_range, event_range_output + ) + ) + self.bookKeeper.report_merged_file( + self.panda_taskid, + output_filename, + event_ranges_map, + guid, + ) else: - self.bookKeeper.report_failed_merge_transform(self.panda_taskid, output_filename) - self._logger.debug(f"Merge transform for {output_filename} failed with return code {sub_process.returncode}") + self.bookKeeper.report_failed_merge_transform( + self.panda_taskid, output_filename + ) + self._logger.debug( + f"Merge transform for {output_filename} failed with return code {sub_process.returncode}" + ) for o in to_remove: del self.running_merge_transforms[o] return new_transforms - def hits_merge_transform(self, input_files: Iterable[str], output_file: str) -> Tuple[Popen, str]: + def hits_merge_transform( + self, input_files: Iterable[str], output_file: str + ) -> Tuple[Popen, str]: """ Prepare the shell command for the merging subprocess and starts it. 
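The completion loop above drains finished merge subprocesses: a non-blocking poll() per transform (or a blocking wait when wait_for_completion is set), then report_merged_file with a {eventRangeID: build_eventrange_dict(...)} map on success, or report_failed_merge_transform on a non-zero return code. A minimal, self-contained sketch of that poll-and-report pattern; the true/false commands (assuming a POSIX system) and the print-based reporting are stand-ins, not the driver's real bookkeeper API:

    import time
    from subprocess import DEVNULL, Popen

    # Hypothetical running merge transforms: output file name -> subprocess handle.
    running = {
        "HITS.merged.0.pool.root": Popen(["true"], stdout=DEVNULL, stderr=DEVNULL),
        "HITS.merged.1.pool.root": Popen(["false"], stdout=DEVNULL, stderr=DEVNULL),
    }

    while running:
        finished = []
        for output_file, proc in running.items():
            if proc.poll() is None:      # still running, check again on the next tick
                continue
            finished.append(output_file)
            if proc.returncode == 0:
                print(f"{output_file}: merge succeeded, would be reported as merged")
            else:
                print(f"{output_file}: merge failed (rc={proc.returncode}), would be requeued")
        for output_file in finished:     # never mutate the dict while iterating over it
            del running[output_file]
        if running:
            time.sleep(1)                # the driver also sleeps between polls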
@@ -821,47 +992,94 @@ def hits_merge_transform(self, input_files: Iterable[str], output_file: str) -> return tmp_dir = tempfile.mkdtemp() file_list = "\n".join(input_files) - job_report_name = os.path.join(self.job_reports_dir, output_file) + ".json" + job_report_name = ( + os.path.join(self.job_reports_dir, output_file) + ".json" + ) output_file = os.path.join(self.merged_files_dir, output_file) file_list_path = os.path.join(tmp_dir, "file_list.txt") - with open(file_list_path, 'w') as f: + with open(file_list_path, "w") as f: f.write(file_list) - transform_params = re.sub(r"@inputFor_\$\{OUTPUT0\}", f"@/srv/{os.path.basename(file_list_path)}", self.merge_transform_params) - transform_params = re.sub(r"--inputHitsFile=", "--inputHitsFile ", transform_params) - transform_params = re.sub(r"--inputHITSFile=", "--inputHITSFile ", transform_params) - transform_params = re.sub(r"\$\{OUTPUT0\}", output_file, transform_params, count=1) - transform_params = re.sub(r"--autoConfiguration=everything", "", transform_params) + transform_params = re.sub( + r"@inputFor_\$\{OUTPUT0\}", + f"@/srv/{os.path.basename(file_list_path)}", + self.merge_transform_params, + ) + transform_params = re.sub( + r"--inputHitsFile=", "--inputHitsFile ", transform_params + ) + transform_params = re.sub( + r"--inputHITSFile=", "--inputHITSFile ", transform_params + ) + transform_params = re.sub( + r"\$\{OUTPUT0\}", output_file, transform_params, count=1 + ) + transform_params = re.sub( + r"--autoConfiguration=everything", "", transform_params + ) transform_params = re.sub(r"--DBRelease=current", "", transform_params) - endtoken = "" if self.config.payload['containerextrasetup'].strip().endswith(";") else ";" + endtoken = ( + "" + if self.config.payload["containerextrasetup"].strip().endswith(";") + else ";" + ) container_script = f"{self.config.payload['containerextrasetup']}{endtoken}{self.merge_transform} {transform_params}" merge_script_path = os.path.join(tmp_dir, "merge_transform.sh") - with open(merge_script_path, 'w') as f: + with open(merge_script_path, "w") as f: f.write(container_script) - os.chmod(merge_script_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH) + os.chmod( + merge_script_path, + stat.S_IRUSR + | stat.S_IWUSR + | stat.S_IXUSR + | stat.S_IRGRP + | stat.S_IXGRP + | stat.S_IROTH + | stat.S_IXOTH, + ) setup_script_path = os.path.join(tmp_dir, "release_setup.sh") - setup_script = f"asetup Athena,{self.release},notest --platform {self.cmt_config} --makeflags=\'$MAKEFLAGS\'" + setup_script = f"asetup Athena,{self.release},notest --platform {self.cmt_config} --makeflags='$MAKEFLAGS'" self._logger.debug(f"Setting up release with: {setup_script}") - with open(setup_script_path, 'w') as f: + with open(setup_script_path, "w") as f: f.write(setup_script) - os.chmod(setup_script_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH) + os.chmod( + setup_script_path, + stat.S_IRUSR + | stat.S_IWUSR + | stat.S_IXUSR + | stat.S_IRGRP + | stat.S_IXGRP + | stat.S_IROTH + | stat.S_IXOTH, + ) cmd = "" cmd += "export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase;" - cmd += f"export thePlatform=\"{self.the_platform}\";" - endtoken = "" if self.config.payload['containerextraargs'].strip().endswith(";") else ";" - cmd += (f"{self.config.payload['containerextraargs']}{endtoken}" - f"source ${{ATLAS_LOCAL_ROOT_BASE}}/user/atlasLocalSetup.sh --swtype {self.config.payload['containerengine']}" - 
f" -c $thePlatform -s /srv/release_setup.sh -r /srv/merge_transform.sh -e \"{self.container_options}\";" - f"RETURN_VAL=$?;if [ \"$RETURN_VAL\" -eq 0 ]; then cp jobReport.json {job_report_name};fi;exit $RETURN_VAL;") - return (Popen(cmd, - stdin=DEVNULL, - stdout=DEVNULL, - stderr=DEVNULL, - shell=True, - cwd=tmp_dir, - close_fds=True), job_report_name) + cmd += f'export thePlatform="{self.the_platform}";' + endtoken = ( + "" + if self.config.payload["containerextraargs"].strip().endswith(";") + else ";" + ) + cmd += ( + f"{self.config.payload['containerextraargs']}{endtoken}" + f"source ${{ATLAS_LOCAL_ROOT_BASE}}/user/atlasLocalSetup.sh --swtype {self.config.payload['containerengine']}" + f" -c $thePlatform -s /srv/release_setup.sh -r /srv/merge_transform.sh -e \"{self.container_options}\";" + f"RETURN_VAL=$?;if [ \"$RETURN_VAL\" -eq 0 ]; then cp jobReport.json {job_report_name};fi;exit $RETURN_VAL;" + ) + return ( + Popen( + cmd, + stdin=DEVNULL, + stdout=DEVNULL, + stderr=DEVNULL, + shell=True, + cwd=tmp_dir, + close_fds=True, + ), + job_report_name, + ) diff --git a/src/raythena/scripts/raythena.py b/src/raythena/scripts/raythena.py index a75a1cb..229072e 100755 --- a/src/raythena/scripts/raythena.py +++ b/src/raythena/scripts/raythena.py @@ -13,43 +13,24 @@ @click.command() +@click.option("--config", required=True, help="raythena configuration file.") +@click.option("-d", "--debug", is_flag=True, help="Debug log level") +@click.option("--ray-head-ip", help="IP address of ray head node") @click.option( - '--config', - required=True, - help='raythena configuration file.' + "--ray-redis-port", help="Port of redis instance used by the ray cluster" ) @click.option( - '-d', '--debug', - is_flag=True, - help='Debug log level' + "--ray-redis-password", help="Redis password setup in the ray cluster" ) +@click.option("--ray-workdir", help="Workdirectory for ray actors") @click.option( - '--ray-head-ip', - help='IP address of ray head node' + "--harvester-endpoint", + help="Directory to use to communicate with harvester", ) +@click.option("--panda-queue", help="Panda queue provided to the payload") @click.option( - '--ray-redis-port', - help='Port of redis instance used by the ray cluster' -) -@click.option( - '--ray-redis-password', - help='Redis password setup in the ray cluster' -) -@click.option( - '--ray-workdir', - help='Workdirectory for ray actors' -) -@click.option( - '--harvester-endpoint', - help='Directory to use to communicate with harvester' -) -@click.option( - '--panda-queue', - help='Panda queue provided to the payload' -) -@click.option( - '--core-per-node', - help='Used to determine how many events should be buffered by ray actors' + "--core-per-node", + help="Used to determine how many events should be buffered by ray actors", ) def cli(*args, **kwargs): """ @@ -59,18 +40,28 @@ def cli(*args, **kwargs): Returns: None """ - config = Config(kwargs['config'], *args, **kwargs) + config = Config(kwargs["config"], *args, **kwargs) cluster_config = setup_ray(config) try: - driver = ESDriver(config, cluster_config['session_dir']) + driver = ESDriver(config, cluster_config["session_dir"]) signal.signal(signal.SIGINT, functools.partial(cleanup, config, driver)) - signal.signal(signal.SIGTERM, functools.partial(cleanup, config, driver)) - signal.signal(signal.SIGQUIT, functools.partial(cleanup, config, driver)) - signal.signal(signal.SIGSEGV, functools.partial(cleanup, config, driver)) - signal.signal(signal.SIGXCPU, functools.partial(cleanup, config, driver)) - 
signal.signal(signal.SIGUSR1, functools.partial(cleanup, config, driver)) + signal.signal( + signal.SIGTERM, functools.partial(cleanup, config, driver) + ) + signal.signal( + signal.SIGQUIT, functools.partial(cleanup, config, driver) + ) + signal.signal( + signal.SIGSEGV, functools.partial(cleanup, config, driver) + ) + signal.signal( + signal.SIGXCPU, functools.partial(cleanup, config, driver) + ) + signal.signal( + signal.SIGUSR1, functools.partial(cleanup, config, driver) + ) signal.signal(signal.SIGBUS, functools.partial(cleanup, config, driver)) driver.run() except Exception as e: @@ -81,7 +72,12 @@ def cli(*args, **kwargs): shutdown_ray(config) -def cleanup(config: Config, driver: BaseDriver, signum: signal.Signals, frame: types.FrameType) -> None: +def cleanup( + config: Config, + driver: BaseDriver, + signum: signal.Signals, + frame: types.FrameType, +) -> None: """ Signal handler, notify the ray driver to stop @@ -98,7 +94,7 @@ def cleanup(config: Config, driver: BaseDriver, signum: signal.Signals, frame: t def main(): - cli(auto_envvar_prefix='RAYTHENA') + cli(auto_envvar_prefix="RAYTHENA") if __name__ == "__main__": diff --git a/src/raythena/utils/bookkeeper.py b/src/raythena/utils/bookkeeper.py index 337f55a..d4fcefb 100644 --- a/src/raythena/utils/bookkeeper.py +++ b/src/raythena/utils/bookkeeper.py @@ -55,7 +55,9 @@ class TaskStatus: MERGING = "merging" FAILED = "failed" - def __init__(self, job: PandaJob, merged_files_dir: str, config: Config) -> None: + def __init__( + self, job: PandaJob, merged_files_dir: str, config: Config + ) -> None: self.config = config self.job = job self._logger = make_logger(self.config, "TaskStatus") @@ -63,16 +65,25 @@ def __init__(self, job: PandaJob, merged_files_dir: str, config: Config) -> None self.merged_files_dir = merged_files_dir self.filepath = os.path.join(self.output_dir, "state.json") self.tmpfilepath = f"{self.filepath}.tmp" - self._events_per_file = int(job['nEventsPerInputFile']) - self._nfiles = len(job['inFiles'].split(",")) + self._events_per_file = int(job["nEventsPerInputFile"]) + self._nfiles = len(job["inFiles"].split(",")) self._nevents = self._events_per_file * self._nfiles - self._hits_per_file = int(job['esmergeSpec']['nEventsPerOutputFile']) - assert (self._events_per_file % self._hits_per_file == 0) or ( - self._hits_per_file % self._events_per_file == 0), "Expected number of events per input file to be a multiple of number of hits per merged file" + self._hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) + assert ( + (self._events_per_file % self._hits_per_file == 0) + or (self._hits_per_file % self._events_per_file == 0) + ), "Expected number of events per input file to be a multiple of number of hits per merged file" # if _hits_per_file > _events_per_file, each input file has a single output file - self._n_output_per_input_file = max(1, self._events_per_file // self._hits_per_file) - self._status: Dict[str, Union[Dict[str, Dict[str, Dict[str, str]]], Dict[str, List[str]]]] = dict() - self._update_queue: Deque[Tuple[str, Union[EventRange, Tuple]]] = collections.deque() + self._n_output_per_input_file = max( + 1, self._events_per_file // self._hits_per_file + ) + self._status: Dict[ + str, + Union[Dict[str, Dict[str, Dict[str, str]]], Dict[str, List[str]]], + ] = dict() + self._update_queue: Deque[Tuple[str, Union[EventRange, Tuple]]] = ( + collections.deque() + ) self._restore_status() def _default_init_status(self): @@ -104,7 +115,9 @@ def _restore_status(self): self._status = json.load(f) except 
OSError as e: # failed to load status, try to read from a possible tmp file if it exists and not already done - if filename != self.tmpfilepath and os.path.isfile(self.tmpfilepath): + if filename != self.tmpfilepath and os.path.isfile( + self.tmpfilepath + ): try: with open(self.tmpfilepath) as f: self._status = json.load(f) @@ -113,7 +126,7 @@ def _restore_status(self): self._logger.error(ee.strerror) self._default_init_status() - def save_status(self, write_to_tmp=True, force_update = False): + def save_status(self, write_to_tmp=True, force_update=False): """ Save the current status to a json file. Before saving to file, the update queue will be drained, actually carrying out the operations to the dictionary that will be written to file. @@ -140,7 +153,7 @@ def save_status(self, write_to_tmp=True, force_update = False): if write_to_tmp: filename = self.tmpfilepath try: - with open(filename, 'w') as f: + with open(filename, "w") as f: json.dump(self._status, f) if write_to_tmp: @@ -155,7 +168,9 @@ def is_stale(self) -> bool: return len(self._update_queue) > 0 @staticmethod - def build_eventrange_dict(eventrange: EventRange, output_file: str = None) -> Dict[str, Any]: + def build_eventrange_dict( + eventrange: EventRange, output_file: str = None + ) -> Dict[str, Any]: """ Takes an EventRange object and retuns the dict representation which should be saved in the state file @@ -164,12 +179,18 @@ def build_eventrange_dict(eventrange: EventRange, output_file: str = None) -> Di Returns: The dictionnary to serialize """ - res = {"eventRangeID": eventrange.eventRangeID, "startEvent": eventrange.startEvent, "lastEvent": eventrange.lastEvent} + res = { + "eventRangeID": eventrange.eventRangeID, + "startEvent": eventrange.startEvent, + "lastEvent": eventrange.lastEvent, + } if output_file: res["path"] = output_file return res - def set_eventrange_simulated(self, eventrange: EventRange, simulation_output_file: str): + def set_eventrange_simulated( + self, eventrange: EventRange, simulation_output_file: str + ): """ Enqueue a message indicating that an event range has been simulated @@ -177,9 +198,13 @@ def set_eventrange_simulated(self, eventrange: EventRange, simulation_output_fil eventrange: the event range simulation_output_file: produced file """ - self._update_queue.append((TaskStatus.SIMULATED, (eventrange, simulation_output_file))) + self._update_queue.append( + (TaskStatus.SIMULATED, (eventrange, simulation_output_file)) + ) - def _set_eventrange_simulated(self, eventrange: EventRange, simulation_output_file: str): + def _set_eventrange_simulated( + self, eventrange: EventRange, simulation_output_file: str + ): """ Performs the update of the internal dictionnary of a simulated event range @@ -191,9 +216,17 @@ def _set_eventrange_simulated(self, eventrange: EventRange, simulation_output_fi simulated_dict = self._status[TaskStatus.SIMULATED] if filename not in simulated_dict: simulated_dict[filename] = dict() - simulated_dict[filename][eventrange.eventRangeID] = TaskStatus.build_eventrange_dict(eventrange, simulation_output_file) - - def set_file_merged(self, input_files: List[str], outputfile: str, event_ranges: Mapping[str, Mapping[str, str]], guid: Optional[str]): + simulated_dict[filename][eventrange.eventRangeID] = ( + TaskStatus.build_eventrange_dict(eventrange, simulation_output_file) + ) + + def set_file_merged( + self, + input_files: List[str], + outputfile: str, + event_ranges: Mapping[str, Mapping[str, str]], + guid: Optional[str], + ): """ Enqueue a message indicating that a file 
has been merged. @@ -202,9 +235,17 @@ def set_file_merged(self, input_files: List[str], outputfile: str, event_ranges: outputfile: produced merged hits file event_ranges: event ranges merged in the outputfile. Map of [event_range_id, [k, v]] """ - self._update_queue.append((TaskStatus.MERGING, (input_files, outputfile, event_ranges, guid))) + self._update_queue.append( + (TaskStatus.MERGING, (input_files, outputfile, event_ranges, guid)) + ) - def _set_file_merged(self, input_files: List[str], outputfile: str, event_ranges: Mapping[str, Mapping[str, str]], guid: Optional[str]): + def _set_file_merged( + self, + input_files: List[str], + outputfile: str, + event_ranges: Mapping[str, Mapping[str, str]], + guid: Optional[str], + ): """ Performs the update of the internal dictionnary of a merged file. @@ -218,23 +259,41 @@ def _set_file_merged(self, input_files: List[str], outputfile: str, event_ranges for file in input_files: if file in failed_dict: total_failed += len(failed_dict[file]) - assert len(event_ranges) + total_failed == self._hits_per_file, f"Expected {self._hits_per_file} hits in {outputfile}, got {len(event_ranges)}" + assert ( + len(event_ranges) + total_failed == self._hits_per_file + ), f"Expected {self._hits_per_file} hits in {outputfile}, got {len(event_ranges)}" for inputfile in input_files: if inputfile not in self._status[TaskStatus.MERGING]: - self._status[TaskStatus.MERGING][inputfile] = {outputfile: event_ranges} + self._status[TaskStatus.MERGING][inputfile] = { + outputfile: event_ranges + } else: - self._status[TaskStatus.MERGING][inputfile][outputfile] = event_ranges - - if len(self._status[TaskStatus.MERGING][inputfile]) == self._n_output_per_input_file: + self._status[TaskStatus.MERGING][inputfile][outputfile] = ( + event_ranges + ) + + if ( + len(self._status[TaskStatus.MERGING][inputfile]) + == self._n_output_per_input_file + ): merged_dict = dict() self._status[TaskStatus.MERGED][inputfile] = merged_dict - for merged_outputfile in self._status[TaskStatus.MERGING][inputfile].keys(): - merged_dict[merged_outputfile] = {"path": os.path.join(self.merged_files_dir, merged_outputfile), "guid": guid if guid else ""} + for merged_outputfile in self._status[TaskStatus.MERGING][ + inputfile + ].keys(): + merged_dict[merged_outputfile] = { + "path": os.path.join( + self.merged_files_dir, merged_outputfile + ), + "guid": guid if guid else "", + } del self._status[TaskStatus.MERGING][inputfile] del self._status[TaskStatus.SIMULATED][inputfile] else: for event_range_id in event_ranges: - del self._status[TaskStatus.SIMULATED][inputfile][event_range_id] + del self._status[TaskStatus.SIMULATED][inputfile][ + event_range_id + ] def set_eventrange_failed(self, eventrange: EventRange): """ @@ -256,8 +315,12 @@ def _set_eventrange_failed(self, eventrange: EventRange): failed_dict = self._status[TaskStatus.FAILED] if filename not in failed_dict: failed_dict[filename] = dict() - failed_dict[filename][eventrange.eventRangeID] = TaskStatus.build_eventrange_dict(eventrange) - if eventrange.eventRangeID in self._status[TaskStatus.SIMULATED].get(filename, {}): + failed_dict[filename][eventrange.eventRangeID] = ( + TaskStatus.build_eventrange_dict(eventrange) + ) + if eventrange.eventRangeID in self._status[TaskStatus.SIMULATED].get( + filename, {} + ): del self._status[TaskStatus.SIMULATED][eventrange.eventRangeID] def get_nsimulated(self, filename=None) -> int: @@ -275,11 +338,24 @@ def get_nsimulated(self, filename=None) -> int: if filename in self._status[TaskStatus.MERGED]: return 
merged elif filename in self._status[TaskStatus.MERGING]: - merged = len(self._status[TaskStatus.MERGING][filename]) * self._hits_per_file - return len(self._status[TaskStatus.SIMULATED].get(filename, [])) - merged - - return reduce(lambda acc, cur: acc + len(cur), self._status[TaskStatus.SIMULATED].values(), 0) - \ - reduce(lambda acc, cur: acc + len(cur) * self._hits_per_file, self._status[TaskStatus.MERGING].values(), 0) + merged = ( + len(self._status[TaskStatus.MERGING][filename]) + * self._hits_per_file + ) + return ( + len(self._status[TaskStatus.SIMULATED].get(filename, [])) + - merged + ) + + return reduce( + lambda acc, cur: acc + len(cur), + self._status[TaskStatus.SIMULATED].values(), + 0, + ) - reduce( + lambda acc, cur: acc + len(cur) * self._hits_per_file, + self._status[TaskStatus.MERGING].values(), + 0, + ) def get_nfailed(self, filename=None) -> int: """ @@ -293,7 +369,11 @@ def get_nfailed(self, filename=None) -> int: """ if filename: return len(self._status[TaskStatus.FAILED].get(filename, [])) - return reduce(lambda acc, cur: acc + len(cur), self._status[TaskStatus.FAILED].values(), 0) + return reduce( + lambda acc, cur: acc + len(cur), + self._status[TaskStatus.FAILED].values(), + 0, + ) def get_nmerged(self, filename=None) -> int: """ @@ -309,9 +389,17 @@ def get_nmerged(self, filename=None) -> int: if filename in self._status[TaskStatus.MERGED]: return self._events_per_file elif filename in self._status[TaskStatus.MERGING]: - return len(self._status[TaskStatus.MERGING][filename]) * self._hits_per_file - return len(self._status[TaskStatus.MERGED]) * self._events_per_file + \ - reduce(lambda acc, cur: acc + len(cur) * self._hits_per_file, self._status[TaskStatus.MERGING].values(), 0) + return ( + len(self._status[TaskStatus.MERGING][filename]) + * self._hits_per_file + ) + return len( + self._status[TaskStatus.MERGED] + ) * self._events_per_file + reduce( + lambda acc, cur: acc + len(cur) * self._hits_per_file, + self._status[TaskStatus.MERGING].values(), + 0, + ) def total_events(self) -> int: """ @@ -335,14 +423,20 @@ def __init__(self, config: Config) -> None: self.actors: Dict[str, Optional[str]] = dict() self.rangesID_by_actor: Dict[str, Set[str]] = dict() #  Output files for which we are ready to launch a merge transform - self.files_ready_to_merge: Dict[str, List[Tuple[str, EventRange]]] = dict() + self.files_ready_to_merge: Dict[str, List[Tuple[str, EventRange]]] = ( + dict() + ) # Event ranges for a given input file which have been simulated and a ready to be merged self.ranges_to_merge: Dict[str, List[Tuple[str, EventRange]]] = dict() # Accumulate event ranges of different input files into the same output file until we have enough to produce a merged file # Only used when multiple input files are merged in a single output (n-1) to pool input files together - self.output_merge_queue: Dict[str, List[Tuple[str, EventRange]]] = dict() + self.output_merge_queue: Dict[str, List[Tuple[str, EventRange]]] = ( + dict() + ) # Keep tracks of merge job definition that have been distributed to the driver for which we expect an update - self.ditributed_merge_tasks: Dict[str, List[Tuple[str, EventRange]]] = dict() + self.ditributed_merge_tasks: Dict[str, List[Tuple[str, EventRange]]] = ( + dict() + ) self.files_guids: Dict[str, str] = dict() self.last_status_print = time.time() self.taskstatus: Dict[str, TaskStatus] = dict() @@ -350,8 +444,12 @@ def __init__(self, config: Config) -> None: self._output_input_mapping: Dict[str, List[str]] = dict() self.stop_saver = 
threading.Event() self.stop_cleaner = threading.Event() - self.save_state_thread = ExThread(target=self._saver_thead_run, name="status-saver-thread") - self.cleaner_thread = ExThread(target=self._cleaner_thead_run, name="cleaner-thread") + self.save_state_thread = ExThread( + target=self._saver_thead_run, name="status-saver-thread" + ) + self.cleaner_thread = ExThread( + target=self._cleaner_thead_run, name="cleaner-thread" + ) def _cleaner_thead_run(self): """ @@ -362,14 +460,18 @@ def _cleaner_thead_run(self): if os.path.isdir(self.output_dir): files = set(os.listdir(self.output_dir)) for task_status in self.taskstatus.values(): - for merged_file in task_status._status[TaskStatus.MERGED].keys(): + for merged_file in task_status._status[ + TaskStatus.MERGED + ].keys(): if self.stop_cleaner.is_set(): break for temp_file in files: if self.stop_cleaner.is_set(): break if merged_file in temp_file: - os.remove(os.path.join(self.output_dir, temp_file)) + os.remove( + os.path.join(self.output_dir, temp_file) + ) removed.add(temp_file) files -= removed removed.clear() @@ -378,7 +480,6 @@ def _cleaner_thead_run(self): self.stop_cleaner.wait(60) def _saver_thead_run(self): - while not self.stop_saver.is_set(): self.save_status() # wait for 60s before next update or until the stop condition is met @@ -404,8 +505,8 @@ def check_mergeable_files(self): def _check_mergeable_files_1_n(self): for input_file, event_ranges in self.ranges_to_merge.items(): while len(event_ranges) >= self._hits_per_file: - ranges_to_merge = event_ranges[-self._hits_per_file:] - del event_ranges[-self._hits_per_file:] + ranges_to_merge = event_ranges[-self._hits_per_file :] + del event_ranges[-self._hits_per_file :] output_file = self._input_output_mapping[input_file].pop() self.files_ready_to_merge[output_file] = ranges_to_merge @@ -419,21 +520,30 @@ def _check_mergeable_files_n_1(self): self.output_merge_queue[output_filename] = [] self.output_merge_queue[output_filename].extend(event_ranges) event_ranges.clear() - if len(self.output_merge_queue[output_filename]) == self._hits_per_file: - self.files_ready_to_merge[output_filename] = self.output_merge_queue[output_filename] + if ( + len(self.output_merge_queue[output_filename]) + == self._hits_per_file + ): + self.files_ready_to_merge[output_filename] = ( + self.output_merge_queue[output_filename] + ) del self.output_merge_queue[output_filename] def stop_saver_thread(self): if self.save_state_thread.is_alive(): self.stop_saver.set() self.save_state_thread.join_with_ex() - self.save_state_thread = ExThread(target=self._saver_thead_run, name="status-saver-thread") + self.save_state_thread = ExThread( + target=self._saver_thead_run, name="status-saver-thread" + ) def stop_cleaner_thread(self): if self.cleaner_thread.is_alive(): self.stop_cleaner.set() self.cleaner_thread.join_with_ex() - self.cleaner_thread = ExThread(target=self._cleaner_thead_run, name="cleaner-thread") + self.cleaner_thread = ExThread( + target=self._cleaner_thead_run, name="cleaner-thread" + ) def start_threads(self): """ @@ -467,7 +577,7 @@ def add_jobs(self, jobs: Mapping[str, JobDef], start_threads=True) -> None: assert self.output_dir assert self.merged_files_dir ts = TaskStatus(job, self.merged_files_dir, self.config) - self.taskstatus[job['taskID']] = ts + self.taskstatus[job["taskID"]] = ts self.commitlog = os.path.join(self.output_dir, "commit_log") self._generate_input_output_mapping(job) self._generate_event_ranges(job, ts) @@ -479,10 +589,12 @@ def _generate_input_output_mapping(self, job: 
PandaJob): Goes through the list of input and ouput file names and matches expected output files for a given input file """ # Filter out potential log files, only interested in HITS files - output_files = [e for e in job["outFiles"].split(',') if e.startswith("HITS")] - input_files = job["inFiles"].split(',') - events_per_file = int(job['nEventsPerInputFile']) - hits_per_file = int(job['esmergeSpec']['nEventsPerOutputFile']) + output_files = [ + e for e in job["outFiles"].split(",") if e.startswith("HITS") + ] + input_files = job["inFiles"].split(",") + events_per_file = int(job["nEventsPerInputFile"]) + hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) input_output_mapping = dict() output_input_mapping = dict() @@ -502,9 +614,11 @@ def _generate_input_output_mapping(self, job: PandaJob): assert events_per_file % hits_per_file == 0 n = events_per_file // hits_per_file assert len(input_files) * n == len(output_files) - for i, j in zip(range(len(input_files)), range(0, len(output_files), n)): - input_output_mapping[input_files[i]] = output_files[j:(j + n)] - for output_file in output_files[j:(j + n)]: + for i, j in zip( + range(len(input_files)), range(0, len(output_files), n) + ): + input_output_mapping[input_files[i]] = output_files[j : (j + n)] + for output_file in output_files[j : (j + n)]: output_input_mapping[output_file] = [input_files[i]] self._input_output_mapping = input_output_mapping self._output_input_mapping = output_input_mapping @@ -524,14 +638,19 @@ def remap_output_files(self, panda_id: str) -> Dict[str, str]: merged_files = task_status._status[TaskStatus.MERGED] previous_to_current_output_lookup: Dict[str, str] = dict() - with open(self.commitlog, 'a') as f: + with open(self.commitlog, "a") as f: for input_file, output_files in self._input_output_mapping.items(): merged_output_files = merged_files[input_file] assert isinstance(merged_output_files, dict) assert len(merged_output_files) == len(output_files) - for merged_file, new_file in zip(merged_output_files, output_files): + for merged_file, new_file in zip( + merged_output_files, output_files + ): if merged_file in previous_to_current_output_lookup: - assert new_file == previous_to_current_output_lookup[merged_file] + assert ( + new_file + == previous_to_current_output_lookup[merged_file] + ) continue previous_to_current_output_lookup[merged_file] = new_file f.write(f"rename_output {merged_file} {new_file}\n") @@ -581,13 +700,13 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): job: the job to which the generated event ranges will be assigned task_status: current status of the panda task """ - self._events_per_file = int(job['nEventsPerInputFile']) + self._events_per_file = int(job["nEventsPerInputFile"]) # We only ever get one job - self._hits_per_file = int(job['esmergeSpec']['nEventsPerOutputFile']) + self._hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) is_n_to_one = self._hits_per_file >= self._events_per_file - files = job["inFiles"].split(',') + files = job["inFiles"].split(",") if files: - guids = job["GUID"].split(',') + guids = job["GUID"].split(",") for file, guid in zip(files, guids): self.files_guids[file] = guid if "scopeIn" in job: @@ -614,7 +733,9 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): guid = self.files_guids[file] for i in range(1, self._events_per_file + 1): range_id = BookKeeper.generate_event_range_id(file, i) - event_range = EventRange(range_id, i, i, file, guid, scope) + event_range = EventRange( + 
range_id, i, i, file, guid, scope + ) event_range.status = EventRange.FAILED failed_event_ranges.append(event_range) task_status.set_eventrange_failed(event_range) @@ -641,14 +762,23 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): skip_event = False continue # event range hasn't been merged but already simulated, add it as ready to be merged - if file_simulated_ranges is not None and range_id in file_simulated_ranges: - item = (file_simulated_ranges[range_id]["path"], event_range) + if ( + file_simulated_ranges is not None + and range_id in file_simulated_ranges + ): + item = ( + file_simulated_ranges[range_id]["path"], + event_range, + ) if event_range.PFN not in self.ranges_to_merge: self.ranges_to_merge[event_range.PFN] = [item] else: self.ranges_to_merge[event_range.PFN].append(item) # only for 1-to-n jobs, failure in n-t-1 have been handled in the 1st pass - elif file_failed_ranges is not None and range_id in file_failed_ranges: + elif ( + file_failed_ranges is not None + and range_id in file_failed_ranges + ): event_range.status = EventRange.FAILED job.event_ranges_queue.append(event_range) # event range hasn't been simulated, add it to the event range queue @@ -658,7 +788,8 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): job.event_ranges_queue.add_new_event_ranges(event_ranges) def add_event_ranges( - self, event_ranges: Mapping[str, Sequence[EventRangeDef]]) -> None: + self, event_ranges: Mapping[str, Sequence[EventRangeDef]] + ) -> None: """ Assign event ranges to the jobs in queue. @@ -738,11 +869,16 @@ def fetch_event_ranges(self, actor_id: str, n: int) -> List[EventRange]: if actor_id not in self.rangesID_by_actor: self.rangesID_by_actor[actor_id] = set() ranges = self.jobs.get_event_ranges( - self.actors[actor_id]).get_next_ranges(n) - self.rangesID_by_actor[actor_id].update(map(lambda e: e.eventRangeID, ranges)) + self.actors[actor_id] + ).get_next_ranges(n) + self.rangesID_by_actor[actor_id].update( + map(lambda e: e.eventRangeID, ranges) + ) return ranges - def get_file_to_merge(self) -> Optional[Tuple[str, List[Tuple[str, EventRange]]]]: + def get_file_to_merge( + self, + ) -> Optional[Tuple[str, List[Tuple[str, EventRange]]]]: """ Returns a merge tasks available for an arbitrary input file if available, None otherwise. 
""" @@ -752,17 +888,36 @@ def get_file_to_merge(self) -> Optional[Tuple[str, List[Tuple[str, EventRange]]] return merge_task return None - def report_merged_file(self, taskID: str, merged_output_file: str, merged_event_ranges: Mapping[str, Mapping[str, str]], guid: Optional[str]): + def report_merged_file( + self, + taskID: str, + merged_output_file: str, + merged_event_ranges: Mapping[str, Mapping[str, str]], + guid: Optional[str], + ): assert merged_output_file in self.ditributed_merge_tasks del self.ditributed_merge_tasks[merged_output_file] - self.taskstatus[taskID].set_file_merged(self._output_input_mapping[merged_output_file], merged_output_file, merged_event_ranges, guid) - - def report_failed_merge_transform(self, taskID: str, merged_output_file: str): + self.taskstatus[taskID].set_file_merged( + self._output_input_mapping[merged_output_file], + merged_output_file, + merged_event_ranges, + guid, + ) + + def report_failed_merge_transform( + self, taskID: str, merged_output_file: str + ): assert merged_output_file in self.ditributed_merge_tasks old_task = self.ditributed_merge_tasks.pop(merged_output_file) self.files_ready_to_merge[merged_output_file] = old_task - def process_event_ranges_update(self, actor_id: str, event_ranges_update: Union[Sequence[PilotEventRangeUpdateDef], EventRangeUpdate]): + def process_event_ranges_update( + self, + actor_id: str, + event_ranges_update: Union[ + Sequence[PilotEventRangeUpdateDef], EventRangeUpdate + ], + ): """ Process the event ranges update sent by the worker. This will update the status of event ranges in the update as well as building the list of event ranges to be tarred up for each input file. @@ -781,22 +936,29 @@ def process_event_ranges_update(self, actor_id: str, event_ranges_update: Union[ if not isinstance(event_ranges_update, EventRangeUpdate): event_ranges_update = EventRangeUpdate.build_from_dict( - panda_id, event_ranges_update) + panda_id, event_ranges_update + ) self.jobs.process_event_ranges_update(event_ranges_update) - task_status = self.taskstatus[self.jobs[panda_id]['taskID']] + task_status = self.taskstatus[self.jobs[panda_id]["taskID"]] job_ranges = self.jobs.get_event_ranges(panda_id) actor_ranges = self.rangesID_by_actor[actor_id] # 1st pass for failed ranges failed_files = [] for r in event_ranges_update[panda_id]: - status = r['eventStatus'] - if 'eventRangeID' in r and r['eventRangeID'] in actor_ranges and status in [EventRange.FAILED, EventRange.FATAL]: + status = r["eventStatus"] + if ( + "eventRangeID" in r + and r["eventRangeID"] in actor_ranges + and status in [EventRange.FAILED, EventRange.FATAL] + ): self._logger.info(f"Received failed event from {actor_id}: {r}") - evnt_range = job_ranges[r['eventRangeID']] + evnt_range = job_ranges[r["eventRangeID"]] if evnt_range.PFN in failed_files: continue - failed_files.extend(self.get_files_to_merge_with(evnt_range.PFN)) + failed_files.extend( + self.get_files_to_merge_with(evnt_range.PFN) + ) for file in failed_files: for i in range(1, self._events_per_file + 1): @@ -805,17 +967,19 @@ def process_event_ranges_update(self, actor_id: str, event_ranges_update: Union[ task_status.set_eventrange_failed(job_ranges[event_range_id]) for r in event_ranges_update[panda_id]: - if 'eventRangeID' in r and r['eventRangeID'] in actor_ranges: - range_id = r['eventRangeID'] + if "eventRangeID" in r and r["eventRangeID"] in actor_ranges: + range_id = r["eventRangeID"] actor_ranges.remove(range_id) evnt_range = job_ranges[range_id] if evnt_range.PFN in failed_files: continue - if 
r['eventStatus'] == EventRange.DONE: - task_status.set_eventrange_simulated(evnt_range, r['path']) + if r["eventStatus"] == EventRange.DONE: + task_status.set_eventrange_simulated(evnt_range, r["path"]) if evnt_range.PFN not in self.ranges_to_merge: self.ranges_to_merge[evnt_range.PFN] = list() - self.ranges_to_merge[evnt_range.PFN].append((r["path"], evnt_range)) + self.ranges_to_merge[evnt_range.PFN].append( + (r["path"], evnt_range) + ) now = time.time() if now - self.last_status_print > 60: self.last_status_print = now @@ -857,10 +1021,13 @@ def process_actor_end(self, actor_id: str) -> None: actor_ranges = self.rangesID_by_actor.get(actor_id, None) if not actor_ranges: return - self._logger.info(f"{actor_id} finished with {len(actor_ranges)} events remaining to process") + self._logger.info( + f"{actor_id} finished with {len(actor_ranges)} events remaining to process" + ) for rangeID in actor_ranges: self.jobs.get_event_ranges(panda_id).update_range_state( - rangeID, EventRange.READY) + rangeID, EventRange.READY + ) actor_ranges.clear() self.actors[actor_id] = None @@ -878,13 +1045,13 @@ def n_ready(self, panda_id: str) -> int: def n_events(self, panda_id: str) -> int: """ - Total number of events for a given Panda job + Total number of events for a given Panda job - Args: - panda_id: job worker_id to check + Args: + panda_id: job worker_id to check - Returns: - Number of events in panda_id + Returns: + Number of events in panda_id """ return len(self.jobs.get_event_ranges(panda_id)) diff --git a/src/raythena/utils/config.py b/src/raythena/utils/config.py index 00a9231..29a6f39 100644 --- a/src/raythena/utils/config.py +++ b/src/raythena/utils/config.py @@ -16,7 +16,7 @@ class Config: """ required_conf_settings = { - 'payload': { + "payload": { "pandaqueue": str, "logfilename": str, "extrasetup": str, @@ -30,29 +30,29 @@ class Config: "pilotkilltime": int, "timemonitorfile": str, }, - 'harvester': { - 'endpoint': str, - 'harvesterconf': str, + "harvester": { + "endpoint": str, + "harvesterconf": str, }, - 'ray': { - 'workdir': str, - 'taskprogressbasedir': str, - 'headip': str, - 'redisport': int, - 'redispassword': str, - 'timeoutinterval': int, - 'mergemaxprocesses': int, - 'cachesizefactor': int, + "ray": { + "workdir": str, + "taskprogressbasedir": str, + "headip": str, + "redisport": int, + "redispassword": str, + "timeoutinterval": int, + "mergemaxprocesses": int, + "cachesizefactor": int, }, - 'resources': { - 'corepernode': int, + "resources": { + "corepernode": int, + }, + "logging": { + "level": str, + "driverlogfile": str, + "workerlogfile": str, + "copyraylogs": bool, }, - 'logging': { - 'level': str, - 'driverlogfile': str, - 'workerlogfile': str, - 'copyraylogs': bool - } } def __init__(self, config_path: str, *args, **kwargs) -> None: @@ -89,11 +89,18 @@ def __str__(self): """ return str(self.__dict__) - def _parse_cli_args(self, config: str, debug: bool, - ray_head_ip: str, - ray_redis_password: str, ray_redis_port: str, - ray_workdir: str, harvester_endpoint: str, - panda_queue: str, core_per_node: int) -> None: + def _parse_cli_args( + self, + config: str, + debug: bool, + ray_head_ip: str, + ray_redis_password: str, + ray_redis_port: str, + ray_workdir: str, + harvester_endpoint: str, + panda_queue: str, + core_per_node: int, + ) -> None: """ Overrides config settings with settings specified via cli / env vars @@ -114,24 +121,28 @@ def _parse_cli_args(self, config: str, debug: bool, None """ if debug: - self.logging['level'] = 'debug' + self.logging["level"] = 
"debug" if ray_head_ip: - self.ray['headip'] = ray_head_ip + self.ray["headip"] = ray_head_ip if ray_redis_port: - self.ray['redispassword'] = ray_redis_password + self.ray["redispassword"] = ray_redis_password if ray_redis_port: - self.ray['redisport'] = ray_redis_port + self.ray["redisport"] = ray_redis_port if ray_workdir: - self.ray['workdir'] = ray_workdir + self.ray["workdir"] = ray_workdir if harvester_endpoint: - self.harvester['endpoint'] = harvester_endpoint + self.harvester["endpoint"] = harvester_endpoint if panda_queue: - self.payload['pandaqueue'] = panda_queue + self.payload["pandaqueue"] = panda_queue if core_per_node: - self.resources['corepernode'] = int(core_per_node) - - def _validate_section(self, template_section_name: str, - section_params: dict, template_params: dict) -> None: + self.resources["corepernode"] = int(core_per_node) + + def _validate_section( + self, + template_section_name: str, + section_params: dict, + template_params: dict, + ) -> None: """ Validate one section of the config file @@ -152,8 +163,11 @@ def _validate_section(self, template_section_name: str, f"Param '{name}' not found in conf section '{template_section_name}'" ) if isinstance(value, dict): - self._validate_section(f"{template_section_name}.{name}", - section_params.get(name), value) + self._validate_section( + f"{template_section_name}.{name}", + section_params.get(name), + value, + ) def _validate(self) -> None: """ @@ -166,12 +180,15 @@ def _validate(self) -> None: Exception: config file is invalid """ # validate pilot section - for template_section, template_params in Config.required_conf_settings.items( - ): + for ( + template_section, + template_params, + ) in Config.required_conf_settings.items(): section_params = getattr(self, template_section, None) if section_params is None: raise Exception( f"Malformed configuration file: section '{template_section}' not found" ) - self._validate_section(template_section, section_params, - template_params) + self._validate_section( + template_section, section_params, template_params + ) diff --git a/src/raythena/utils/eventservice.py b/src/raythena/utils/eventservice.py index cd1a858..dc27a34 100644 --- a/src/raythena/utils/eventservice.py +++ b/src/raythena/utils/eventservice.py @@ -16,18 +16,13 @@ EventRangeDef = MutableMapping[str, Builtin] FileInfo = Mapping[str, Builtin] PilotEventRangeUpdateDef = Mapping[ - str, - Union[ - Builtin, - FileInfo, - Sequence[ - EventRangeDef - ] - ] + str, Union[Builtin, FileInfo, Sequence[EventRangeDef]] ] HarvesterEventRangeUpdateDef = Sequence[MutableMapping[str, Builtin]] -EventRangeUpdateDef = Union[Sequence[PilotEventRangeUpdateDef], HarvesterEventRangeUpdateDef] +EventRangeUpdateDef = Union[ + Sequence[PilotEventRangeUpdateDef], HarvesterEventRangeUpdateDef +] EventRangeRequestDef = Mapping[str, Mapping[str, Builtin]] @@ -36,6 +31,7 @@ class Messages: """ Defines messages exchanged between ray actors and the driver """ + REQUEST_NEW_JOB = 0 REQUEST_EVENT_RANGES = 1 UPDATE_JOB = 2 @@ -109,10 +105,10 @@ def __init__(self, jobs: Mapping[str, JobDef] = None) -> None: if jobs: self.add_jobs(jobs) - def __getitem__(self, k: str) -> 'PandaJob': + def __getitem__(self, k: str) -> "PandaJob": return self.jobs[k] - def __setitem__(self, k: str, v: 'PandaJob') -> None: + def __setitem__(self, k: str, v: "PandaJob") -> None: if isinstance(v, PandaJob): self.jobs[k] = v else: @@ -127,7 +123,7 @@ def __len__(self) -> int: def __contains__(self, k: str) -> bool: return self.has_job(k) - def 
next_job_to_process(self) -> Optional['PandaJob']: + def next_job_to_process(self) -> Optional["PandaJob"]: """ Retrieve the next available job in the jobqueue. If the job is an eventservice job, it needs to have event ranges available otherwise it will not be considered as available @@ -181,7 +177,7 @@ def add_jobs(self, jobs: Mapping[str, JobDef]) -> None: for jobID, jobDef in jobs.items(): self.jobs[jobID] = PandaJob(jobDef) - def get_event_ranges(self, panda_id: str) -> 'EventRangeQueue': + def get_event_ranges(self, panda_id: str) -> "EventRangeQueue": """ Retrieve the EventRangeQueue for the given panda job @@ -194,8 +190,9 @@ def get_event_ranges(self, panda_id: str) -> 'EventRangeQueue': if panda_id in self.jobs: return self[panda_id].event_ranges_queue - def process_event_ranges_update(self, - ranges_update: 'EventRangeUpdate') -> None: + def process_event_ranges_update( + self, ranges_update: "EventRangeUpdate" + ) -> None: """ Update the range status Args: @@ -207,7 +204,9 @@ def process_event_ranges_update(self, for pandaID in ranges_update: self.get_event_ranges(pandaID).update_ranges(ranges_update[pandaID]) - def process_event_ranges_reply(self, reply: Mapping[str, HarvesterEventRangeUpdateDef]) -> None: + def process_event_ranges_reply( + self, reply: Mapping[str, HarvesterEventRangeUpdateDef] + ) -> None: """ Process an event ranges reply from harvester by adding ranges to each corresponding job already present in the queue. If an empty event list is received for a job, assume that no more events will be provided for this job @@ -224,11 +223,14 @@ def process_event_ranges_reply(self, reply: Mapping[str, HarvesterEventRangeUpda if not ranges: self[pandaID].no_more_ranges = True else: - ranges_obj = [EventRange.build_from_dict(range_dict) for range_dict in ranges] + ranges_obj = [ + EventRange.build_from_dict(range_dict) + for range_dict in ranges + ] self.get_event_ranges(pandaID).add_new_event_ranges(ranges_obj) @staticmethod - def build_from_dict(jobs_dict: Mapping[str, JobDef]) -> 'PandaJobQueue': + def build_from_dict(jobs_dict: Mapping[str, JobDef]) -> "PandaJobQueue": """ Convert dict of jobs returned by harvester to a PandaJobQueue. 
Args: @@ -361,14 +363,16 @@ def __iter__(self) -> Iterable[str]: def __len__(self) -> int: return len(self.event_ranges_by_id) - def __getitem__(self, k: str) -> 'EventRange': + def __getitem__(self, k: str) -> "EventRange": return self.event_ranges_by_id[k] - def __setitem__(self, k: str, v: 'EventRange') -> None: + def __setitem__(self, k: str, v: "EventRange") -> None: if not isinstance(v, EventRange): raise Exception(f"{v} should be of type {EventRange}") if k != v.eventRangeID: - raise Exception(f"Specified key '{k}' should be equals to the event range id '{v.eventRangeID}' ") + raise Exception( + f"Specified key '{k}' should be equals to the event range id '{v.eventRangeID}' " + ) if k in self.event_ranges_by_id: self.rangesID_by_state[v.status].remove(k) if v.PFN in self.rangesID_by_file: @@ -381,7 +385,9 @@ def __contains__(self, k: str) -> bool: return k in self.event_ranges_by_id @staticmethod - def build_from_list(ranges_list: Iterable[EventRangeDef]) -> 'EventRangeQueue': + def build_from_list( + ranges_list: Iterable[EventRangeDef], + ) -> "EventRangeQueue": """ Build an EventRangeQueue from a list of event ranges sent by harvester @@ -399,7 +405,7 @@ def build_from_list(ranges_list: Iterable[EventRangeDef]) -> 'EventRangeQueue': def _get_file_from_id(self, range_id: str) -> str: return os.path.basename(self.event_ranges_by_id[range_id].PFN) - def update_range_state(self, range_id: str, new_state: str) -> 'EventRange': + def update_range_state(self, range_id: str, new_state: str) -> "EventRange": """ Update the status of an event range Args: @@ -411,10 +417,14 @@ def update_range_state(self, range_id: str, new_state: str) -> 'EventRange': """ if range_id not in self.event_ranges_by_id: raise Exception( - f"Trying to update non-existing eventrange {range_id}") + f"Trying to update non-existing eventrange {range_id}" + ) event_range = self.event_ranges_by_id[range_id] - if new_state != EventRange.READY and event_range.status == EventRange.READY: + if ( + new_state != EventRange.READY + and event_range.status == EventRange.READY + ): self.rangesID_by_file[event_range.PFN].remove(range_id) elif new_state == EventRange.READY: self.rangesID_by_file[event_range.PFN].add(range_id) @@ -427,7 +437,7 @@ def update_range_state(self, range_id: str, new_state: str) -> 'EventRange': # rangesID_by_file only hold ids of ranges that are ready to be assigned return event_range - def assign_ready_ranges(self, n_ranges=1) -> List['EventRange']: + def assign_ready_ranges(self, n_ranges=1) -> List["EventRange"]: n_ranges = min(self.nranges_available(), n_ranges) if not n_ranges: return list() @@ -467,8 +477,8 @@ def update_ranges(self, ranges_update: Sequence[EventRangeDef]) -> None: None """ for r in ranges_update: - range_id = r['eventRangeID'] - range_status = r['eventStatus'] + range_id = r["eventRangeID"] + range_status = r["eventStatus"] if range_id not in self.event_ranges_by_id: raise Exception() self.update_range_state(range_id, range_status) @@ -483,8 +493,9 @@ def nranges_remaining(self) -> int: Returns: Number of event ranges which are not finished or failed """ - return len(self.event_ranges_by_id) - (self.nranges_done() + - self.nranges_failed()) + return len(self.event_ranges_by_id) - ( + self.nranges_done() + self.nranges_failed() + ) def nranges_available(self) -> int: """ @@ -522,7 +533,7 @@ def nranges_done(self) -> int: """ return self._get_ranges_count(EventRange.DONE) - def append(self, event_range: Union[EventRangeDef, 'EventRange']) -> None: + def append(self, event_range: 
Union[EventRangeDef, "EventRange"]) -> None: """ Append a single event range to the queue @@ -543,9 +554,11 @@ def append(self, event_range: Union[EventRangeDef, 'EventRange']) -> None: self.rangesID_by_file[event_range.PFN].add(event_range.eventRangeID) self.event_ranges_count[event_range.status] += 1 - def add_new_event_ranges(self, ranges: Sequence['EventRange']) -> None: + def add_new_event_ranges(self, ranges: Sequence["EventRange"]) -> None: # PRE: all ranges in the list are in state ready - self.rangesID_by_state[EventRange.READY].update(map(lambda e: e.eventRangeID, ranges)) + self.rangesID_by_state[EventRange.READY].update( + map(lambda e: e.eventRangeID, ranges) + ) self.event_ranges_count[EventRange.READY] += len(ranges) for r in ranges: self.event_ranges_by_id[r.eventRangeID] = r @@ -553,7 +566,9 @@ def add_new_event_ranges(self, ranges: Sequence['EventRange']) -> None: self.rangesID_by_file[r.PFN] = set() self.rangesID_by_file[r.PFN].add(r.eventRangeID) - def concat(self, ranges: Sequence[Union[EventRangeDef, 'EventRange']]) -> None: + def concat( + self, ranges: Sequence[Union[EventRangeDef, "EventRange"]] + ) -> None: """ Concatenate a list of event ranges to the queue @@ -566,7 +581,7 @@ def concat(self, ranges: Sequence[Union[EventRangeDef, 'EventRange']]) -> None: for r in ranges: self.append(r) - def get_next_ranges(self, nranges: int) -> List['EventRange']: + def get_next_ranges(self, nranges: int) -> List["EventRange"]: """ Dequeue event ranges. Event ranges which were dequeued are updated to the 'ASSIGNED' status and should be assigned to workers to be processed. In case more ranges are requested @@ -686,7 +701,12 @@ class EventRangeUpdate: """ - def __init__(self, range_update: Dict[str, List[MutableMapping[str, Union[str, int]]]] = None) -> None: + def __init__( + self, + range_update: Dict[ + str, List[MutableMapping[str, Union[str, int]]] + ] = None, + ) -> None: """ Wraps the range update dict in an object. The range update should be in the harvester-supported format. @@ -699,7 +719,9 @@ def __init__(self, range_update: Dict[str, List[MutableMapping[str, Union[str, i for v in range_update.values(): if not isinstance(v, list): raise Exception(f"Expecting type list for element {v}") - self.range_update: Dict[str, HarvesterEventRangeUpdateDef] = range_update + self.range_update: Dict[str, HarvesterEventRangeUpdateDef] = ( + range_update + ) def __len__(self) -> int: return len(self.range_update) @@ -718,7 +740,7 @@ def __setitem__(self, k: str, v: HarvesterEventRangeUpdateDef) -> None: raise Exception(f"Expecting type list for element {v}") self.range_update[k] = v - def merge_update(self, other: 'EventRangeUpdate') -> None: + def merge_update(self, other: "EventRangeUpdate") -> None: for pandaID in other: if pandaID in self: self[pandaID] += other[pandaID] @@ -726,8 +748,9 @@ def merge_update(self, other: 'EventRangeUpdate') -> None: self[pandaID] = other[pandaID] @staticmethod - def build_from_dict(panda_id: str, - range_update: Sequence[PilotEventRangeUpdateDef]) -> 'EventRangeUpdate': + def build_from_dict( + panda_id: str, range_update: Sequence[PilotEventRangeUpdateDef] + ) -> "EventRangeUpdate": """ Parses a range_update dict to a format adapted to be sent to harvester. 
@@ -740,23 +763,29 @@ def build_from_dict(panda_id: str, """ update_dict = dict() update_dict[panda_id] = list() - if isinstance( - range_update, dict - ) and "zipFile" not in range_update and "esOutput" not in range_update \ - and "eventRangeID" not in range_update: - range_update: Sequence[PilotEventRangeUpdateDef] = json.loads(range_update['eventRanges'][0]) + if ( + isinstance(range_update, dict) + and "zipFile" not in range_update + and "esOutput" not in range_update + and "eventRangeID" not in range_update + ): + range_update: Sequence[PilotEventRangeUpdateDef] = json.loads( + range_update["eventRanges"][0] + ) for range_elt in range_update: if "zipFile" in range_elt and range_elt["zipFile"]: range_update_type = "zipFile" - file_info: FileInfo = range_elt.get('zipFile', None) + file_info: FileInfo = range_elt.get("zipFile", None) elif "esOutput" in range_elt and range_elt["esOutput"]: range_update_type = "esOutput" - file_info: FileInfo = range_elt.get('esOutput', None) + file_info: FileInfo = range_elt.get("esOutput", None) else: range_update_type = None file_info: None = None - ranges_info: Sequence[EventRangeDef] = range_elt.get('eventRanges', None) + ranges_info: Sequence[EventRangeDef] = range_elt.get( + "eventRanges", None + ) file_data = dict() if file_info: @@ -764,30 +793,30 @@ def build_from_dict(panda_id: str, ftype = "es_output" else: ftype = "zip_output" - file_data['path'] = file_info['lfn'] - file_data['chksum'] = file_info['adler32'] - file_data['fsize'] = file_info['fsize'] - file_data['type'] = ftype + file_data["path"] = file_info["lfn"] + file_data["chksum"] = file_info["adler32"] + file_data["fsize"] = file_info["fsize"] + file_data["type"] = ftype if ranges_info: for rangeInfo in ranges_info: elt = dict() - elt['eventRangeID'] = rangeInfo['eventRangeID'] - elt['eventStatus'] = rangeInfo['eventStatus'] + elt["eventRangeID"] = rangeInfo["eventRangeID"] + elt["eventStatus"] = rangeInfo["eventStatus"] if range_update_type == "esOutput": - elt['path'] = rangeInfo['pfn'] - elt['chksum'] = rangeInfo['adler32'] - elt['fsize'] = rangeInfo['fsize'] + elt["path"] = rangeInfo["pfn"] + elt["chksum"] = rangeInfo["adler32"] + elt["fsize"] = rangeInfo["fsize"] elt.update(file_data) update_dict[panda_id].append(elt) else: elt = dict() - elt['eventRangeID'] = range_elt['eventRangeID'] - elt['eventStatus'] = range_elt['eventStatus'] + elt["eventRangeID"] = range_elt["eventRangeID"] + elt["eventStatus"] = range_elt["eventStatus"] if range_update_type == "esOutput": - elt['path'] = range_elt['pfn'] - elt['chksum'] = range_elt['adler32'] - elt['fsize'] = range_elt['fsize'] + elt["path"] = range_elt["pfn"] + elt["chksum"] = range_elt["adler32"] + elt["fsize"] = range_elt["fsize"] elt.update(file_data) update_dict[panda_id].append(elt) @@ -814,17 +843,19 @@ class PandaJobRequest: Note that harvester will ignore the content of the job request file and simply check if it exists """ - def __init__(self, - node: str = None, - disk_space: str = None, - working_group: str = None, - prod_source_label: str = None, - computing_element: str = None, - site_name: str = None, - resource_type: str = None, - mem: str = None, - cpu: str = None, - allow_other_country: str = None) -> None: + def __init__( + self, + node: str = None, + disk_space: str = None, + working_group: str = None, + prod_source_label: str = None, + computing_element: str = None, + site_name: str = None, + resource_type: str = None, + mem: str = None, + cpu: str = None, + allow_other_country: str = None, + ) -> None: self.node = 
node self.diskSpace = disk_space self.workingGroup = working_group @@ -873,7 +904,9 @@ def __getitem__(self, k: str) -> Dict[str, Builtin]: def __str__(self) -> str: return json.dumps(self.request) - def add_event_request(self, panda_id: str, n_ranges: int, task_id: str, jobset_id: str) -> None: + def add_event_request( + self, panda_id: str, n_ranges: int, task_id: str, jobset_id: str + ) -> None: """ Adds a job for which event ranges should be requested to the request object @@ -887,14 +920,16 @@ def add_event_request(self, panda_id: str, n_ranges: int, task_id: str, jobset_i """ self.request[panda_id] = { - 'pandaID': panda_id, - 'nRanges': n_ranges, - 'taskID': task_id, - 'jobsetID': jobset_id + "pandaID": panda_id, + "nRanges": n_ranges, + "taskID": task_id, + "jobsetID": jobset_id, } @staticmethod - def build_from_dict(request_dict: Mapping[str, Dict[str, Builtin]]) -> 'EventRangeRequest': + def build_from_dict( + request_dict: Mapping[str, Dict[str, Builtin]], + ) -> "EventRangeRequest": """ Build a request object from a dict parsed from its json representation @@ -1002,7 +1037,7 @@ def nranges_available(self) -> int: """ return self.event_ranges_queue.nranges_available() - def get_next_ranges(self, nranges: int) -> List['EventRange']: + def get_next_ranges(self, nranges: int) -> List["EventRange"]: """ See Also: EventRangeQueue.get_next_ranges() @@ -1016,7 +1051,7 @@ def get_pandaQueue(self) -> str: Returns: Name of the panda queue from which harvester is retrieving jobs """ - return self['destinationSE'] + return self["destinationSE"] def get_id(self) -> str: """ @@ -1025,7 +1060,7 @@ def get_id(self) -> str: Returns: the job worker_id """ - return self['PandaID'] + return self["PandaID"] def __str__(self) -> str: return json.dumps(self.job) @@ -1073,8 +1108,15 @@ class EventRange: FATAL = "fatal" STATES = [READY, ASSIGNED, DONE, FAILED, FATAL] - def __init__(self, event_range_id: str, start_event: int, last_event: int, - pfn: str, guid: str, scope: str) -> None: + def __init__( + self, + event_range_id: str, + start_event: int, + last_event: int, + pfn: str, + guid: str, + scope: str, + ) -> None: """ Initialize the range @@ -1140,7 +1182,7 @@ def __str__(self) -> str: """ return json.dumps(self.to_dict()) - def __eq__(self, o: 'EventRange') -> bool: + def __eq__(self, o: "EventRange") -> bool: if not isinstance(o, EventRange): return False return self.eventRangeID == o.eventRangeID @@ -1153,15 +1195,15 @@ def to_dict(self) -> EventRangeDef: dict serialization of the range """ return { - 'PFN': self.PFN, - 'lastEvent': self.lastEvent, - 'eventRangeID': self.eventRangeID, - 'startEvent': self.startEvent, - 'GUID': self.GUID + "PFN": self.PFN, + "lastEvent": self.lastEvent, + "eventRangeID": self.eventRangeID, + "startEvent": self.startEvent, + "GUID": self.GUID, } @staticmethod - def build_from_dict(event_ranges_dict: EventRangeDef) -> 'EventRange': + def build_from_dict(event_ranges_dict: EventRangeDef) -> "EventRange": """ Construct an event range from a dict returned by harvester @@ -1172,10 +1214,13 @@ def build_from_dict(event_ranges_dict: EventRangeDef) -> 'EventRange': EventRange object """ return EventRange( - event_ranges_dict['eventRangeID'], event_ranges_dict['startEvent'], - event_ranges_dict['lastEvent'], - event_ranges_dict.get('PFN', event_ranges_dict.get('LFN', None)), - event_ranges_dict['GUID'], event_ranges_dict['scope']) + event_ranges_dict["eventRangeID"], + event_ranges_dict["startEvent"], + event_ranges_dict["lastEvent"], + event_ranges_dict.get("PFN", 
event_ranges_dict.get("LFN", None)), + event_ranges_dict["GUID"], + event_ranges_dict["scope"], + ) class JobReport: @@ -1190,10 +1235,9 @@ class JobReport: """ - def __init__(self, - exitCode: int = 0, - exitMsg: str = None, - exitMsgExtra: str = None) -> None: + def __init__( + self, exitCode: int = 0, exitMsg: str = None, exitMsgExtra: str = None + ) -> None: self.exitCode = exitCode self.exitMsg = exitMsg self.exitMsgExtra = exitMsgExtra diff --git a/src/raythena/utils/exception.py b/src/raythena/utils/exception.py index 7845869..4029114 100644 --- a/src/raythena/utils/exception.py +++ b/src/raythena/utils/exception.py @@ -18,7 +18,7 @@ class ErrorCodes: STAGEIN_FAILED: "Failed to stagein data", STAGEOUT_FAILED: "Failed to stageout data", PAYLOAD_FAILED: "Payload execution failed", - UNKNOWN: "Unknown error" + UNKNOWN: "Unknown error", } @staticmethod @@ -105,7 +105,9 @@ class BaseRaythenaException(Exception): Base class for raythena exception """ - def __init__(self, worker_id: str, error_code: int, message: str = None) -> None: + def __init__( + self, worker_id: str, error_code: int, message: str = None + ) -> None: """ Initialize worker_id, error code and message @@ -116,8 +118,9 @@ def __init__(self, worker_id: str, error_code: int, message: str = None) -> None """ self.worker_id = worker_id self.error_code = error_code - self.message = message if message else ErrorCodes.get_error_message( - error_code) + self.message = ( + message if message else ErrorCodes.get_error_message(error_code) + ) super().__init__(self.message) def __reduce__(self): @@ -129,13 +132,22 @@ class IllegalWorkerState(BaseRaythenaException): Raised when the worker state tries to transition to a state he shouldn't be able to from its current state. """ - def __init__(self, worker_id: str, src_state: str, dst_state: str, message: str = None) -> None: + def __init__( + self, + worker_id: str, + src_state: str, + dst_state: str, + message: str = None, + ) -> None: super().__init__(worker_id, ErrorCodes.ILLEGAL_WORKER_STATE, message) self.src_state = src_state self.dst_state = dst_state def __reduce__(self): - return (self.__class__, (self.worker_id, self.src_state, self.dst_state, self.message)) + return ( + self.__class__, + (self.worker_id, self.src_state, self.dst_state, self.message), + ) class StageInFailed(BaseRaythenaException): @@ -192,7 +204,9 @@ class WrappedException(BaseRaythenaException): """ def __init__(self, worker_id: str, e: Exception) -> None: - super().__init__(worker_id, ErrorCodes.UNKNOWN, f"Wrapped exception {e}") + super().__init__( + worker_id, ErrorCodes.UNKNOWN, f"Wrapped exception {e}" + ) self.wrapped_exception = e def __reduce__(self): diff --git a/src/raythena/utils/importUtils.py b/src/raythena/utils/importUtils.py index 17ae8ef..7b3168d 100644 --- a/src/raythena/utils/importUtils.py +++ b/src/raythena/utils/importUtils.py @@ -16,10 +16,10 @@ def import_from_string(module_path: str) -> Callable: Raises: ImportError if the specified class couldn't be found """ - module, _, instance = module_path.partition(':') + module, _, instance = module_path.partition(":") module = importlib.import_module(module) - for elt in instance.split('.'): + for elt in instance.split("."): if not hasattr(module, elt): raise ImportError(f"Can't import {elt} from {module}") module = getattr(module, elt) diff --git a/src/raythena/utils/logging.py b/src/raythena/utils/logging.py index 90e0858..7716827 100644 --- a/src/raythena/utils/logging.py +++ b/src/raythena/utils/logging.py @@ -7,7 +7,9 @@ _initialized = 
False -def make_logger(config: Config, name: str, filepath: str = None) -> logging.Logger: +def make_logger( + config: Config, name: str, filepath: str = None +) -> logging.Logger: global _initialized if not _initialized: configure_logger(config, filepath) @@ -16,7 +18,7 @@ def make_logger(config: Config, name: str, filepath: str = None) -> logging.Logg def log_to_file(log_level, filepath: str): - fh = logging.FileHandler(filepath, mode='w') + fh = logging.FileHandler(filepath, mode="w") fh.setFormatter(logging.Formatter(*get_fmt(log_level))) logging.getLogger().addHandler(fh) @@ -33,7 +35,7 @@ def get_fmt(log_level): fmt = "{asctime} | {levelname:8} | {name}:{funcName} | {message}" else: fmt = "{asctime} | {levelname:8} | {name} | {message}" - return fmt, "%Y-%m-%d %H:%M:%S", '{' + return fmt, "%Y-%m-%d %H:%M:%S", "{" def configure_logger(config: Config, filepath: str) -> None: @@ -47,11 +49,11 @@ def configure_logger(config: Config, filepath: str) -> None: Returns: None """ - log_level = config.logging.get('level', 'warning').upper() + log_level = config.logging.get("level", "warning").upper() logging.Formatter.converter = gmtime handlers = list() if filepath: - fh = logging.FileHandler(filepath, mode='w') + fh = logging.FileHandler(filepath, mode="w") handlers.append(fh) else: ch = logging.StreamHandler(sys.stdout) @@ -62,4 +64,5 @@ def configure_logger(config: Config, filepath: str) -> None: style=style, datefmt=datefmt, level=logging.getLevelName(log_level), - handlers=handlers) + handlers=handlers, + ) diff --git a/src/raythena/utils/ray.py b/src/raythena/utils/ray.py index e1d9fe9..342de59 100644 --- a/src/raythena/utils/ray.py +++ b/src/raythena/utils/ray.py @@ -6,8 +6,9 @@ from raythena.utils.config import Config -def build_nodes_resource_list(config: Config, - run_actor_on_head: bool = False) -> List[Mapping[str, Any]]: +def build_nodes_resource_list( + config: Config, run_actor_on_head: bool = False +) -> List[Mapping[str, Any]]: """ Build and setup ray custom resources. Actors should then be instantiated by requiring one of the resource in the returned list. 
@@ -23,11 +24,11 @@ def build_nodes_resource_list(config: Config, nodes = ray.nodes() if len(nodes) == 1: # only a head node run_actor_on_head = True - head_ip = config.ray['headip'] + head_ip = config.ray["headip"] resource_list = list() for node in nodes: - naddr = node['NodeManagerAddress'] - if not node['alive'] or (not run_actor_on_head and naddr == head_ip): + naddr = node["NodeManagerAddress"] + if not node["alive"] or (not run_actor_on_head and naddr == head_ip): continue else: resource_list.extend([node]) @@ -56,8 +57,9 @@ def is_external_cluster(config: Config) -> bool: Returns: True if raythena is connecting to an existing cluster, False otherwise """ - return config.ray['headip'] is not None and config.ray[ - 'redisport'] is not None + return ( + config.ray["headip"] is not None and config.ray["redisport"] is not None + ) def setup_ray(config: Config) -> Any: @@ -70,10 +72,16 @@ def setup_ray(config: Config) -> Any: Returns: dict of cluster params """ - log_to_driver = True if not config.logging.get('workerlogfile', None) else False + log_to_driver = ( + True if not config.logging.get("workerlogfile", None) else False + ) if is_external_cluster(config): ray_url = f"{config.ray['headip']}:{config.ray['redisport']}" - return ray.init(address=ray_url, _redis_password=config.ray['redispassword'], log_to_driver=log_to_driver) + return ray.init( + address=ray_url, + _redis_password=config.ray["redispassword"], + log_to_driver=log_to_driver, + ) else: return ray.init(log_to_driver=log_to_driver) diff --git a/src/raythena/utils/timing.py b/src/raythena/utils/timing.py index 1e44268..b17188e 100644 --- a/src/raythena/utils/timing.py +++ b/src/raythena/utils/timing.py @@ -12,11 +12,14 @@ class CPUMonitor: """ Monitoring tools recording system cpu utilization as well as process cpu utilization to a file """ + def __init__(self, log_file: str, pid: Any = None) -> None: self.process = psutil.Process(pid) self.log_file = log_file self.stop_event = Event() - self.monitor_thread = ExThread(target=self.monitor_cpu, name="cpu_monitor") + self.monitor_thread = ExThread( + target=self.monitor_cpu, name="cpu_monitor" + ) self.write_interval = 10 * 60 self.time_step = 1 @@ -40,10 +43,14 @@ def stop(self) -> None: if not self.stop_event.is_set(): self.stop_event.set() self.monitor_thread.join() - self.monitor_thread = ExThread(target=self.monitor_cpu, name="cpu_monitor") + self.monitor_thread = ExThread( + target=self.monitor_cpu, name="cpu_monitor" + ) self.stop_event = Event() - def _log_to_file(self, data: Dict[str, Union[Dict[str, List], List, int]]) -> None: + def _log_to_file( + self, data: Dict[str, Union[Dict[str, List], List, int]] + ) -> None: """ Write data to log file @@ -53,7 +60,7 @@ def _log_to_file(self, data: Dict[str, Union[Dict[str, List], List, int]]) -> No Returns: None """ - with open(self.log_file, 'w') as f: + with open(self.log_file, "w") as f: json.dump(data, f) def monitor_cpu(self) -> None: @@ -93,7 +100,7 @@ def monitor_cpu(self) -> None: "system_usage": system_usage, "process_usage": process_usage, "process_times": process_times, - "time_step": self.time_step + "time_step": self.time_step, } while not self.stop_event.is_set(): @@ -112,7 +119,6 @@ def monitor_cpu(self) -> None: class Timing: - def __init__(self): self._timings = dict() diff --git a/tests/conftest.py b/tests/conftest.py index e227244..584161b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,16 +20,18 @@ def requires_ray(config_base): @pytest.fixture(scope="class") def config_base(config_path): 
- return Config(config_path, - config=None, - debug=False, - ray_head_ip=None, - ray_redis_password=None, - ray_redis_port=None, - ray_workdir=None, - harvester_endpoint=None, - panda_queue=None, - core_per_node=None) + return Config( + config_path, + config=None, + debug=False, + ray_head_ip=None, + ray_redis_password=None, + ray_redis_port=None, + ray_workdir=None, + harvester_endpoint=None, + panda_queue=None, + core_per_node=None, + ) @pytest.fixture @@ -58,7 +60,7 @@ def pandaids(njobs): res = [] for i in range(njobs): hash = hashlib.md5() - hash.update(str(time.time()).encode('utf-8')) + hash.update(str(time.time()).encode("utf-8")) res.append(hash.hexdigest()) return res @@ -80,7 +82,11 @@ def nhits_per_file(nevents_per_file): @pytest.fixture def range_ids(nfiles, nevents_per_file): - return [f"EVNT_{file}.pool.root.1-{event}" for event in range(1, nevents_per_file + 1) for file in range(nfiles)] + return [ + f"EVNT_{file}.pool.root.1-{event}" + for event in range(1, nevents_per_file + 1) + for file in range(nfiles) + ] @pytest.fixture @@ -94,41 +100,42 @@ def sample_ranges(range_ids, pandaids, input_output_file_list): range_list = list() res[pandaID] = range_list for i in range(nevents): - range_list.append({ - 'lastEvent': i, - 'eventRangeID': range_ids[i], - 'startEvent': i, - 'scope': '13Mev', - 'LFN': files[i % nfiles], - 'GUID': '0' - }) + range_list.append( + { + "lastEvent": i, + "eventRangeID": range_ids[i], + "startEvent": i, + "scope": "13Mev", + "LFN": files[i % nfiles], + "GUID": "0", + } + ) return res @pytest.fixture def sample_rangeupdate(range_ids): - return [{ - "zipFile": { - "numEvents": len(range_ids), - "lfn": "EventService_premerge_Range-00000.tar", - "adler32": "36503831", - "objstoreID": 1641, - "fsize": 860160, - "pathConvention": 1000 - }, - "eventRanges": [{ - "eventRangeID": r, - "eventStatus": "finished" - } for r in range_ids] - }] + return [ + { + "zipFile": { + "numEvents": len(range_ids), + "lfn": "EventService_premerge_Range-00000.tar", + "adler32": "36503831", + "objstoreID": 1641, + "fsize": 860160, + "pathConvention": 1000, + }, + "eventRanges": [ + {"eventRangeID": r, "eventStatus": "finished"} + for r in range_ids + ], + } + ] @pytest.fixture def sample_failed_rangeupdate(range_ids): - return [{ - "eventRangeID": r, - "eventStatus": "failed" - } for r in range_ids] + return [{"eventRangeID": r, "eventStatus": "failed"} for r in range_ids] @pytest.fixture @@ -147,159 +154,120 @@ def input_output_file_list(nfiles, nhits_per_file, nevents_per_file): @pytest.fixture -def sample_multijobs(request, input_output_file_list, is_eventservice, pandaids, nhits_per_file, nevents_per_file): +def sample_multijobs( + request, + input_output_file_list, + is_eventservice, + pandaids, + nhits_per_file, + nevents_per_file, +): res = {} (input_files, output_files) = input_output_file_list for pandaID in pandaids: hash = hashlib.md5() - hash.update(str(time.time()).encode('utf-8')) + hash.update(str(time.time()).encode("utf-8")) log_guid = hash.hexdigest() - hash.update(str(time.time()).encode('utf-8')) + hash.update(str(time.time()).encode("utf-8")) job_name = hash.hexdigest() - jobsetId = '0' - taskId = '0' - ncores = '8' - guid = '0' + jobsetId = "0" + taskId = "0" + ncores = "8" + guid = "0" scope = "13Mev" panda_queue_name = f"pandaqueue_{hash.hexdigest()}" inFiles = ",".join(input_files) outFiles = ",".join(output_files) outFilesShort = f"[{','.join([str(i) for i in range(len(outFiles))])}]" res[pandaID] = { - 'jobsetID': - jobsetId, - 'nEventsPerInputFile': 
nevents_per_file, - 'esmergeSpec': { + "jobsetID": jobsetId, + "nEventsPerInputFile": nevents_per_file, + "esmergeSpec": { "transPath": "", "jobParameters": "", - "nEventsPerOutputFile": nhits_per_file + "nEventsPerOutputFile": nhits_per_file, }, - 'logGUID': - log_guid, - 'cmtConfig': - 'x86_64-slc6-gcc49-opt', - 'prodDBlocks': - 'user.mlassnig:user.mlassnig.pilot.test.single.hits', - 'dispatchDBlockTokenForOut': - 'NULL,NULL', - 'destinationDBlockToken': - 'NULL,NULL', - 'destinationSE': - panda_queue_name, - 'realDatasets': - job_name, - 'prodUserID': - 'no_one', - 'GUID': - ",".join([f"{guid}{i}" for i in range(len(input_files))]), - 'realDatasetsIn': - 'user.mlassnig:user.mlassnig.pilot.test.single.hits', - 'nSent': - 0, - 'eventService': - str(is_eventservice), - 'cloud': - 'US', - 'StatusCode': - 0, - 'homepackage': - 'AtlasOffline/21.0.15', - 'inFiles': - inFiles, - 'processingType': - 'pilot-ptest', - 'ddmEndPointOut': - 'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - 'fsize': - '118612262', - 'fileDestinationSE': - f"{panda_queue_name},{panda_queue_name}", - 'scopeOut': - 'panda', - 'minRamCount': - 0, - 'jobDefinitionID': - 7932, - 'maxWalltime': - 'NULL', - 'scopeLog': - 'panda', - 'transformation': - 'Sim_tf.py', - 'maxDiskCount': - 0, - 'coreCount': - ncores, - 'prodDBlockToken': - 'NULL', - 'transferType': - 'NULL', - 'destinationDblock': - job_name, - 'dispatchDBlockToken': - 'NULL', - 'jobPars': ( + "logGUID": log_guid, + "cmtConfig": "x86_64-slc6-gcc49-opt", + "prodDBlocks": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "dispatchDBlockTokenForOut": "NULL,NULL", + "destinationDBlockToken": "NULL,NULL", + "destinationSE": panda_queue_name, + "realDatasets": job_name, + "prodUserID": "no_one", + "GUID": ",".join([f"{guid}{i}" for i in range(len(input_files))]), + "realDatasetsIn": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "nSent": 0, + "eventService": str(is_eventservice), + "cloud": "US", + "StatusCode": 0, + "homepackage": "AtlasOffline/21.0.15", + "inFiles": inFiles, + "processingType": "pilot-ptest", + "ddmEndPointOut": "UTA_SWT2_DATADISK,UTA_SWT2_DATADISK", + "fsize": "118612262", + "fileDestinationSE": f"{panda_queue_name},{panda_queue_name}", + "scopeOut": "panda", + "minRamCount": 0, + "jobDefinitionID": 7932, + "maxWalltime": "NULL", + "scopeLog": "panda", + "transformation": "Sim_tf.py", + "maxDiskCount": 0, + "coreCount": ncores, + "prodDBlockToken": "NULL", + "transferType": "NULL", + "destinationDblock": job_name, + "dispatchDBlockToken": "NULL", + "jobPars": ( '--eventService=%s --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' - 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' + "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' - '--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so ' - '--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py ' - '--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 ' - '--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root)' - % (str(is_eventservice), inFiles, outFilesShort)), - 'attemptNr': - 0, - 'swRelease': - 'Atlas-21.0.15', - 'nucleus': - 'NULL', - 'maxCpuCount': - 0, - 'outFiles': - outFiles, - 'currentPriority': - 1000, - 'scopeIn': - scope, - 'PandaID': - pandaID, - 'sourceSite': - 'NULL', - 'dispatchDblock': - 
'NULL', - 'prodSourceLabel': - 'ptest', - 'checksum': - 'ad:5d000974', - 'jobName': - job_name, - 'ddmEndPointIn': - 'UTA_SWT2_DATADISK', - 'taskID': - taskId, - 'logFile': - '%s.job.log.tgz' % job_name + "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " + "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py " + "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " + "--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root)" + % (str(is_eventservice), inFiles, outFilesShort) + ), + "attemptNr": 0, + "swRelease": "Atlas-21.0.15", + "nucleus": "NULL", + "maxCpuCount": 0, + "outFiles": outFiles, + "currentPriority": 1000, + "scopeIn": scope, + "PandaID": pandaID, + "sourceSite": "NULL", + "dispatchDblock": "NULL", + "prodSourceLabel": "ptest", + "checksum": "ad:5d000974", + "jobName": job_name, + "ddmEndPointIn": "UTA_SWT2_DATADISK", + "taskID": taskId, + "logFile": "%s.job.log.tgz" % job_name, } return res @pytest.fixture -def sample_job(is_eventservice, input_output_file_list, nhits_per_file, nevents_per_file): +def sample_job( + is_eventservice, input_output_file_list, nhits_per_file, nevents_per_file +): hash = hashlib.md5() (input_files, output_files) = input_output_file_list - hash.update(str(time.time()).encode('utf-8')) + hash.update(str(time.time()).encode("utf-8")) log_guid = hash.hexdigest() - hash.update(str(time.time()).encode('utf-8')) + hash.update(str(time.time()).encode("utf-8")) job_name = hash.hexdigest() - pandaID = '0' - jobsetId = '0' - taskId = '0' - ncores = '8' - guid = '0' + pandaID = "0" + jobsetId = "0" + taskId = "0" + ncores = "8" + guid = "0" scope = "13Mev" panda_queue_name = "pandaqueue" inFiles = ",".join(input_files) @@ -307,118 +275,70 @@ def sample_job(is_eventservice, input_output_file_list, nhits_per_file, nevents_ outFilesShort = f"[{','.join([str(i) for i in range(len(outFiles))])}]" return { pandaID: { - 'jobsetID': - jobsetId, - 'logGUID': - log_guid, - 'nEventsPerInputFile': nevents_per_file, - 'esmergeSpec': { + "jobsetID": jobsetId, + "logGUID": log_guid, + "nEventsPerInputFile": nevents_per_file, + "esmergeSpec": { "transPath": "", "jobParameters": "", - "nEventsPerOutputFile": nhits_per_file + "nEventsPerOutputFile": nhits_per_file, }, - 'cmtConfig': - 'x86_64-slc6-gcc49-opt', - 'prodDBlocks': - 'user.mlassnig:user.mlassnig.pilot.test.single.hits', - 'dispatchDBlockTokenForOut': - 'NULL,NULL', - 'destinationDBlockToken': - 'NULL,NULL', - 'destinationSE': - panda_queue_name, - 'realDatasets': - job_name, - 'prodUserID': - 'no_one', - 'GUID': - guid, - 'realDatasetsIn': - 'user.mlassnig:user.mlassnig.pilot.test.single.hits', - 'nSent': - 0, - 'eventService': - str(is_eventservice), - 'cloud': - 'US', - 'StatusCode': - 0, - 'homepackage': - 'AtlasOffline/21.0.15', - 'inFiles': - inFiles, - 'processingType': - 'pilot-ptest', - 'ddmEndPointOut': - 'UTA_SWT2_DATADISK,UTA_SWT2_DATADISK', - 'fsize': - '118612262', - 'fileDestinationSE': - f"{panda_queue_name},{panda_queue_name}", - 'scopeOut': - 'panda', - 'minRamCount': - 0, - 'jobDefinitionID': - 7932, - 'maxWalltime': - 'NULL', - 'scopeLog': - 'panda', - 'transformation': - 'Sim_tf.py', - 'maxDiskCount': - 0, - 'coreCount': - ncores, - 'prodDBlockToken': - 'NULL', - 'transferType': - 'NULL', - 'destinationDblock': - job_name, - 'dispatchDBlockToken': - 'NULL', - 'jobPars': ( + "cmtConfig": "x86_64-slc6-gcc49-opt", + 
"prodDBlocks": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "dispatchDBlockTokenForOut": "NULL,NULL", + "destinationDBlockToken": "NULL,NULL", + "destinationSE": panda_queue_name, + "realDatasets": job_name, + "prodUserID": "no_one", + "GUID": guid, + "realDatasetsIn": "user.mlassnig:user.mlassnig.pilot.test.single.hits", + "nSent": 0, + "eventService": str(is_eventservice), + "cloud": "US", + "StatusCode": 0, + "homepackage": "AtlasOffline/21.0.15", + "inFiles": inFiles, + "processingType": "pilot-ptest", + "ddmEndPointOut": "UTA_SWT2_DATADISK,UTA_SWT2_DATADISK", + "fsize": "118612262", + "fileDestinationSE": f"{panda_queue_name},{panda_queue_name}", + "scopeOut": "panda", + "minRamCount": 0, + "jobDefinitionID": 7932, + "maxWalltime": "NULL", + "scopeLog": "panda", + "transformation": "Sim_tf.py", + "maxDiskCount": 0, + "coreCount": ncores, + "prodDBlockToken": "NULL", + "transferType": "NULL", + "destinationDblock": job_name, + "dispatchDBlockToken": "NULL", + "jobPars": ( '--eventService=%s --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' - 'import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();' + "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' - '--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so ' - '--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py ' - '--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 ' - '--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root)' - % (str(is_eventservice), inFiles, outFilesShort)), - 'attemptNr': - 0, - 'swRelease': - 'Atlas-21.0.15', - 'nucleus': - 'NULL', - 'maxCpuCount': - 0, - 'outFiles': - outFiles, - 'currentPriority': - 1000, - 'scopeIn': - scope, - 'PandaID': - pandaID, - 'sourceSite': - 'NULL', - 'dispatchDblock': - 'NULL', - 'prodSourceLabel': - 'ptest', - 'checksum': - 'ad:5d000974', - 'jobName': - job_name, - 'ddmEndPointIn': - 'UTA_SWT2_DATADISK', - 'taskID': - taskId, - 'logFile': - '%s.job.log.tgz' % job_name + "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " + "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py " + "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " + "--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root)" + % (str(is_eventservice), inFiles, outFilesShort) + ), + "attemptNr": 0, + "swRelease": "Atlas-21.0.15", + "nucleus": "NULL", + "maxCpuCount": 0, + "outFiles": outFiles, + "currentPriority": 1000, + "scopeIn": scope, + "PandaID": pandaID, + "sourceSite": "NULL", + "dispatchDblock": "NULL", + "prodSourceLabel": "ptest", + "checksum": "ad:5d000974", + "jobName": job_name, + "ddmEndPointIn": "UTA_SWT2_DATADISK", + "taskID": taskId, + "logFile": "%s.job.log.tgz" % job_name, } } diff --git a/tests/harvester/conftest.py b/tests/harvester/conftest.py index ea8f1a6..301f18d 100644 --- a/tests/harvester/conftest.py +++ b/tests/harvester/conftest.py @@ -38,14 +38,20 @@ def clean_files(files): @pytest.fixture -def harvester_file_communicator(tmpdir, config, request_queue, jobs_queue, - ranges_queue): - config.harvester['endpoint'] = str(tmpdir) - communicator = HarvesterFileCommunicator(request_queue, jobs_queue, - ranges_queue, config) +def 
harvester_file_communicator( + tmpdir, config, request_queue, jobs_queue, ranges_queue +): + config.harvester["endpoint"] = str(tmpdir) + communicator = HarvesterFileCommunicator( + request_queue, jobs_queue, ranges_queue, config + ) yield communicator communicator.stop() - clean_files([ - communicator.jobrequestfile, communicator.jobspecfile, - communicator.eventrequestfile, communicator.eventrangesfile - ]) + clean_files( + [ + communicator.jobrequestfile, + communicator.jobspecfile, + communicator.eventrequestfile, + communicator.eventrangesfile, + ] + ) diff --git a/tests/harvester/test_harvesterFileMessenger.py b/tests/harvester/test_harvesterFileMessenger.py index 515c9c3..85469b6 100644 --- a/tests/harvester/test_harvesterFileMessenger.py +++ b/tests/harvester/test_harvesterFileMessenger.py @@ -6,17 +6,16 @@ class TestHarvesterFileMessenger: - def check_job(self, jobs, sample_jobs): assert jobs is not None assert len(jobs) == len(sample_jobs) for sample_ID, jobID in zip(sample_jobs, jobs): assert sample_ID == jobID - def test_get_job(self, harvester_file_communicator, sample_job, - request_queue, jobs_queue): - - with open(harvester_file_communicator.jobspecfile, 'w') as f: + def test_get_job( + self, harvester_file_communicator, sample_job, request_queue, jobs_queue + ): + with open(harvester_file_communicator.jobspecfile, "w") as f: json.dump(sample_job, f) harvester_file_communicator.start() @@ -24,15 +23,16 @@ def test_get_job(self, harvester_file_communicator, sample_job, job_communicator = jobs_queue.get(timeout=5) self.check_job(job_communicator, sample_job) - def test_get_job_request(self, harvester_file_communicator, sample_job, - request_queue, jobs_queue): + def test_get_job_request( + self, harvester_file_communicator, sample_job, request_queue, jobs_queue + ): harvester_file_communicator.start() request_queue.put(PandaJobRequest()) while not os.path.exists(harvester_file_communicator.jobrequestfile): time.sleep(0.1) - with open(harvester_file_communicator.jobspecfile, 'w') as f: + with open(harvester_file_communicator.jobspecfile, "w") as f: json.dump(sample_job, f) jobs = jobs_queue.get(timeout=5) self.check_job(jobs, sample_job) @@ -55,15 +55,22 @@ def test_restart(self, harvester_file_communicator): assert harvester_file_communicator.communicator_thread.is_alive() assert harvester_file_communicator.communicator_thread == ref_thread - def test_get_event_ranges(self, config, harvester_file_communicator, - request_queue, ranges_queue, sample_job): + def test_get_event_ranges( + self, + config, + harvester_file_communicator, + request_queue, + ranges_queue, + sample_job, + ): harvester_file_communicator.start() n_events = 3 evnt_request = EventRangeRequest() for pandaID, job in sample_job.items(): - evnt_request.add_event_request(pandaID, n_events, job['taskID'], - job['jobsetID']) + evnt_request.add_event_request( + pandaID, n_events, job["taskID"], job["jobsetID"] + ) request_queue.put(evnt_request) while not os.path.isfile(harvester_file_communicator.eventrequestfile): @@ -72,27 +79,35 @@ def test_get_event_ranges(self, config, harvester_file_communicator, ranges_res = {} with open(harvester_file_communicator.eventrequestfile) as f: communicator_request = json.load(f) - for pandaIDSent, pandaIDCom in zip(evnt_request, - communicator_request): + for pandaIDSent, pandaIDCom in zip( + evnt_request, communicator_request + ): assert pandaIDSent == pandaIDCom - assert evnt_request[pandaIDSent][ - 'nRanges'] == communicator_request[pandaIDSent]['nRanges'] - 
ranges_res[pandaIDSent] = [{ - 'lastEvent': 0, - 'eventRangeID': "0", - 'startEvent': 0, - 'scope': "scope_value", - 'LFN': "/path/to/file", - 'GUID': "worker_id" - }] * n_events - with open(harvester_file_communicator.eventrangesfile, 'w') as f: + assert ( + evnt_request[pandaIDSent]["nRanges"] + == communicator_request[pandaIDSent]["nRanges"] + ) + ranges_res[pandaIDSent] = [ + { + "lastEvent": 0, + "eventRangeID": "0", + "startEvent": 0, + "scope": "scope_value", + "LFN": "/path/to/file", + "GUID": "worker_id", + } + ] * n_events + with open(harvester_file_communicator.eventrangesfile, "w") as f: json.dump(ranges_res, f) ranges_com = ranges_queue.get(timeout=5) for pandaIDSent, pandaIDCom in zip(ranges_res, ranges_com): assert pandaIDSent == pandaIDCom - assert len(ranges_res[pandaIDSent]) == len( - ranges_com[pandaIDSent]) == n_events + assert ( + len(ranges_res[pandaIDSent]) + == len(ranges_com[pandaIDSent]) + == n_events + ) assert not os.path.isfile(harvester_file_communicator.eventrequestfile) assert not os.path.isfile(harvester_file_communicator.eventrangesfile) @@ -103,10 +118,13 @@ def test_get_event_ranges(self, config, harvester_file_communicator, ranges_res = {} for pandaID in evnt_request: ranges_res[pandaID] = [] - with open(harvester_file_communicator.eventrangesfile, 'w') as f: + with open(harvester_file_communicator.eventrangesfile, "w") as f: json.dump(ranges_res, f) ranges_com = ranges_queue.get(timeout=5) for pandaIDSent, pandaIDCom in zip(ranges_res, ranges_com): assert pandaIDSent == pandaIDCom - assert len(ranges_res[pandaIDSent]) == len( - ranges_com[pandaIDSent]) == 0 + assert ( + len(ranges_res[pandaIDSent]) + == len(ranges_com[pandaIDSent]) + == 0 + ) diff --git a/tests/harvester/test_harvesterMock.py b/tests/harvester/test_harvesterMock.py index fce89a6..3acdd7b 100644 --- a/tests/harvester/test_harvesterMock.py +++ b/tests/harvester/test_harvesterMock.py @@ -2,15 +2,15 @@ class TestHarvesterMock: - def test_get_job(self, harvester_mock, request_queue, jobs_queue): harvester_mock.start() request_queue.put(PandaJobRequest()) job = jobs_queue.get(timeout=5) assert job is not None and isinstance(job, dict) - def test_get_ranges(self, harvester_mock, request_queue, jobs_queue, - ranges_queue): + def test_get_ranges( + self, harvester_mock, request_queue, jobs_queue, ranges_queue + ): harvester_mock.start() request_queue.put(PandaJobRequest()) jobs = jobs_queue.get(timeout=5) @@ -18,8 +18,9 @@ def test_get_ranges(self, harvester_mock, request_queue, jobs_queue, n_events = harvester_mock.nevents evnt_request = EventRangeRequest() for pandaID, job in jobs.items(): - evnt_request.add_event_request(pandaID, n_events, job['taskID'], - job['jobsetID']) + evnt_request.add_event_request( + pandaID, n_events, job["taskID"], job["jobsetID"] + ) request_queue.put(evnt_request) ranges = ranges_queue.get(timeout=5) assert ranges is not None diff --git a/tests/test_bookkeeper.py b/tests/test_bookkeeper.py index 3ad9cf8..f31b710 100644 --- a/tests/test_bookkeeper.py +++ b/tests/test_bookkeeper.py @@ -4,7 +4,6 @@ @pytest.mark.usefixtures("requires_ray") class TestBookKeeper: - def test_add_jobs(self, is_eventservice, config, sample_multijobs, njobs): bookKeeper = BookKeeper(config) bookKeeper.output_dir = "dummy" @@ -14,8 +13,15 @@ def test_add_jobs(self, is_eventservice, config, sample_multijobs, njobs): for pandaID in bookKeeper.jobs: assert pandaID in sample_multijobs - def test_assign_job_to_actor(elf, is_eventservice, config, sample_multijobs, - njobs, sample_ranges, 
nevents): + def test_assign_job_to_actor( + elf, + is_eventservice, + config, + sample_multijobs, + njobs, + sample_ranges, + nevents, + ): bookKeeper = BookKeeper(config) bookKeeper.output_dir = "dummy" bookKeeper.merged_files_dir = "dummy" @@ -26,7 +32,7 @@ def test_assign_job_to_actor(elf, is_eventservice, config, sample_multijobs, for i in range(njobs): job_tmp = bookKeeper.assign_job_to_actor(actor_id) if job: - assert job['PandaID'] != job_tmp['PandaID'] + assert job["PandaID"] != job_tmp["PandaID"] job = job_tmp assert not bookKeeper.has_jobs_ready() assert not bookKeeper.assign_job_to_actor(actor_id) @@ -36,14 +42,23 @@ def test_assign_job_to_actor(elf, is_eventservice, config, sample_multijobs, for i in range(njobs): job_tmp = bookKeeper.assign_job_to_actor(actor_id) if job: - assert job['PandaID'] == job_tmp['PandaID'] + assert job["PandaID"] == job_tmp["PandaID"] job = job_tmp bookKeeper.fetch_event_ranges(actor_id, nevents) - assert bookKeeper.assign_job_to_actor( - actor_id)['PandaID'] == job['PandaID'] - - def test_add_event_ranges(self, is_eventservice, config, sample_multijobs, - njobs, nevents, sample_ranges): + assert ( + bookKeeper.assign_job_to_actor(actor_id)["PandaID"] + == job["PandaID"] + ) + + def test_add_event_ranges( + self, + is_eventservice, + config, + sample_multijobs, + njobs, + nevents, + sample_ranges, + ): if not is_eventservice: pytest.skip() @@ -55,11 +70,20 @@ def test_add_event_ranges(self, is_eventservice, config, sample_multijobs, assert bookKeeper.has_jobs_ready() for pandaID in sample_multijobs: - print(bookKeeper.jobs[pandaID].event_ranges_queue.event_ranges_by_id) + print( + bookKeeper.jobs[pandaID].event_ranges_queue.event_ranges_by_id + ) assert bookKeeper.n_ready(pandaID) == nevents - def test_fetch_event_ranges(self, is_eventservice, config, sample_multijobs, - njobs, nevents, sample_ranges): + def test_fetch_event_ranges( + self, + is_eventservice, + config, + sample_multijobs, + njobs, + nevents, + sample_ranges, + ): if not is_eventservice: pytest.skip() worker_ids = [f"w_{i}" for i in range(10)] @@ -73,19 +97,27 @@ def test_fetch_event_ranges(self, is_eventservice, config, sample_multijobs, for wid in worker_ids: assert not bookKeeper.fetch_event_ranges(wid, 100) - assigned_workers = worker_ids[:int(len(worker_ids) / 2)] + assigned_workers = worker_ids[: int(len(worker_ids) / 2)] for wid in assigned_workers: job = bookKeeper.assign_job_to_actor(wid) - assert job['PandaID'] in sample_multijobs + assert job["PandaID"] in sample_multijobs ranges = bookKeeper.fetch_event_ranges( - wid, int(nevents / len(assigned_workers))) + wid, int(nevents / len(assigned_workers)) + ) assert ranges assert not bookKeeper.fetch_event_ranges(wid[0], 1) - def test_process_event_ranges_update(self, is_eventservice, config, - sample_multijobs, njobs, nevents, - sample_ranges, sample_rangeupdate, - sample_failed_rangeupdate): + def test_process_event_ranges_update( + self, + is_eventservice, + config, + sample_multijobs, + njobs, + nevents, + sample_ranges, + sample_rangeupdate, + sample_failed_rangeupdate, + ): if not is_eventservice: pytest.skip("No eventservice jobs") @@ -107,9 +139,10 @@ def __inner__(range_update, failed=False): assert job.event_ranges_queue.nranges_failed() == nevents else: assert job.event_ranges_queue.nranges_done() == nevents - assert not bookKeeper.is_flagged_no_more_events(job['PandaID']) + assert not bookKeeper.is_flagged_no_more_events(job["PandaID"]) assert bookKeeper.assign_job_to_actor(actor_id) + __inner__(sample_rangeupdate) 
__inner__(sample_failed_rangeupdate, True) @@ -119,33 +152,52 @@ def __inner__(range_update, failed=False): bookKeeper.add_jobs(sample_multijobs, False) for _ in range(njobs): job = bookKeeper.assign_job_to_actor(actor_id) - print(bookKeeper.jobs.get_event_ranges(job.get_id()).event_ranges_count) + print( + bookKeeper.jobs.get_event_ranges( + job.get_id() + ).event_ranges_count + ) ranges = bookKeeper.fetch_event_ranges(actor_id, nevents) assert len(ranges) == nevents assert bookKeeper.rangesID_by_actor[actor_id] - bookKeeper.process_event_ranges_update(actor_id, sample_failed_rangeupdate) + bookKeeper.process_event_ranges_update( + actor_id, sample_failed_rangeupdate + ) assert not bookKeeper.rangesID_by_actor[actor_id] assert job.event_ranges_queue.nranges_failed() == nevents assert not bookKeeper.rangesID_by_actor[actor_id] - n_success = len(sample_rangeupdate[0]['eventRanges']) // 2 - sample_rangeupdate[0]['eventRanges'] = sample_rangeupdate[0]['eventRanges'][:n_success] - bookKeeper.process_event_ranges_update(actor_id, sample_rangeupdate[0]['eventRanges']) + n_success = len(sample_rangeupdate[0]["eventRanges"]) // 2 + sample_rangeupdate[0]["eventRanges"] = sample_rangeupdate[0][ + "eventRanges" + ][:n_success] + bookKeeper.process_event_ranges_update( + actor_id, sample_rangeupdate[0]["eventRanges"] + ) assert not bookKeeper.rangesID_by_actor[actor_id] assert job.event_ranges_queue.nranges_done() == n_success events = bookKeeper.fetch_event_ranges(actor_id, nevents) assert not bookKeeper.rangesID_by_actor[actor_id] assert not events - assert job.event_ranges_queue.nranges_failed() == nevents - n_success + assert ( + job.event_ranges_queue.nranges_failed() == nevents - n_success + ) assert job.event_ranges_queue.nranges_done() == n_success print(job.event_ranges_queue.rangesID_by_state) print(bookKeeper.rangesID_by_actor) - assert not bookKeeper.is_flagged_no_more_events(job['PandaID']) + assert not bookKeeper.is_flagged_no_more_events(job["PandaID"]) assert bookKeeper.assign_job_to_actor(actor_id) - def test_process_actor_end(self, is_eventservice, config, njobs, - sample_multijobs, nevents, sample_ranges): + def test_process_actor_end( + self, + is_eventservice, + config, + njobs, + sample_multijobs, + nevents, + sample_ranges, + ): if not is_eventservice: pytest.skip("No eventservice jobs") @@ -158,7 +210,7 @@ def test_process_actor_end(self, is_eventservice, config, njobs, bookKeeper.add_jobs(sample_multijobs, False) job = bookKeeper.assign_job_to_actor(actor_id_1) - pandaID = job['PandaID'] + pandaID = job["PandaID"] assert bookKeeper.n_ready(pandaID) == nevents bookKeeper.process_actor_end(actor_id_1) @@ -166,15 +218,15 @@ def test_process_actor_end(self, is_eventservice, config, njobs, job = bookKeeper.assign_job_to_actor(actor_id_1) job_2 = bookKeeper.assign_job_to_actor(actor_id_2) - assert job_2['PandaID'] == job['PandaID'] == pandaID + assert job_2["PandaID"] == job["PandaID"] == pandaID ranges_1 = bookKeeper.fetch_event_ranges(actor_id_1, nevents) assert len(ranges_1) == nevents ranges_2 = bookKeeper.fetch_event_ranges(actor_id_2, nevents) assert len(ranges_2) == bookKeeper.n_ready(pandaID) == 0 - assert bookKeeper.assign_job_to_actor(actor_id_2)['PandaID'] == pandaID + assert bookKeeper.assign_job_to_actor(actor_id_2)["PandaID"] == pandaID bookKeeper.process_actor_end(actor_id_1) assert bookKeeper.n_ready(pandaID) == nevents - assert bookKeeper.assign_job_to_actor(actor_id_1)['PandaID'] == pandaID + assert bookKeeper.assign_job_to_actor(actor_id_1)["PandaID"] == pandaID 
diff --git a/tests/test_config.py b/tests/test_config.py index 9aa45f7..e87049d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,4 +1,3 @@ class TestConfig: - def test_config(self, config): pass diff --git a/tests/test_driver.py b/tests/test_driver.py index ce4efcc..3b7ef7e 100644 --- a/tests/test_driver.py +++ b/tests/test_driver.py @@ -1,5 +1,4 @@ class TestDriver: - def test_one(self, tmpdir): assert True diff --git a/tests/test_eventservice.py b/tests/test_eventservice.py index 39565f9..4ad8d89 100644 --- a/tests/test_eventservice.py +++ b/tests/test_eventservice.py @@ -12,59 +12,78 @@ class TestEventRangeRequest: - def test_from_dict_init(self): request_dict = { "0": { "nRanges": 10, "pandaID": "0", "taskID": "0", - "jobsetID": "0" + "jobsetID": "0", }, "1": { "nRanges": 20, "pandaID": "1", "taskID": "1", - "jobsetID": "1" - } + "jobsetID": "1", + }, } ranges_request = EventRangeRequest.build_from_dict(request_dict) ranges_request_init = EventRangeRequest() for pandaID, req in request_dict.items(): - ranges_request_init.add_event_request(pandaID, req['nRanges'], - req['taskID'], - req['jobsetID']) - assert len(request_dict) == len(ranges_request) == len( - ranges_request_init) - for id1, id2, id3 in zip(ranges_request, ranges_request_init, - request_dict): - assert ranges_request[id1]['pandaID'] == ranges_request_init[id2][ - 'pandaID'] == request_dict[id3]['pandaID'] + ranges_request_init.add_event_request( + pandaID, req["nRanges"], req["taskID"], req["jobsetID"] + ) + assert ( + len(request_dict) == len(ranges_request) == len(ranges_request_init) + ) + for id1, id2, id3 in zip( + ranges_request, ranges_request_init, request_dict + ): + assert ( + ranges_request[id1]["pandaID"] + == ranges_request_init[id2]["pandaID"] + == request_dict[id3]["pandaID"] + ) class TestEventRangeUpdate: - - def test_build_range_update(self, nevents, sample_rangeupdate, - sample_failed_rangeupdate): + def test_build_range_update( + self, nevents, sample_rangeupdate, sample_failed_rangeupdate + ): pandaID = "0" ranges_update = EventRangeUpdate.build_from_dict( - pandaID, sample_rangeupdate) + pandaID, sample_rangeupdate + ) assert pandaID in ranges_update ranges = ranges_update[pandaID] assert len(ranges) == nevents assert len(ranges_update) == 1 for r in ranges: - assert "eventRangeID" in r and "eventStatus" in r and "path" in r and "type" in r and "chksum" in r and "fsize" in r + assert ( + "eventRangeID" in r + and "eventStatus" in r + and "path" in r + and "type" in r + and "chksum" in r + and "fsize" in r + ) ranges_update = EventRangeUpdate.build_from_dict( - pandaID, sample_failed_rangeupdate) + pandaID, sample_failed_rangeupdate + ) assert pandaID in ranges_update ranges = ranges_update[pandaID] assert len(ranges) == nevents for r in ranges: - assert "eventRangeID" in r and "eventStatus" in r and \ - "path" not in r and "type" not in r and "chksum" not in r and "fsize" not in r + assert ( + "eventRangeID" in r + and "eventStatus" in r + and "path" not in r + and "type" not in r + and "chksum" not in r + and "fsize" not in r + ) with pytest.raises(Exception): ranges_update.range_update[pandaID] = None @@ -77,17 +96,24 @@ def test_build_range_update(self, nevents, sample_rangeupdate, class TestEventRangeQueue: - def test_new(self, nevents, sample_job, sample_ranges): ranges_queue = EventRangeQueue() assert len(ranges_queue) == 0 ranges = list(sample_ranges.values())[0] ranges_queue = EventRangeQueue.build_from_list(ranges) - assert len(ranges) == len( - ranges_queue) == 
ranges_queue.nranges_available() ==\ - ranges_queue.nranges_remaining() == nevents - assert ranges_queue.nranges_assigned() == ranges_queue.nranges_done() ==\ - ranges_queue.nranges_failed() == 0 + assert ( + len(ranges) + == len(ranges_queue) + == ranges_queue.nranges_available() + == ranges_queue.nranges_remaining() + == nevents + ) + assert ( + ranges_queue.nranges_assigned() + == ranges_queue.nranges_done() + == ranges_queue.nranges_failed() + == 0 + ) with pytest.raises(Exception): ranges_queue["key"] = None @@ -101,28 +127,45 @@ def test_concat(self, nevents, sample_job, sample_ranges): ranges_queue = EventRangeQueue() ranges = list(sample_ranges.values())[0] ranges_queue.concat(ranges) - assert len(ranges) == len(ranges_queue) ==\ - ranges_queue.nranges_available() ==\ - ranges_queue.nranges_remaining() == nevents - assert ranges_queue.nranges_assigned() ==\ - ranges_queue.nranges_done() ==\ - ranges_queue.nranges_failed() == 0 - assert ranges_queue[ - ranges[0]['eventRangeID']].eventRangeID == ranges[0]['eventRangeID'] + assert ( + len(ranges) + == len(ranges_queue) + == ranges_queue.nranges_available() + == ranges_queue.nranges_remaining() + == nevents + ) + assert ( + ranges_queue.nranges_assigned() + == ranges_queue.nranges_done() + == ranges_queue.nranges_failed() + == 0 + ) + assert ( + ranges_queue[ranges[0]["eventRangeID"]].eventRangeID + == ranges[0]["eventRangeID"] + ) for r in ranges: - assert r['eventRangeID'] in ranges_queue - - def test_update(self, sample_job, sample_ranges, nevents, - sample_rangeupdate, sample_failed_rangeupdate): + assert r["eventRangeID"] in ranges_queue + + def test_update( + self, + sample_job, + sample_ranges, + nevents, + sample_rangeupdate, + sample_failed_rangeupdate, + ): pandaID = "0" ranges = list(sample_ranges.values())[0] ranges_queue = EventRangeQueue.build_from_list(ranges) nsuccess = int(nevents / 2) ranges_update = EventRangeUpdate.build_from_dict( - pandaID, sample_rangeupdate)[pandaID][:nsuccess] + pandaID, sample_rangeupdate + )[pandaID][:nsuccess] failed_ranges_update = EventRangeUpdate.build_from_dict( - pandaID, sample_failed_rangeupdate)[pandaID][nsuccess:] + pandaID, sample_failed_rangeupdate + )[pandaID][nsuccess:] ranges_queue.get_next_ranges(nevents) ranges_queue.update_ranges(ranges_update) @@ -152,19 +195,23 @@ def test_get_next(self, sample_job, sample_ranges): assert ranges_queue.nranges_remaining() == nranges assert ranges_queue.nranges_available() == nranges - nranges_requested for requested_range in requested_ranges: - assert ranges_queue[ - requested_range.eventRangeID].status == EventRange.ASSIGNED + assert ( + ranges_queue[requested_range.eventRangeID].status + == EventRange.ASSIGNED + ) requested_ranges = ranges_queue.get_next_ranges(nranges) assert len(requested_ranges) == nranges - nranges_requested assert ranges_queue.nranges_available() == 0 - assert ranges_queue.nranges_assigned( - ) == ranges_queue.nranges_remaining() == nranges + assert ( + ranges_queue.nranges_assigned() + == ranges_queue.nranges_remaining() + == nranges + ) assert len(ranges_queue.get_next_ranges(1)) == 0 class TestEventRanges: - def test_new(self): id = "Range-0" start = 0 @@ -190,22 +237,29 @@ def test_build_from_dict(self): "lastEvent": last, "startEvent": start, "GUID": guid, - "scope": scope + "scope": scope, } range_from_dict = EventRange.build_from_dict(r_dict) - assert pfn == range_from_dict.PFN and range_from_dict.eventRangeID == id and range_from_dict.startEvent == start \ - and range_from_dict.lastEvent == last and guid == 
range_from_dict.GUID and range_from_dict.scope == scope + assert ( + pfn == range_from_dict.PFN + and range_from_dict.eventRangeID == id + and range_from_dict.startEvent == start + and range_from_dict.lastEvent == last + and guid == range_from_dict.GUID + and range_from_dict.scope == scope + ) assert range_from_dict.status == EventRange.READY class TestPandaJobQueue: - - def test_build_pandajob_queue(self, is_eventservice, njobs, - sample_multijobs): + def test_build_pandajob_queue( + self, is_eventservice, njobs, sample_multijobs + ): assert len(sample_multijobs) == njobs pandajob_queue = PandaJobQueue() pandajob_queue_fromdict = PandaJobQueue.build_from_dict( - sample_multijobs) + sample_multijobs + ) assert len(pandajob_queue) == 0 assert not pandajob_queue.next_job_to_process() @@ -217,7 +271,7 @@ def test_build_pandajob_queue(self, is_eventservice, njobs, else: for i in range(1, njobs): next_job = pandajob_queue.next_job_to_process() - assert job['PandaID'] != next_job['PandaID'] + assert job["PandaID"] != next_job["PandaID"] job = next_job for pandaID in pandajob_queue: @@ -234,9 +288,9 @@ def test_build_pandajob_queue(self, is_eventservice, njobs, pandajob_queue_2["key"] = job assert "key" in pandajob_queue_2 - def test_pandajob_process_event_ranges_reply(self, is_eventservice, njobs, - sample_multijobs, - sample_ranges): + def test_pandajob_process_event_ranges_reply( + self, is_eventservice, njobs, sample_multijobs, sample_ranges + ): if not is_eventservice: pytest.skip("Not eventservice jobs") pandajob_queue = PandaJobQueue(sample_multijobs) @@ -244,7 +298,7 @@ def test_pandajob_process_event_ranges_reply(self, is_eventservice, njobs, pandajob_queue.process_event_ranges_reply(sample_ranges) job = pandajob_queue.next_job_to_process() - assert job['PandaID'] in sample_ranges + assert job["PandaID"] in sample_ranges for pandaID in pandajob_queue: ranges = pandajob_queue.get_event_ranges(pandaID) @@ -261,9 +315,15 @@ def test_pandajob_process_event_ranges_reply(self, is_eventservice, njobs, pandajob_queue.process_event_ranges_reply(sample_ranges) assert "key" not in pandajob_queue - def test_process_event_ranges_update(self, is_eventservice, njobs, nevents, - sample_multijobs, sample_ranges, - sample_rangeupdate): + def test_process_event_ranges_update( + self, + is_eventservice, + njobs, + nevents, + sample_multijobs, + sample_ranges, + sample_rangeupdate, + ): if not is_eventservice: pytest.skip("Not eventservice jobs") pandajob_queue = PandaJobQueue(sample_multijobs) @@ -273,21 +333,24 @@ def test_process_event_ranges_update(self, is_eventservice, njobs, nevents, job = pandajob_queue.next_job_to_process() assert job == pandajob_queue.next_job_to_process() ranges_update = EventRangeUpdate.build_from_dict( - job['PandaID'], sample_rangeupdate) + job["PandaID"], sample_rangeupdate + ) - ranges_queue = pandajob_queue.get_event_ranges(job['PandaID']) + ranges_queue = pandajob_queue.get_event_ranges(job["PandaID"]) _ = job.get_next_ranges(nevents) pandajob_queue.process_event_ranges_update(ranges_update) assert not job.no_more_ranges assert ranges_queue.nranges_done() == nevents - assert ranges_queue.nranges_remaining( - ) == ranges_queue.nranges_available() == 0 + assert ( + ranges_queue.nranges_remaining() + == ranges_queue.nranges_available() + == 0 + ) job_2 = pandajob_queue.next_job_to_process() - assert job['PandaID'] == job_2['PandaID'] + assert job["PandaID"] == job_2["PandaID"] class TestPandaJob: - def test_build_pandajob(self, sample_job): job_dict = 
list(sample_job.values())[0] job = PandaJob(job_dict) @@ -300,7 +363,6 @@ def test_build_pandajob(self, sample_job): class TestPandaJobRequest: - def test_build_pandajob_request(self): request_dict = { "node": "nodename", @@ -312,31 +374,31 @@ def test_build_pandajob_request(self): "resource_type": "rt", "mem": 230000, "cpu": 32, - "allow_other_country": "false" + "allow_other_country": "false", } jobrequest = PandaJobRequest(**request_dict) - assert jobrequest.diskSpace == request_dict['disk_space'] - assert jobrequest.mem == request_dict['mem'] - assert jobrequest.allowOtherCountry == request_dict[ - 'allow_other_country'] + assert jobrequest.diskSpace == request_dict["disk_space"] + assert jobrequest.mem == request_dict["mem"] + assert ( + jobrequest.allowOtherCountry == request_dict["allow_other_country"] + ) class TestPandaJobUpdate: - def test_build_pandajob_update(self): update_dict = { - 'node': ['nid00038'], - 'startTime': ['1574112042.86'], - 'jobMetrics': ['coreCount=32'], - 'siteName': ['NERSC_Cori_p2_ES'], - 'timestamp': ['2019-11-18T13:20:45-08:00'], - 'coreCount': ['32'], - 'attemptNr': ['0'], - 'jobId': ['7a75654803d17d54f9129e2a6974beda'], - 'batchID': ['25932742'], - 'state': ['starting'], - 'schedulerID': ['unknown'], - 'pilotID': ['unknown|SLURM|PR|2.2.2 (1)'] + "node": ["nid00038"], + "startTime": ["1574112042.86"], + "jobMetrics": ["coreCount=32"], + "siteName": ["NERSC_Cori_p2_ES"], + "timestamp": ["2019-11-18T13:20:45-08:00"], + "coreCount": ["32"], + "attemptNr": ["0"], + "jobId": ["7a75654803d17d54f9129e2a6974beda"], + "batchID": ["25932742"], + "state": ["starting"], + "schedulerID": ["unknown"], + "pilotID": ["unknown|SLURM|PR|2.2.2 (1)"], } jobupdate = PandaJobUpdate(**update_dict) for k in update_dict: diff --git a/tests/test_importutils.py b/tests/test_importutils.py index 7318d88..85131ce 100644 --- a/tests/test_importutils.py +++ b/tests/test_importutils.py @@ -4,8 +4,10 @@ def test_importutils(): errors_string = [ - "unknown", "unknown:unknown", "unknown:", - "raythena.drivers.esdriver:ESDriver.unknown" + "unknown", + "unknown:unknown", + "unknown:", + "raythena.drivers.esdriver:ESDriver.unknown", ] for s in errors_string: with pytest.raises(ImportError): @@ -14,4 +16,5 @@ def test_importutils(): with pytest.raises(ValueError): import_from_string(":unknown") from raythena.drivers.esdriver import ESDriver + assert import_from_string("raythena.drivers.esdriver:ESDriver") == ESDriver diff --git a/tests/test_pilothttp.py b/tests/test_pilothttp.py index d91c2e3..f2c5316 100644 --- a/tests/test_pilothttp.py +++ b/tests/test_pilothttp.py @@ -8,7 +8,6 @@ class MockPopen: - def __init__(self, returncode): self.returncode = returncode @@ -25,18 +24,16 @@ def terminate(self): class MockPayload(PilotHttpPayload): - def _start_payload(self): self.pilot_process = MockPopen(None) @pytest.mark.usefixtures("requires_ray") class TestPilotHttp: - def wait_server_start(self): while True: try: - requests.post('http://127.0.0.1:8080') + requests.post("http://127.0.0.1:8080") except requests.exceptions.ConnectionError: time.sleep(0.5) else: @@ -48,7 +45,7 @@ def setup_payload(self, config): @pytest.fixture def payload(self, tmpdir, config, sample_job): cwd = os.getcwd() - config.ray['workdir'] = str(tmpdir) + config.ray["workdir"] = str(tmpdir) os.chdir(tmpdir) job_dict = list(sample_job.values())[0] job = PandaJob(job_dict) @@ -64,18 +61,25 @@ def test_getjob(self, payload, is_eventservice, config, sample_job): pytest.skip() job_dict = list(sample_job.values())[0] job = 
PandaJob(job_dict) - res = requests.post('http://127.0.0.1:8080/server/panda/getJob').json() - assert job['PandaID'] == PandaJob(res)['PandaID'] + res = requests.post("http://127.0.0.1:8080/server/panda/getJob").json() + assert job["PandaID"] == PandaJob(res)["PandaID"] - assert requests.post( - 'http://127.0.0.1:8080/unknown').json()['StatusCode'] == 500 + assert ( + requests.post("http://127.0.0.1:8080/unknown").json()["StatusCode"] + == 500 + ) payload.stop() assert payload.is_complete() assert payload.return_code() == payload.pilot_process.returncode def endpoint_not_implemented(self, endpoint): - assert requests.post(f'http://127.0.0.1:8080/server/panda/{endpoint}').json()['StatusCode'] == 500 + assert ( + requests.post( + f"http://127.0.0.1:8080/server/panda/{endpoint}" + ).json()["StatusCode"] + == 500 + ) @pytest.mark.usefixtures("payload") def test_updateJobsInBulk(self): @@ -94,28 +98,43 @@ def test_jobUpdate(self, payload, config, is_eventservice): pytest.skip() assert not payload.fetch_job_update() - data = {"pilotErrorCode": '0'} - res = requests.post('http://127.0.0.1:8080/server/panda/updateJob', - data=data).json() - assert res['StatusCode'] == 0 + data = {"pilotErrorCode": "0"} + res = requests.post( + "http://127.0.0.1:8080/server/panda/updateJob", data=data + ).json() + assert res["StatusCode"] == 0 # Disabled as job update are currently not forwarded to the driver # job_update = payload.fetch_job_update() # assert job_update['pilotErrorCode'][0] == data['pilotErrorCode'] - def test_rangesUpdate(self, payload, config, is_eventservice, sample_job, - sample_ranges, nevents): + def test_rangesUpdate( + self, + payload, + config, + is_eventservice, + sample_job, + sample_ranges, + nevents, + ): if not is_eventservice: pytest.skip() assert not payload.fetch_ranges_update() data = {"pilotErrorCode": 0} res = requests.post( - 'http://127.0.0.1:8080/server/panda/updateEventRanges', - data=data).json() - assert res['StatusCode'] == 0 - - def test_getranges(self, payload, config, is_eventservice, sample_job, - sample_ranges, nevents): + "http://127.0.0.1:8080/server/panda/updateEventRanges", data=data + ).json() + assert res["StatusCode"] == 0 + + def test_getranges( + self, + payload, + config, + is_eventservice, + sample_job, + sample_ranges, + nevents, + ): if not is_eventservice: pytest.skip() @@ -126,11 +145,12 @@ def test_getranges(self, payload, config, is_eventservice, sample_job, "pandaID": job["PandaID"], "nRanges": nevents, "jobsetID": job["jobsetID"], - "taskID": job["taskID"] + "taskID": job["taskID"], } res = requests.post( - 'http://127.0.0.1:8080/server/panda/getEventRanges').json() - assert res['StatusCode'] == 500 + "http://127.0.0.1:8080/server/panda/getEventRanges" + ).json() + assert res["StatusCode"] == 500 assert payload.should_request_more_ranges() ranges = list() for r in list(sample_ranges.values())[0]: @@ -138,17 +158,22 @@ def test_getranges(self, payload, config, is_eventservice, sample_job, payload.submit_new_ranges(ranges) payload.submit_new_ranges(None) - res = requests.post('http://127.0.0.1:8080/server/panda/getEventRanges', - data=data).json() - assert res['StatusCode'] == 0 - assert len(res['eventRanges']) == nevents + res = requests.post( + "http://127.0.0.1:8080/server/panda/getEventRanges", data=data + ).json() + assert res["StatusCode"] == 0 + assert len(res["eventRanges"]) == nevents - res = requests.post('http://127.0.0.1:8080/server/panda/getEventRanges', - data=data).json() - assert res['StatusCode'] == 0 - assert len(res['eventRanges']) 
== 0 + res = requests.post( + "http://127.0.0.1:8080/server/panda/getEventRanges", data=data + ).json() + assert res["StatusCode"] == 0 + assert len(res["eventRanges"]) == 0 assert not payload.should_request_more_ranges() data["pandaID"] = "None" - assert requests.post( - 'http://127.0.0.1:8080/server/panda/getEventRanges', - data=data).json()['StatusCode'] == -1 + assert ( + requests.post( + "http://127.0.0.1:8080/server/panda/getEventRanges", data=data + ).json()["StatusCode"] + == -1 + ) diff --git a/tests/test_ray_utils.py b/tests/test_ray_utils.py index c9de77a..34e1f1b 100644 --- a/tests/test_ray_utils.py +++ b/tests/test_ray_utils.py @@ -10,11 +10,9 @@ @pytest.mark.usefixtures("requires_ray") class TestRayUtils: - def test_build_nodes_resource_list(self, config): constraints = build_nodes_resource_list(config) - assert len( - constraints) == cluster_size() + assert len(constraints) == cluster_size() def test_cluster_size(self): assert cluster_size() > 0 diff --git a/tests/test_taskstatus.py b/tests/test_taskstatus.py index 0d1bd53..9196c41 100644 --- a/tests/test_taskstatus.py +++ b/tests/test_taskstatus.py @@ -3,14 +3,15 @@ class TestTaskStatus: - - def test_save_restore_status(self, nfiles, tmp_path, config, sample_job, sample_ranges): + def test_save_restore_status( + self, nfiles, tmp_path, config, sample_job, sample_ranges + ): config.ray["outputdir"] = tmp_path job = PandaJob(list(sample_job.values())[0]) ts = TaskStatus(job, tmp_path, config) ranges = list(sample_ranges.values())[0] - hits_per_file = int(job['esmergeSpec']['nEventsPerOutputFile']) - events_per_file = int(job['nEventsPerInputFile']) + hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) + events_per_file = int(job["nEventsPerInputFile"]) assert events_per_file % hits_per_file == 0 n_output_per_input_file = events_per_file // hits_per_file offset = nfiles @@ -18,14 +19,18 @@ def test_save_restore_status(self, nfiles, tmp_path, config, sample_job, sample_ for i in range(0, n_output_per_input_file): ranges_list = [] for j in range(hits_per_file): - ranges_list.append(ranges[file_no + (i * offset) + (i + j) * offset]) + ranges_list.append( + ranges[file_no + (i * offset) + (i + j) * offset] + ) ranges_map = {} arbitrary_range = EventRange.build_from_dict(ranges_list[0]) fname = arbitrary_range.PFN outputfile = f"{fname}-MERGED-{arbitrary_range.eventRangeID}" for r in ranges_list: event_range = EventRange.build_from_dict(r) - ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict(event_range, fname) + ranges_map[event_range.eventRangeID] = ( + TaskStatus.build_eventrange_dict(event_range, fname) + ) ts.set_eventrange_simulated(event_range, "outputfile") ts.set_file_merged([fname], outputfile, ranges_map, "guid") @@ -34,14 +39,18 @@ def test_save_restore_status(self, nfiles, tmp_path, config, sample_job, sample_ print(ts._status) assert ts._status == ts2._status - def test_set_processed(self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges): + def test_set_processed( + self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges + ): config.ray["outputdir"] = tmp_path job = PandaJob(list(sample_job.values())[0]) ts = TaskStatus(job, tmp_path, config) ranges_list = list(sample_ranges.values())[0] for r in ranges_list: - ts.set_eventrange_simulated(EventRange.build_from_dict(r), "outputfile") + ts.set_eventrange_simulated( + EventRange.build_from_dict(r), "outputfile" + ) # need to save as set_event_range_simulated is lazy ts.save_status() @@ -49,7 +58,9 @@ def 
test_set_processed(self, nfiles, nevents, tmp_path, config, sample_job, samp assert ts.get_nsimulated() == nevents assert ts.get_nmerged() == 0 - def test_set_failed(self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges): + def test_set_failed( + self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges + ): config.ray["outputdir"] = tmp_path job = PandaJob(list(sample_job.values())[0]) ts = TaskStatus(job, tmp_path, config) @@ -64,7 +75,9 @@ def test_set_failed(self, nfiles, nevents, tmp_path, config, sample_job, sample_ assert ts.get_nfailed() == nevents assert ts.get_nmerged() == 0 - def test_set_merged(self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges): + def test_set_merged( + self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges + ): config.ray["outputdir"] = tmp_path job = PandaJob(list(sample_job.values())[0]) ts = TaskStatus(job, tmp_path, config) @@ -74,8 +87,8 @@ def test_set_merged(self, nfiles, nevents, tmp_path, config, sample_job, sample_ er = EventRange.build_from_dict(e) ts.set_eventrange_simulated(er, f"outputfile-{er.eventRangeID}") - hits_per_file = int(job['esmergeSpec']['nEventsPerOutputFile']) - events_per_file = int(job['nEventsPerInputFile']) + hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) + events_per_file = int(job["nEventsPerInputFile"]) assert events_per_file % hits_per_file == 0 n_output_per_input_file = events_per_file // hits_per_file offset = nfiles @@ -83,14 +96,20 @@ def test_set_merged(self, nfiles, nevents, tmp_path, config, sample_job, sample_ for i in range(0, n_output_per_input_file): ranges_list = [] for j in range(hits_per_file): - ranges_list.append(ranges[file_no + (i * offset) + (i + j) * offset]) + ranges_list.append( + ranges[file_no + (i * offset) + (i + j) * offset] + ) arbitrary_range = EventRange.build_from_dict(ranges_list[0]) fname = arbitrary_range.PFN outputfile = f"{fname}-MERGED-{arbitrary_range.eventRangeID}" ranges_map = {} for r in ranges_list: event_range = EventRange.build_from_dict(r) - ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict(event_range, f"outputfile-{event_range.eventRangeID}") + ranges_map[event_range.eventRangeID] = ( + TaskStatus.build_eventrange_dict( + event_range, f"outputfile-{event_range.eventRangeID}" + ) + ) ts.set_file_merged([fname], outputfile, ranges_map, "guid") ts.save_status() @@ -103,14 +122,20 @@ def test_set_merged(self, nfiles, nevents, tmp_path, config, sample_job, sample_ for i in range(0, n_output_per_input_file): ranges_list = [] for j in range(hits_per_file): - ranges_list.append(ranges[file_no + (i * offset) + (i + j) * offset]) + ranges_list.append( + ranges[file_no + (i * offset) + (i + j) * offset] + ) arbitrary_range = EventRange.build_from_dict(ranges_list[0]) fname = arbitrary_range.PFN outputfile = f"{fname}-MERGED-{arbitrary_range.eventRangeID}" ranges_map = {} for r in ranges_list: event_range = EventRange.build_from_dict(r) - ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict(event_range, f"outputfile-{event_range.eventRangeID}") + ranges_map[event_range.eventRangeID] = ( + TaskStatus.build_eventrange_dict( + event_range, f"outputfile-{event_range.eventRangeID}" + ) + ) ts.set_file_merged([fname], outputfile, ranges_map, "guid") ts.save_status() print(ts._status) From 75f3c8eac38d63d977c178f9ed6802a593d93ebf Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 13:55:34 -0700 Subject: [PATCH 03/14] fix typing --- src/raythena/actors/esworker.py | 8 +-- 
src/raythena/actors/payloads/basePayload.py | 4 +- .../actors/payloads/eventservice/esPayload.py | 6 +- .../actors/payloads/eventservice/pilothttp.py | 6 +- src/raythena/drivers/esdriver.py | 24 ++++---- src/raythena/scripts/raythena.py | 4 +- src/raythena/utils/bookkeeper.py | 52 ++++++++--------- src/raythena/utils/eventservice.py | 56 +++++++++---------- src/raythena/utils/ray.py | 8 +-- src/raythena/utils/timing.py | 4 +- 10 files changed, 85 insertions(+), 87 deletions(-) diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py index aa7bce1..ef3b70a 100644 --- a/src/raythena/actors/esworker.py +++ b/src/raythena/actors/esworker.py @@ -8,7 +8,7 @@ from collections.abc import Mapping, Sequence from socket import gethostname from time import sleep -from typing import Any, Optional, Tuple, Union +from typing import Any, Optional, Union, tuple import ray @@ -39,7 +39,7 @@ from raythena.utils.ray import get_node_ip # Type returned by the worker methods to the driver -WorkerResponse = Tuple[str, int, Any] +WorkerResponse = tuple[str, int, Any] @ray.remote(num_cpus=1, max_restarts=1, max_task_retries=3) @@ -497,7 +497,7 @@ def return_message(self, message: int, data: Any = None) -> WorkerResponse: data: extra data attached to the message type Returns: - Tuple of (id, message, data) + tuple of (id, message, data) """ return self.id, message, data @@ -622,7 +622,7 @@ def get_message(self) -> WorkerResponse: Returns: - Tuple depending on the current worker state, informing the driver about what information should be sent + tuple depending on the current worker state, informing the driver about what information should be sent to the worker or if the worker produced output data. """ try: diff --git a/src/raythena/actors/payloads/basePayload.py b/src/raythena/actors/payloads/basePayload.py index f527e08..0a5ba17 100644 --- a/src/raythena/actors/payloads/basePayload.py +++ b/src/raythena/actors/payloads/basePayload.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, Optional +from typing import Any, Optional, dict from raythena.utils.config import Config from raythena.utils.eventservice import PandaJob @@ -88,7 +88,7 @@ def return_code(self) -> int: raise NotImplementedError("Base method not implemented") @abstractmethod - def fetch_job_update(self) -> Optional[Dict[str, Any]]: + def fetch_job_update(self) -> Optional[dict[str, Any]]: """ Tries to get a job update from the payload diff --git a/src/raythena/actors/payloads/eventservice/esPayload.py b/src/raythena/actors/payloads/eventservice/esPayload.py index 0725832..041c7e4 100644 --- a/src/raythena/actors/payloads/eventservice/esPayload.py +++ b/src/raythena/actors/payloads/eventservice/esPayload.py @@ -1,6 +1,6 @@ from abc import abstractmethod from collections.abc import Sequence -from typing import Dict, Optional +from typing import Optional, dict from raythena.actors.payloads.basePayload import BasePayload from raythena.utils.config import Config @@ -35,12 +35,12 @@ def submit_new_ranges( raise NotImplementedError("Base method not implemented") @abstractmethod - def fetch_ranges_update(self) -> Optional[Dict[str, str]]: + def fetch_ranges_update(self) -> Optional[dict[str, str]]: """ Checks if event ranges update are available Returns: - Dict holding event range update of processed events, None if no update is available + dict holding event range update of processed events, None if no update is available """ raise NotImplementedError("Base method not implemented") diff --git 
a/src/raythena/actors/payloads/eventservice/pilothttp.py b/src/raythena/actors/payloads/eventservice/pilothttp.py index 6f143e0..0eebc09 100644 --- a/src/raythena/actors/payloads/eventservice/pilothttp.py +++ b/src/raythena/actors/payloads/eventservice/pilothttp.py @@ -7,7 +7,7 @@ from asyncio import Event, Queue, QueueEmpty from collections.abc import Iterable, Mapping from subprocess import DEVNULL, Popen -from typing import Callable, Dict, List, Optional +from typing import Callable, Optional, dict, list from urllib.parse import parse_qs import uvloop @@ -367,7 +367,7 @@ def fetch_ranges_update(self) -> Optional[Mapping[str, str]]: Checks if event ranges update are available by polling the event ranges update queue Returns: - Dict holding event range update of processed events, None if no update is available + dict holding event range update of processed events, None if no update is available """ try: res = self.ranges_update.get_nowait() @@ -410,7 +410,7 @@ async def http_handler(self, request: web.BaseRequest) -> web.Response: ) @staticmethod - async def parse_qs_body(request: web.BaseRequest) -> Dict[str, List[str]]: + async def parse_qs_body(request: web.BaseRequest) -> dict[str, list[str]]: """ Parses the query-string request body to a dictionary diff --git a/src/raythena/drivers/esdriver.py b/src/raythena/drivers/esdriver.py index a3d9603..e3414ff 100644 --- a/src/raythena/drivers/esdriver.py +++ b/src/raythena/drivers/esdriver.py @@ -14,10 +14,10 @@ from subprocess import DEVNULL, Popen from typing import ( Any, - Dict, - List, Optional, - Tuple, + dict, + list, + tuple, ) import ray @@ -122,8 +122,8 @@ def __init__(self, config: Config, session_dir: str) -> None: ) self.communicator.start() self.requests_queue.put(PandaJobRequest()) - self.actors: Dict[str, ESWorker] = dict() - self.pending_objectref_to_actor: Dict[ObjectRef, str] = dict() + self.actors: dict[str, ESWorker] = dict() + self.pending_objectref_to_actor: dict[ObjectRef, str] = dict() self.actors_message_queue = list() self.bookKeeper = BookKeeper(self.config) self.terminated = list() @@ -190,8 +190,8 @@ def __init__(self, config: Config, session_dir: str) -> None: ) # {input_filename, {merged_output_filename, ([(event_range_id, EventRange)], subprocess handle)}} - self.running_merge_transforms: Dict[ - str, Tuple[List[Tuple[str, EventRange]], Popen, str] + self.running_merge_transforms: dict[ + str, tuple[list[tuple[str, EventRange]], Popen, str] ] = dict() self.total_running_merge_transforms = 0 self.failed_actor_tasks_count = dict() @@ -344,7 +344,7 @@ def handle_actors(self) -> None: "Finished handling the Actors. Raythena will shutdown now." ) - def wait_on_messages(self) -> Tuple[List[ObjectRef], List[ObjectRef]]: + def wait_on_messages(self) -> tuple[list[ObjectRef], list[ObjectRef]]: """ Wait on part of the pending futures to complete. Wait for 1 second trying to fetch half of the pending futures. If no futures are ready, then wait another second to fetch a tenth of the pending futures. @@ -352,7 +352,7 @@ def wait_on_messages(self) -> Tuple[List[ObjectRef], List[ObjectRef]]: events have finished processing yet, then wait forever until one future is ready instead of only timeout interval. 
Returns: - Tuple of a list of completed futures and a list of pending futures, respectively + tuple of a list of completed futures and a list of pending futures, respectively """ if self.bookKeeper.have_finished_events(): timeoutinterval = self.timeoutinterval @@ -749,7 +749,7 @@ def run(self) -> None: self.bookKeeper.print_status() self._logger.debug("All driver threads stopped. Quitting...") - def rename_output_files(self, output_map: Dict[str, str]): + def rename_output_files(self, output_map: dict[str, str]): """ Rename final output files """ @@ -771,7 +771,7 @@ def rename_output_files(self, output_map: Dict[str, str]): os.path.join(self.merged_files_dir, new_filename), ) - def produce_final_report(self, output_map: Dict[str, str]): + def produce_final_report(self, output_map: dict[str, str]): """ Merge job reports from individual merge transforms to produce the final jobReport for Panda. """ @@ -977,7 +977,7 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: def hits_merge_transform( self, input_files: Iterable[str], output_file: str - ) -> Tuple[Popen, str]: + ) -> tuple[Popen, str]: """ Prepare the shell command for the merging subprocess and starts it. diff --git a/src/raythena/scripts/raythena.py b/src/raythena/scripts/raythena.py index 229072e..f4dd1a9 100755 --- a/src/raythena/scripts/raythena.py +++ b/src/raythena/scripts/raythena.py @@ -34,8 +34,8 @@ ) def cli(*args, **kwargs): """ - Starts the application by initializing the config object, connecting or starting the ray cluster, loading the driver - and starting it. + Starts the application by initializing the config object, + connecting or starting the ray cluster, loading the driverand starting it. Returns: None diff --git a/src/raythena/utils/bookkeeper.py b/src/raythena/utils/bookkeeper.py index d4fcefb..8eef9a8 100644 --- a/src/raythena/utils/bookkeeper.py +++ b/src/raythena/utils/bookkeeper.py @@ -3,17 +3,17 @@ import os import threading import time +from collections import deque from collections.abc import Mapping, Sequence from functools import reduce from typing import ( Any, - Deque, - Dict, - List, Optional, - Set, - Tuple, Union, + dict, + list, + set, + tuple, ) from raythena.utils.config import Config @@ -77,11 +77,11 @@ def __init__( self._n_output_per_input_file = max( 1, self._events_per_file // self._hits_per_file ) - self._status: Dict[ + self._status: dict[ str, - Union[Dict[str, Dict[str, Dict[str, str]]], Dict[str, List[str]]], + Union[dict[str, dict[str, dict[str, str]]], dict[str, list[str]]], ] = dict() - self._update_queue: Deque[Tuple[str, Union[EventRange, Tuple]]] = ( + self._update_queue: deque[tuple[str, Union[EventRange, tuple]]] = ( collections.deque() ) self._restore_status() @@ -170,7 +170,7 @@ def is_stale(self) -> bool: @staticmethod def build_eventrange_dict( eventrange: EventRange, output_file: str = None - ) -> Dict[str, Any]: + ) -> dict[str, Any]: """ Takes an EventRange object and retuns the dict representation which should be saved in the state file @@ -222,7 +222,7 @@ def _set_eventrange_simulated( def set_file_merged( self, - input_files: List[str], + input_files: list[str], outputfile: str, event_ranges: Mapping[str, Mapping[str, str]], guid: Optional[str], @@ -241,7 +241,7 @@ def set_file_merged( def _set_file_merged( self, - input_files: List[str], + input_files: list[str], outputfile: str, event_ranges: Mapping[str, Mapping[str, str]], guid: Optional[str], @@ -420,28 +420,28 @@ def __init__(self, config: Config) -> None: self.merged_files_dir = "" 
self.commitlog = "" self._logger = make_logger(self.config, "BookKeeper") - self.actors: Dict[str, Optional[str]] = dict() - self.rangesID_by_actor: Dict[str, Set[str]] = dict() + self.actors: dict[str, Optional[str]] = dict() + self.rangesID_by_actor: dict[str, set[str]] = dict() #  Output files for which we are ready to launch a merge transform - self.files_ready_to_merge: Dict[str, List[Tuple[str, EventRange]]] = ( + self.files_ready_to_merge: dict[str, list[tuple[str, EventRange]]] = ( dict() ) # Event ranges for a given input file which have been simulated and a ready to be merged - self.ranges_to_merge: Dict[str, List[Tuple[str, EventRange]]] = dict() + self.ranges_to_merge: dict[str, list[tuple[str, EventRange]]] = dict() # Accumulate event ranges of different input files into the same output file until we have enough to produce a merged file # Only used when multiple input files are merged in a single output (n-1) to pool input files together - self.output_merge_queue: Dict[str, List[Tuple[str, EventRange]]] = ( + self.output_merge_queue: dict[str, list[tuple[str, EventRange]]] = ( dict() ) # Keep tracks of merge job definition that have been distributed to the driver for which we expect an update - self.ditributed_merge_tasks: Dict[str, List[Tuple[str, EventRange]]] = ( + self.ditributed_merge_tasks: dict[str, list[tuple[str, EventRange]]] = ( dict() ) - self.files_guids: Dict[str, str] = dict() + self.files_guids: dict[str, str] = dict() self.last_status_print = time.time() - self.taskstatus: Dict[str, TaskStatus] = dict() - self._input_output_mapping: Dict[str, List[str]] = dict() - self._output_input_mapping: Dict[str, List[str]] = dict() + self.taskstatus: dict[str, TaskStatus] = dict() + self._input_output_mapping: dict[str, list[str]] = dict() + self._output_input_mapping: dict[str, list[str]] = dict() self.stop_saver = threading.Event() self.stop_cleaner = threading.Event() self.save_state_thread = ExThread( @@ -627,7 +627,7 @@ def _generate_input_output_mapping(self, job: PandaJob): def generate_event_range_id(file: str, n: str): return f"{file}-{n}" - def remap_output_files(self, panda_id: str) -> Dict[str, str]: + def remap_output_files(self, panda_id: str) -> dict[str, str]: """ Translate an existing output file to an output filename matching the current job definition. """ @@ -636,7 +636,7 @@ def remap_output_files(self, panda_id: str) -> Dict[str, str]: if task_status.is_stale(): task_status.save_status() merged_files = task_status._status[TaskStatus.MERGED] - previous_to_current_output_lookup: Dict[str, str] = dict() + previous_to_current_output_lookup: dict[str, str] = dict() with open(self.commitlog, "a") as f: for input_file, output_files in self._input_output_mapping.items(): @@ -794,7 +794,7 @@ def add_event_ranges( Assign event ranges to the jobs in queue. Args: - event_ranges: List of event ranges dict as returned by harvester + event_ranges: list of event ranges dict as returned by harvester Returns: None @@ -851,7 +851,7 @@ def assign_job_to_actor(self, actor_id: str) -> Optional[PandaJob]: self.actors[actor_id] = job_id return self.jobs[job_id] if job_id else None - def fetch_event_ranges(self, actor_id: str, n: int) -> List[EventRange]: + def fetch_event_ranges(self, actor_id: str, n: int) -> list[EventRange]: """ Retrieve event ranges for an actor. The specified actor should have a job assigned from assign_job_to_actor() or an empty list will be returned. 
If the job assigned to the actor doesn't have enough range currently available, it will assign all of its remaining anges @@ -878,7 +878,7 @@ def fetch_event_ranges(self, actor_id: str, n: int) -> List[EventRange]: def get_file_to_merge( self, - ) -> Optional[Tuple[str, List[Tuple[str, EventRange]]]]: + ) -> Optional[tuple[str, list[tuple[str, EventRange]]]]: """ Returns a merge tasks available for an arbitrary input file if available, None otherwise. """ diff --git a/src/raythena/utils/eventservice.py b/src/raythena/utils/eventservice.py index dc27a34..9fc4456 100644 --- a/src/raythena/utils/eventservice.py +++ b/src/raythena/utils/eventservice.py @@ -3,16 +3,16 @@ from collections.abc import Iterable, Mapping, MutableMapping, Sequence from typing import ( Any, - Dict, - List, Optional, - Set, Union, + dict, + list, + set, ) # Types aliases Builtin = Union[int, float, str] -JobDef = Dict[str, Builtin] +JobDef = dict[str, Builtin] EventRangeDef = MutableMapping[str, Builtin] FileInfo = Mapping[str, Builtin] PilotEventRangeUpdateDef = Mapping[ @@ -99,7 +99,7 @@ class PandaJobQueue: """ def __init__(self, jobs: Mapping[str, JobDef] = None) -> None: - self.jobs: Dict[str, PandaJob] = dict() + self.jobs: dict[str, PandaJob] = dict() self.distributed_jobs_ids = list() if jobs: @@ -348,11 +348,11 @@ def __init__(self) -> None: """ Init the queue """ - self.event_ranges_by_id: Dict[str, EventRange] = dict() - self.rangesID_by_state: Dict[str, Set[str]] = dict() + self.event_ranges_by_id: dict[str, EventRange] = dict() + self.rangesID_by_state: dict[str, set[str]] = dict() # only holds event ranges that are ready - self.rangesID_by_file: Dict[str, Set[str]] = dict() - self.event_ranges_count: Dict[str, int] = dict() + self.rangesID_by_file: dict[str, set[str]] = dict() + self.event_ranges_count: dict[str, int] = dict() for s in EventRange.STATES: self.event_ranges_count[s] = 0 self.rangesID_by_state[s] = set() @@ -437,11 +437,11 @@ def update_range_state(self, range_id: str, new_state: str) -> "EventRange": # rangesID_by_file only hold ids of ranges that are ready to be assigned return event_range - def assign_ready_ranges(self, n_ranges=1) -> List["EventRange"]: + def assign_ready_ranges(self, n_ranges=1) -> list["EventRange"]: n_ranges = min(self.nranges_available(), n_ranges) if not n_ranges: return list() - res: List[Optional[EventRange]] = [None] * n_ranges + res: list[Optional[EventRange]] = [None] * n_ranges res_idx = 0 ready = self.rangesID_by_state[EventRange.READY] assigned = self.rangesID_by_state[EventRange.ASSIGNED] @@ -581,7 +581,7 @@ def concat( for r in ranges: self.append(r) - def get_next_ranges(self, nranges: int) -> List["EventRange"]: + def get_next_ranges(self, nranges: int) -> list["EventRange"]: """ Dequeue event ranges. Event ranges which were dequeued are updated to the 'ASSIGNED' status and should be assigned to workers to be processed. 
In case more ranges are requested @@ -626,7 +626,7 @@ def __init__(self, **kwargs) -> None: def __str__(self) -> str: return str(self.__dict__) - def to_dict(self) -> Dict[str, Builtin]: + def to_dict(self) -> dict[str, Builtin]: return self.__dict__ @@ -703,8 +703,8 @@ class EventRangeUpdate: def __init__( self, - range_update: Dict[ - str, List[MutableMapping[str, Union[str, int]]] + range_update: dict[ + str, list[MutableMapping[str, Union[str, int]]] ] = None, ) -> None: """ @@ -714,12 +714,12 @@ def __init__( range_update: range update """ if not range_update: - self.range_update: Dict[str, HarvesterEventRangeUpdateDef] = dict() + self.range_update: dict[str, HarvesterEventRangeUpdateDef] = dict() else: for v in range_update.values(): if not isinstance(v, list): raise Exception(f"Expecting type list for element {v}") - self.range_update: Dict[str, HarvesterEventRangeUpdateDef] = ( + self.range_update: dict[str, HarvesterEventRangeUpdateDef] = ( range_update ) @@ -870,7 +870,7 @@ def __init__( def __str__(self) -> str: return str(self.__dict__) - def to_dict(self) -> Dict[str, Builtin]: + def to_dict(self) -> dict[str, Builtin]: return self.__dict__ @@ -890,7 +890,7 @@ class EventRangeRequest: """ def __init__(self) -> None: - self.request: Dict[str, Dict[str, Builtin]] = dict() + self.request: dict[str, dict[str, Builtin]] = dict() def __len__(self) -> int: return len(self.request) @@ -898,7 +898,7 @@ def __len__(self) -> int: def __iter__(self) -> Iterable[str]: return iter(self.request) - def __getitem__(self, k: str) -> Dict[str, Builtin]: + def __getitem__(self, k: str) -> dict[str, Builtin]: return self.request[k] def __str__(self) -> str: @@ -928,7 +928,7 @@ def add_event_request( @staticmethod def build_from_dict( - request_dict: Mapping[str, Dict[str, Builtin]], + request_dict: Mapping[str, dict[str, Builtin]], ) -> "EventRangeRequest": """ Build a request object from a dict parsed from its json representation @@ -987,7 +987,7 @@ class PandaJob: --athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so --preInclude sim:SimulationJobOptions/ preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py - --geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 + --geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicslist QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 --maxEvents=-1 --inputEvgenFile EVNT.01469903._009502.pool.root.1 --outputHitsFile HITS_%s.pool.root' % job_name, 'attemptNr': 0, @@ -1010,7 +1010,7 @@ class PandaJob: """ def __init__(self, job_def: JobDef) -> None: - self.job: Dict[str, Builtin] = job_def + self.job: dict[str, Builtin] = job_def if "PandaID" in self: self["PandaID"] = str(self["PandaID"]) self.event_ranges_queue: EventRangeQueue = EventRangeQueue() @@ -1037,7 +1037,7 @@ def nranges_available(self) -> int: """ return self.event_ranges_queue.nranges_available() - def get_next_ranges(self, nranges: int) -> List["EventRange"]: + def get_next_ranges(self, nranges: int) -> list["EventRange"]: """ See Also: EventRangeQueue.get_next_ranges() @@ -1139,7 +1139,7 @@ def __init__( def set_assigned(self) -> None: """ - Set current state to ASSIGNED + set current state to ASSIGNED Returns: None @@ -1148,7 +1148,7 @@ def set_assigned(self) -> None: def set_done(self) -> None: """ - Set current state to DONE + set current state to DONE Returns: None @@ -1157,7 +1157,7 @@ def set_done(self) -> None: def set_failed(self) -> None: """ - Set current state to FAILED + set current 
state to FAILED Returns: None @@ -1245,5 +1245,5 @@ def __init__( def __str__(self) -> str: return str(self.__dict__) - def to_dict(self) -> Dict[str, Builtin]: + def to_dict(self) -> dict[str, Builtin]: return self.__dict__ diff --git a/src/raythena/utils/ray.py b/src/raythena/utils/ray.py index 342de59..862d705 100644 --- a/src/raythena/utils/ray.py +++ b/src/raythena/utils/ray.py @@ -1,5 +1,5 @@ from collections.abc import Mapping -from typing import Any, List +from typing import Any, list import ray @@ -8,7 +8,7 @@ def build_nodes_resource_list( config: Config, run_actor_on_head: bool = False -) -> List[Mapping[str, Any]]: +) -> list[Mapping[str, Any]]: """ Build and setup ray custom resources. Actors should then be instantiated by requiring one of the resource in the returned list. @@ -72,9 +72,7 @@ def setup_ray(config: Config) -> Any: Returns: dict of cluster params """ - log_to_driver = ( - True if not config.logging.get("workerlogfile", None) else False - ) + log_to_driver = bool(not config.logging.get("workerlogfile", None)) if is_external_cluster(config): ray_url = f"{config.ray['headip']}:{config.ray['redisport']}" return ray.init( diff --git a/src/raythena/utils/timing.py b/src/raythena/utils/timing.py index b17188e..2141366 100644 --- a/src/raythena/utils/timing.py +++ b/src/raythena/utils/timing.py @@ -1,7 +1,7 @@ import json import time from threading import Event -from typing import Any, Dict, List, Union +from typing import Any, Union, dict, list import psutil @@ -49,7 +49,7 @@ def stop(self) -> None: self.stop_event = Event() def _log_to_file( - self, data: Dict[str, Union[Dict[str, List], List, int]] + self, data: dict[str, Union[dict[str, list], list, int]] ) -> None: """ Write data to log file From ec1d81823c72d8b9f079ae7b674db60c8becea5d Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 14:05:42 -0700 Subject: [PATCH 04/14] implement ruff fix --- src/raythena/actors/esworker.py | 46 +++++++++---------- .../actors/payloads/eventservice/pilothttp.py | 2 +- .../drivers/communicators/harvesterMock.py | 8 ++-- .../communicators/harvesterMock2205.py | 8 ++-- 4 files changed, 29 insertions(+), 35 deletions(-) diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py index ef3b70a..ca94ff3 100644 --- a/src/raythena/actors/esworker.py +++ b/src/raythena/actors/esworker.py @@ -194,8 +194,8 @@ def check_time(self) -> None: f"Failed to copy ray logs to actor directory: {e}" ) if time_elapsed > self.time_limit - self.pilot_kill_time: - killsignal = open(self.pilot_kill_file, "w") - killsignal.close() + with open(self.pilot_kill_file, "w") as f: + f.write("KILL") self._logger.info("killsignal sent to payload") break else: @@ -297,23 +297,21 @@ def stagein(self) -> None: self.payload_actor_process_dir, "ray_logs" ) try: - time_limit_monitor = open( - os.path.join(self.workdir, self.time_monitor_file) - ) - start_time = time_limit_monitor.readline().split(":") - self.start_time = ( - int(start_time[0]) * 3600 - + int(start_time[1]) * 60 - + int(start_time[2]) - ) - time_limit = time_limit_monitor.readline().split(":") - if len(time_limit) < 3: - time_limit = ["0"] + time_limit - self.time_limit = ( - int(time_limit[0]) * 3600 - + int(time_limit[1]) * 60 - + int(time_limit[2]) - ) + with open(os.path.join(self.workdir, self.time_monitor_file)) as time_limit_monitor: + start_time = time_limit_monitor.readline().split(":") + self.start_time = ( + int(start_time[0]) * 3600 + + int(start_time[1]) * 60 + + int(start_time[2]) + ) + time_limit = 
time_limit_monitor.readline().split(":") + if len(time_limit) < 3: + time_limit = ["0"] + time_limit + self.time_limit = ( + int(time_limit[0]) * 3600 + + int(time_limit[1]) * 60 + + int(time_limit[2]) + ) timer_thread = threading.Thread( name="timer", target=self.check_time, daemon=True ) @@ -341,7 +339,7 @@ def stagein(self) -> None: os.mkdir(self.payload_actor_output_dir) except Exception as e: self._logger.warning(f"Exception when creating dir: {e}") - raise StageInFailed(self.id) + raise StageInFailed(self.id) from e # self.cpu_monitor = CPUMonitor(os.path.join(self.payload_actor_process_dir, "cpu_monitor.json")) # self.cpu_monitor.start() try: @@ -349,7 +347,7 @@ def stagein(self) -> None: self.payload.start(self.modify_job(self.job)) except Exception as e: self._logger.warning(f"Failed to stagein payload: {e}") - raise StageInFailed(self.id) + raise StageInFailed(self.id) from e self.transition_state( ESWorker.READY_FOR_EVENTS if self.is_event_service_job() @@ -426,7 +424,7 @@ def receive_job(self, reply: int, job: PandaJob) -> WorkerResponse: except BaseRaythenaException: raise except Exception as e: - raise WrappedException(self.id, e) + raise WrappedException(self.id, e) from e else: self.transition_state(ESWorker.DONE) self._logger.error("Could not fetch job. Set state to done.") @@ -581,7 +579,7 @@ def stageout_event_service_files( self._logger.error( f"Failed to move file {cfile} to {dst}: errno {e.errno}: {e.strerror}" ) - raise StageOutFailed(self.id) + raise StageOutFailed(self.id) from e range_update[cfile_key] = dst else: self._logger.warning( @@ -669,4 +667,4 @@ def get_message(self) -> WorkerResponse: except BaseRaythenaException: raise except Exception as e: - raise WrappedException(self.id, e) + raise WrappedException(self.id, e) from e diff --git a/src/raythena/actors/payloads/eventservice/pilothttp.py b/src/raythena/actors/payloads/eventservice/pilothttp.py index 0eebc09..0b4789f 100644 --- a/src/raythena/actors/payloads/eventservice/pilothttp.py +++ b/src/raythena/actors/payloads/eventservice/pilothttp.py @@ -482,7 +482,7 @@ async def handle_get_event_ranges( else: n_ranges = int(body["nRanges"][0]) if not self.no_more_ranges: - for i in range(n_ranges): + for _ in range(n_ranges): crange = await self.ranges_queue.get() if crange is None: self.no_more_ranges = True diff --git a/src/raythena/drivers/communicators/harvesterMock.py b/src/raythena/drivers/communicators/harvesterMock.py index e37d0ee..dedf8a3 100644 --- a/src/raythena/drivers/communicators/harvesterMock.py +++ b/src/raythena/drivers/communicators/harvesterMock.py @@ -242,15 +242,13 @@ def request_job(self, job_request: PandaJobRequest) -> None: "SimulationJobOptions/preInclude.BeamPipeKill.py " "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT " "--randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " - "--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root" - % (self.inFiles, job_name) + f"--maxEvents=-1 --inputEvgenFile {self.inFiles} --outputHitsFile HITS_{job_name}.pool.root" ), "attemptNr": 0, "swRelease": "Atlas-21.0.15", "nucleus": "NULL", "maxCpuCount": 0, - "outFiles": "HITS_%s.pool.root,%s.job.log.tgz" - % (job_name, job_name), + "outFiles": f"HITS_{job_name}.pool.root,{job_name}.job.log.tgz", "currentPriority": 1000, "scopeIn": self.scope, "PandaID": self.pandaID, @@ -261,7 +259,7 @@ def request_job(self, job_request: PandaJobRequest) -> None: "jobName": job_name, "ddmEndPointIn": "UTA_SWT2_DATADISK", "taskID": self.taskId, - "logFile": 
"%s.job.log.tgz" % job_name, + "logFile": f"{job_name}.job.log.tgz", } } ) diff --git a/src/raythena/drivers/communicators/harvesterMock2205.py b/src/raythena/drivers/communicators/harvesterMock2205.py index 58e51f3..4f7cdc1 100644 --- a/src/raythena/drivers/communicators/harvesterMock2205.py +++ b/src/raythena/drivers/communicators/harvesterMock2205.py @@ -116,15 +116,13 @@ def request_job(self, job_request: PandaJobRequest) -> None: "--geometryVersion default:ATLAS-R2-2016-01-00-01_VALIDATION " "--physicsList FTFP_BERT_ATL_VALIDATION --randomSeed 1234 " "--conditionsTag default:OFLCOND-MC16-SDR-14 " - "--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root" - % (self.inFiles, job_name) + f"--maxEvents=-1 --inputEvgenFile {self.inFiles} --outputHitsFile HITS_{job_name}.pool.root" ), "attemptNr": 0, "swRelease": "Atlas-22.0.5", "nucleus": "NULL", "maxCpuCount": 0, - "outFiles": "HITS_%s.pool.root,%s.job.log.tgz" - % (job_name, job_name), + "outFiles": f"HITS_{job_name}.pool.root,{job_name}.job.log.tgz", "currentPriority": 1000, "scopeIn": self.scope, "PandaID": self.pandaID, @@ -135,7 +133,7 @@ def request_job(self, job_request: PandaJobRequest) -> None: "jobName": job_name, "ddmEndPointIn": "UTA_SWT2_DATADISK", "taskID": self.taskId, - "logFile": "%s.job.log.tgz" % job_name, + "logFile": f"{job_name}.job.log.tgz", } } ) From ab6e2da68157d4329fa57b0327405835745dc261 Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 14:46:10 -0700 Subject: [PATCH 05/14] implement more ruff fix --- example/standalone_ray_test_hello_world.py | 8 ++++---- src/raythena/actors/esworker.py | 2 +- src/raythena/actors/payloads/basePayload.py | 2 +- .../actors/payloads/eventservice/esPayload.py | 2 +- .../actors/payloads/eventservice/pilothttp.py | 2 +- .../communicators/harvesterFileMessenger.py | 13 ++++-------- src/raythena/drivers/esdriver.py | 3 --- src/raythena/utils/bookkeeper.py | 13 +++--------- src/raythena/utils/eventservice.py | 20 ++++++++++--------- src/raythena/utils/ray.py | 2 +- src/raythena/utils/timing.py | 2 +- tests/conftest.py | 16 +++++++-------- tests/harvester/test_harvesterMock.py | 4 ++-- tests/test_bookkeeper.py | 6 +++--- tests/test_eventservice.py | 12 +++++------ 15 files changed, 46 insertions(+), 61 deletions(-) diff --git a/example/standalone_ray_test_hello_world.py b/example/standalone_ray_test_hello_world.py index 5b1924e..d7dbae4 100755 --- a/example/standalone_ray_test_hello_world.py +++ b/example/standalone_ray_test_hello_world.py @@ -47,8 +47,8 @@ def main(redis_ip: str, redis_port: str, redis_password: str): redis_address = f"{redis_ip}:{redis_port}" ray.init( ignore_reinit_error=True, - address="%s" % redis_address, - _redis_password="%s" % redis_password, + address=f"{redis_address}", + _redis_password=f"{redis_password}", ) # show the ray cluster @@ -83,10 +83,10 @@ def main(redis_ip: str, redis_port: str, redis_password: str): description="Wait on ray head node or workers to connect" ) parser.add_argument( - "--redis-ip", default="%s" % (os.environ["RAYTHENA_RAY_HEAD_IP"]) + "--redis-ip", default="{}".format(os.environ["RAYTHENA_RAY_HEAD_IP"]) ) parser.add_argument( - "--redis-port", default="%s" % (os.environ["RAYTHENA_RAY_REDIS_PORT"]) + "--redis-port", default="{}".format(os.environ["RAYTHENA_RAY_REDIS_PORT"]) ) parser.add_argument( "--redis-password", default=os.environ["RAYTHENA_RAY_REDIS_PASSWORD"] diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py index ca94ff3..d481443 100644 --- 
a/src/raythena/actors/esworker.py +++ b/src/raythena/actors/esworker.py @@ -8,7 +8,7 @@ from collections.abc import Mapping, Sequence from socket import gethostname from time import sleep -from typing import Any, Optional, Union, tuple +from typing import Any, Optional, Union import ray diff --git a/src/raythena/actors/payloads/basePayload.py b/src/raythena/actors/payloads/basePayload.py index 0a5ba17..4a0c8f4 100644 --- a/src/raythena/actors/payloads/basePayload.py +++ b/src/raythena/actors/payloads/basePayload.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Optional, dict +from typing import Any, Optional from raythena.utils.config import Config from raythena.utils.eventservice import PandaJob diff --git a/src/raythena/actors/payloads/eventservice/esPayload.py b/src/raythena/actors/payloads/eventservice/esPayload.py index 041c7e4..2ec00d1 100644 --- a/src/raythena/actors/payloads/eventservice/esPayload.py +++ b/src/raythena/actors/payloads/eventservice/esPayload.py @@ -1,6 +1,6 @@ from abc import abstractmethod from collections.abc import Sequence -from typing import Optional, dict +from typing import Optional from raythena.actors.payloads.basePayload import BasePayload from raythena.utils.config import Config diff --git a/src/raythena/actors/payloads/eventservice/pilothttp.py b/src/raythena/actors/payloads/eventservice/pilothttp.py index 0b4789f..8160806 100644 --- a/src/raythena/actors/payloads/eventservice/pilothttp.py +++ b/src/raythena/actors/payloads/eventservice/pilothttp.py @@ -7,7 +7,7 @@ from asyncio import Event, Queue, QueueEmpty from collections.abc import Iterable, Mapping from subprocess import DEVNULL, Popen -from typing import Callable, Optional, dict, list +from typing import Callable, Optional from urllib.parse import parse_qs import uvloop diff --git a/src/raythena/drivers/communicators/harvesterFileMessenger.py b/src/raythena/drivers/communicators/harvesterFileMessenger.py index e3c7cb4..57c8d07 100644 --- a/src/raythena/drivers/communicators/harvesterFileMessenger.py +++ b/src/raythena/drivers/communicators/harvesterFileMessenger.py @@ -1,4 +1,5 @@ import configparser +import contextlib import json import os import shutil @@ -133,14 +134,10 @@ def request_job(self, request: PandaJobRequest) -> None: with open(self.jobspecfile) as f: job = json.load(f) - try: + with contextlib.suppress(FileNotFoundError): os.remove(self.jobrequestfile) - except FileNotFoundError: - pass - try: + with contextlib.suppress(FileNotFoundError): os.rename(self.jobspecfile, f"{self.jobspecfile}.read") - except FileNotFoundError: - pass if job: self.job_queue.put(job) @@ -189,10 +186,8 @@ def request_event_ranges(self, request: EventRangeRequest) -> None: ): self.ranges_requests_count += 1 - try: + with contextlib.suppress(FileNotFoundError): os.remove(self.eventrequestfile) - except FileNotFoundError: - pass self.ranges_requests_count += 1 self.event_ranges_queue.put(ranges) diff --git a/src/raythena/drivers/esdriver.py b/src/raythena/drivers/esdriver.py index e3414ff..c54fdf9 100644 --- a/src/raythena/drivers/esdriver.py +++ b/src/raythena/drivers/esdriver.py @@ -15,9 +15,6 @@ from typing import ( Any, Optional, - dict, - list, - tuple, ) import ray diff --git a/src/raythena/utils/bookkeeper.py b/src/raythena/utils/bookkeeper.py index 8eef9a8..f26df7f 100644 --- a/src/raythena/utils/bookkeeper.py +++ b/src/raythena/utils/bookkeeper.py @@ -10,10 +10,6 @@ Any, Optional, Union, - dict, - list, - set, - tuple, ) from raythena.utils.config import Config @@ -280,7 
+276,7 @@ def _set_file_merged( self._status[TaskStatus.MERGED][inputfile] = merged_dict for merged_outputfile in self._status[TaskStatus.MERGING][ inputfile - ].keys(): + ]: merged_dict[merged_outputfile] = { "path": os.path.join( self.merged_files_dir, merged_outputfile @@ -462,7 +458,7 @@ def _cleaner_thead_run(self): for task_status in self.taskstatus.values(): for merged_file in task_status._status[ TaskStatus.MERGED - ].keys(): + ]: if self.stop_cleaner.is_set(): break for temp_file in files: @@ -709,10 +705,7 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): guids = job["GUID"].split(",") for file, guid in zip(files, guids): self.files_guids[file] = guid - if "scopeIn" in job: - scope = job["scopeIn"] - else: - scope = "" + scope = job.get("scopeIn", "") event_ranges = [] merged_files = task_status._status[TaskStatus.MERGED] merging_files = task_status._status[TaskStatus.MERGING] diff --git a/src/raythena/utils/eventservice.py b/src/raythena/utils/eventservice.py index 9fc4456..8ecbea3 100644 --- a/src/raythena/utils/eventservice.py +++ b/src/raythena/utils/eventservice.py @@ -5,9 +5,6 @@ Any, Optional, Union, - dict, - list, - set, ) # Types aliases @@ -112,7 +109,7 @@ def __setitem__(self, k: str, v: "PandaJob") -> None: if isinstance(v, PandaJob): self.jobs[k] = v else: - raise Exception(f"{v} is not of type {PandaJob}") + raise ValueError(f"{v} is not of type {PandaJob}") def __iter__(self) -> Iterable[str]: return iter(self.jobs) @@ -368,9 +365,9 @@ def __getitem__(self, k: str) -> "EventRange": def __setitem__(self, k: str, v: "EventRange") -> None: if not isinstance(v, EventRange): - raise Exception(f"{v} should be of type {EventRange}") + raise ValueError(f"{v} should be of type {EventRange}") if k != v.eventRangeID: - raise Exception( + raise KeyError( f"Specified key '{k}' should be equals to the event range id '{v.eventRangeID}' " ) if k in self.event_ranges_by_id: @@ -416,7 +413,7 @@ def update_range_state(self, range_id: str, new_state: str) -> "EventRange": the updated event range """ if range_id not in self.event_ranges_by_id: - raise Exception( + raise KeyError( f"Trying to update non-existing eventrange {range_id}" ) @@ -718,7 +715,7 @@ def __init__( else: for v in range_update.values(): if not isinstance(v, list): - raise Exception(f"Expecting type list for element {v}") + raise ValueError(f"Expecting type list for element {v}") self.range_update: dict[str, HarvesterEventRangeUpdateDef] = ( range_update ) @@ -737,7 +734,7 @@ def __getitem__(self, k: str) -> HarvesterEventRangeUpdateDef: def __setitem__(self, k: str, v: HarvesterEventRangeUpdateDef) -> None: if not isinstance(v, list): - raise Exception(f"Expecting type list for element {v}") + raise ValueError(f"Expecting type list for element {v}") self.range_update[k] = v def merge_update(self, other: "EventRangeUpdate") -> None: @@ -1062,6 +1059,11 @@ def get_id(self) -> str: """ return self["PandaID"] + def get(self, k: str, default: Any = "") -> Builtin: + if k in self.job: + return self.job[k] + return default + def __str__(self) -> str: return json.dumps(self.job) diff --git a/src/raythena/utils/ray.py b/src/raythena/utils/ray.py index 862d705..f3fa1bd 100644 --- a/src/raythena/utils/ray.py +++ b/src/raythena/utils/ray.py @@ -1,5 +1,5 @@ from collections.abc import Mapping -from typing import Any, list +from typing import Any import ray diff --git a/src/raythena/utils/timing.py b/src/raythena/utils/timing.py index 2141366..62f9e0b 100644 --- a/src/raythena/utils/timing.py +++ 
b/src/raythena/utils/timing.py @@ -1,7 +1,7 @@ import json import time from threading import Event -from typing import Any, Union, dict, list +from typing import Any, Union import psutil diff --git a/tests/conftest.py b/tests/conftest.py index 584161b..c1c3577 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -58,7 +58,7 @@ def is_eventservice(request): @pytest.fixture def pandaids(njobs): res = [] - for i in range(njobs): + for _ in range(njobs): hash = hashlib.md5() hash.update(str(time.time()).encode("utf-8")) res.append(hash.hexdigest()) @@ -223,14 +223,13 @@ def sample_multijobs( "destinationDblock": job_name, "dispatchDBlockToken": "NULL", "jobPars": ( - '--eventService=%s --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' + f'--eventService={str(is_eventservice)} --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py " "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " - "--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root)" - % (str(is_eventservice), inFiles, outFilesShort) + f"--maxEvents=-1 --inputEvgenFile {inFiles} --outputHitsFile HITS_{outFilesShort}.pool.root)" ), "attemptNr": 0, "swRelease": "Atlas-21.0.15", @@ -247,7 +246,7 @@ def sample_multijobs( "jobName": job_name, "ddmEndPointIn": "UTA_SWT2_DATADISK", "taskID": taskId, - "logFile": "%s.job.log.tgz" % job_name, + "logFile": f"{job_name}.job.log.tgz", } return res @@ -315,14 +314,13 @@ def sample_job( "destinationDblock": job_name, "dispatchDBlockToken": "NULL", "jobPars": ( - '--eventService=%s --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' + f'--eventService={str(is_eventservice)} --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py " "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " - "--maxEvents=-1 --inputEvgenFile %s --outputHitsFile HITS_%s.pool.root)" - % (str(is_eventservice), inFiles, outFilesShort) + f"--maxEvents=-1 --inputEvgenFile {inFiles} --outputHitsFile HITS_{outFilesShort}.pool.root)" ), "attemptNr": 0, "swRelease": "Atlas-21.0.15", @@ -339,6 +337,6 @@ def sample_job( "jobName": job_name, "ddmEndPointIn": "UTA_SWT2_DATADISK", "taskID": taskId, - "logFile": "%s.job.log.tgz" % job_name, + "logFile": f"{job_name}.job.log.tgz", } } diff --git a/tests/harvester/test_harvesterMock.py b/tests/harvester/test_harvesterMock.py index 3acdd7b..7f28a31 100644 --- a/tests/harvester/test_harvesterMock.py +++ b/tests/harvester/test_harvesterMock.py @@ -25,7 +25,7 @@ def test_get_ranges( ranges = ranges_queue.get(timeout=5) assert ranges is not None assert isinstance(ranges, dict) - for pandaID, job_ranges in ranges.items(): + for _pandaID, job_ranges in ranges.items(): assert len(job_ranges) == n_events # should return 
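The conftest changes replace %-style interpolation with f-strings; both produce the same text, the f-string form just keeps the values next to their placeholders. A minimal before/after using a placeholder job name:

    job_name = "job_01"  # placeholder

    log_file_old = "%s.job.log.tgz" % job_name   # previous %-formatting
    log_file_new = f"{job_name}.job.log.tgz"     # f-string produced by the fix
    assert log_file_old == log_file_new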
0 ranges per job @@ -33,5 +33,5 @@ def test_get_ranges( ranges = ranges_queue.get(timeout=5) assert ranges is not None assert isinstance(ranges, dict) - for pandaID, job_ranges in ranges.items(): + for _pandaID, job_ranges in ranges.items(): assert len(job_ranges) == 0 diff --git a/tests/test_bookkeeper.py b/tests/test_bookkeeper.py index f31b710..868e75f 100644 --- a/tests/test_bookkeeper.py +++ b/tests/test_bookkeeper.py @@ -29,7 +29,7 @@ def test_assign_job_to_actor( actor_id = "a1" if not is_eventservice: job = None - for i in range(njobs): + for _ in range(njobs): job_tmp = bookKeeper.assign_job_to_actor(actor_id) if job: assert job["PandaID"] != job_tmp["PandaID"] @@ -39,7 +39,7 @@ def test_assign_job_to_actor( else: bookKeeper.add_event_ranges(sample_ranges) job = None - for i in range(njobs): + for _ in range(njobs): job_tmp = bookKeeper.assign_job_to_actor(actor_id) if job: assert job["PandaID"] == job_tmp["PandaID"] @@ -129,7 +129,7 @@ def __inner__(range_update, failed=False): bookKeeper.merged_files_dir = "dummy" bookKeeper.add_jobs(sample_multijobs, False) - for i in range(njobs): + for _ in range(njobs): job = bookKeeper.assign_job_to_actor(actor_id) _ = bookKeeper.fetch_event_ranges(actor_id, nevents) print(job.event_ranges_queue.rangesID_by_state) diff --git a/tests/test_eventservice.py b/tests/test_eventservice.py index 4ad8d89..0134f90 100644 --- a/tests/test_eventservice.py +++ b/tests/test_eventservice.py @@ -85,11 +85,11 @@ def test_build_range_update( and "fsize" not in r ) - with pytest.raises(Exception): + with pytest.raises(ValueError): ranges_update.range_update[pandaID] = None EventRangeUpdate(ranges_update.range_update) - with pytest.raises(Exception): + with pytest.raises(ValueError): ranges_update[pandaID] = None ranges_update[pandaID] = [] assert not ranges_update[pandaID] @@ -115,7 +115,7 @@ def test_new(self, nevents, sample_job, sample_ranges): == 0 ) - with pytest.raises(Exception): + with pytest.raises(ValueError): ranges_queue["key"] = None ranges_queue_2 = EventRangeQueue() @@ -179,7 +179,7 @@ def test_update( assert ranges_queue.nranges_assigned() == 0 assert ranges_queue.nranges_remaining() == 0 - with pytest.raises(Exception): + with pytest.raises(KeyError): ranges_queue.update_range_state("unknown", EventRange.ASSIGNED) def test_get_next(self, sample_job, sample_ranges): @@ -269,7 +269,7 @@ def test_build_pandajob_queue( if is_eventservice: assert job else: - for i in range(1, njobs): + for _ in range(1, njobs): next_job = pandajob_queue.next_job_to_process() assert job["PandaID"] != next_job["PandaID"] job = next_job @@ -279,7 +279,7 @@ def test_build_pandajob_queue( assert isinstance(event_ranges, EventRangeQueue) assert len(event_ranges) == 0 assert pandajob_queue.has_job(pandaID) - with pytest.raises(Exception): + with pytest.raises(ValueError): pandajob_queue[pandaID] = None pandajob_queue_2 = PandaJobQueue() From 5bb28792c83c64792a48643907d67332c1dcab72 Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 14:47:08 -0700 Subject: [PATCH 06/14] lint with ruff --- .github/workflows/main.yml | 7 +------ pyproject.toml | 5 +++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4b040aa..9898332 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -29,14 +29,9 @@ jobs: ${{ runner.os }}-pip- - name: Install Hatch uses: pypa/hatch@a3c83ab3d481fbc2dc91dd0088628817488dd1d5 - - name: Install application and deps - run: | - python -m pip 
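Since the library now raises ValueError or KeyError instead of a bare Exception, the tests assert the specific class, which also keeps pytest.raises from masking unrelated failures. A sketch of the pattern, assuming EventRangeQueue is imported from raythena.utils.eventservice as in the existing tests:

    import pytest
    from raythena.utils.eventservice import EventRangeQueue

    def test_setitem_rejects_non_event_range():
        ranges_queue = EventRangeQueue()
        # __setitem__ now raises ValueError for anything that is not an EventRange.
        with pytest.raises(ValueError):
            ranges_queue["key"] = None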
install --upgrade pip - pip install . - name: Linting run: | - pip install flake8 - flake8 . + ruff check - name: Run tests run: | hatch run test:pytest diff --git a/pyproject.toml b/pyproject.toml index 113b8b4..2a23446 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,3 +64,8 @@ select = [ # isort "I" ] + +ignore = [ + # pycodestyle + "E501", +] From 154953e08c568a2317e55825a1e0c8b86b3ef09c Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 14:51:56 -0700 Subject: [PATCH 07/14] install ruff --- .github/workflows/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 9898332..01fbfb2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,6 +31,7 @@ jobs: uses: pypa/hatch@a3c83ab3d481fbc2dc91dd0088628817488dd1d5 - name: Linting run: | + pip install ruff ruff check - name: Run tests run: | From 963c57d857ca542da7970a4d322aa9646ffeb32c Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 15:06:59 -0700 Subject: [PATCH 08/14] consistent formatting betweent python version --- bin/validate-raythena-job.py | 1 - example/standalone_ray_test_hello_world.py | 1 - pyproject.toml | 3 +++ src/raythena/actors/esworker.py | 2 -- src/raythena/actors/payloads/basePayload.py | 1 - src/raythena/actors/payloads/eventservice/esPayload.py | 1 - src/raythena/actors/payloads/eventservice/pilothttp.py | 2 -- src/raythena/drivers/baseDriver.py | 1 - src/raythena/drivers/communicators/baseCommunicator.py | 1 - src/raythena/drivers/communicators/harvesterFileMessenger.py | 1 - src/raythena/drivers/communicators/harvesterMock.py | 1 - src/raythena/drivers/communicators/harvesterMock2205.py | 1 - src/raythena/drivers/esdriver.py | 2 -- src/raythena/scripts/raythena.py | 2 -- src/raythena/utils/bookkeeper.py | 1 - src/raythena/utils/config.py | 1 - src/raythena/utils/logging.py | 1 - src/raythena/utils/ray.py | 2 -- src/raythena/utils/timing.py | 2 -- tests/conftest.py | 1 - tests/harvester/conftest.py | 1 - tests/harvester/test_harvesterFileMessenger.py | 1 - tests/test_pilothttp.py | 1 - tests/test_ray_utils.py | 1 - 24 files changed, 3 insertions(+), 29 deletions(-) diff --git a/bin/validate-raythena-job.py b/bin/validate-raythena-job.py index 5e5a295..0f1cf92 100644 --- a/bin/validate-raythena-job.py +++ b/bin/validate-raythena-job.py @@ -4,7 +4,6 @@ import json import os.path as path from array import array - import ROOT diff --git a/example/standalone_ray_test_hello_world.py b/example/standalone_ray_test_hello_world.py index d7dbae4..2c4e2ea 100755 --- a/example/standalone_ray_test_hello_world.py +++ b/example/standalone_ray_test_hello_world.py @@ -11,7 +11,6 @@ import platform import time from pprint import pprint - import ray diff --git a/pyproject.toml b/pyproject.toml index 2a23446..ea3c1ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,3 +69,6 @@ ignore = [ # pycodestyle "E501", ] + +[tool.ruff.lint.isort] +no-lines-before = ["third-party", "first-party", "standard-library"] diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py index d481443..a2987cc 100644 --- a/src/raythena/actors/esworker.py +++ b/src/raythena/actors/esworker.py @@ -9,9 +9,7 @@ from socket import gethostname from time import sleep from typing import Any, Optional, Union - import ray - # from raythena.utils.timing import CPUMonitor from raythena.actors.payloads.basePayload import BasePayload from raythena.actors.payloads.eventservice.esPayload import ESPayload diff --git 
a/src/raythena/actors/payloads/basePayload.py b/src/raythena/actors/payloads/basePayload.py index 4a0c8f4..030ca46 100644 --- a/src/raythena/actors/payloads/basePayload.py +++ b/src/raythena/actors/payloads/basePayload.py @@ -1,6 +1,5 @@ from abc import ABC, abstractmethod from typing import Any, Optional - from raythena.utils.config import Config from raythena.utils.eventservice import PandaJob diff --git a/src/raythena/actors/payloads/eventservice/esPayload.py b/src/raythena/actors/payloads/eventservice/esPayload.py index 2ec00d1..4ac4328 100644 --- a/src/raythena/actors/payloads/eventservice/esPayload.py +++ b/src/raythena/actors/payloads/eventservice/esPayload.py @@ -1,7 +1,6 @@ from abc import abstractmethod from collections.abc import Sequence from typing import Optional - from raythena.actors.payloads.basePayload import BasePayload from raythena.utils.config import Config from raythena.utils.eventservice import EventRange diff --git a/src/raythena/actors/payloads/eventservice/pilothttp.py b/src/raythena/actors/payloads/eventservice/pilothttp.py index 8160806..d7115d0 100644 --- a/src/raythena/actors/payloads/eventservice/pilothttp.py +++ b/src/raythena/actors/payloads/eventservice/pilothttp.py @@ -9,10 +9,8 @@ from subprocess import DEVNULL, Popen from typing import Callable, Optional from urllib.parse import parse_qs - import uvloop from aiohttp import web - from raythena.actors.payloads.eventservice.esPayload import ESPayload from raythena.utils.config import Config from raythena.utils.eventservice import ESEncoder, EventRange, PandaJob diff --git a/src/raythena/drivers/baseDriver.py b/src/raythena/drivers/baseDriver.py index 344d67e..464cfc6 100644 --- a/src/raythena/drivers/baseDriver.py +++ b/src/raythena/drivers/baseDriver.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod - from raythena.utils.config import Config diff --git a/src/raythena/drivers/communicators/baseCommunicator.py b/src/raythena/drivers/communicators/baseCommunicator.py index 2ca75d6..257ec05 100644 --- a/src/raythena/drivers/communicators/baseCommunicator.py +++ b/src/raythena/drivers/communicators/baseCommunicator.py @@ -2,7 +2,6 @@ from collections.abc import Mapping, Sequence from queue import Queue from typing import Union - from raythena.utils.config import Config from raythena.utils.eventservice import ( EventRangeDef, diff --git a/src/raythena/drivers/communicators/harvesterFileMessenger.py b/src/raythena/drivers/communicators/harvesterFileMessenger.py index 57c8d07..5f57e0e 100644 --- a/src/raythena/drivers/communicators/harvesterFileMessenger.py +++ b/src/raythena/drivers/communicators/harvesterFileMessenger.py @@ -5,7 +5,6 @@ import shutil import time from queue import Queue - from raythena.drivers.communicators.baseCommunicator import BaseCommunicator from raythena.utils.config import Config from raythena.utils.eventservice import ( diff --git a/src/raythena/drivers/communicators/harvesterMock.py b/src/raythena/drivers/communicators/harvesterMock.py index dedf8a3..cfc5b4a 100644 --- a/src/raythena/drivers/communicators/harvesterMock.py +++ b/src/raythena/drivers/communicators/harvesterMock.py @@ -3,7 +3,6 @@ import random import time from queue import Queue - from raythena.drivers.communicators.baseCommunicator import BaseCommunicator from raythena.utils.config import Config from raythena.utils.eventservice import ( diff --git a/src/raythena/drivers/communicators/harvesterMock2205.py b/src/raythena/drivers/communicators/harvesterMock2205.py index 4f7cdc1..b2bab50 100644 --- 
a/src/raythena/drivers/communicators/harvesterMock2205.py +++ b/src/raythena/drivers/communicators/harvesterMock2205.py @@ -3,7 +3,6 @@ import random import time from queue import Queue - from raythena.drivers.communicators.harvesterMock import HarvesterMock from raythena.utils.config import Config from raythena.utils.eventservice import PandaJobRequest diff --git a/src/raythena/drivers/esdriver.py b/src/raythena/drivers/esdriver.py index c54fdf9..238bf6e 100644 --- a/src/raythena/drivers/esdriver.py +++ b/src/raythena/drivers/esdriver.py @@ -16,11 +16,9 @@ Any, Optional, ) - import ray from ray.exceptions import RayActorError from ray.types import ObjectRef - from raythena import __version__ from raythena.actors.esworker import ESWorker, WorkerResponse from raythena.drivers.baseDriver import BaseDriver diff --git a/src/raythena/scripts/raythena.py b/src/raythena/scripts/raythena.py index f4dd1a9..2a1c49a 100755 --- a/src/raythena/scripts/raythena.py +++ b/src/raythena/scripts/raythena.py @@ -3,9 +3,7 @@ import signal import traceback import types - import click - from raythena.drivers.baseDriver import BaseDriver from raythena.drivers.esdriver import ESDriver from raythena.utils.config import Config diff --git a/src/raythena/utils/bookkeeper.py b/src/raythena/utils/bookkeeper.py index f26df7f..b5bc3d1 100644 --- a/src/raythena/utils/bookkeeper.py +++ b/src/raythena/utils/bookkeeper.py @@ -11,7 +11,6 @@ Optional, Union, ) - from raythena.utils.config import Config from raythena.utils.eventservice import ( EventRange, diff --git a/src/raythena/utils/config.py b/src/raythena/utils/config.py index 29a6f39..13170e2 100644 --- a/src/raythena/utils/config.py +++ b/src/raythena/utils/config.py @@ -1,5 +1,4 @@ import os - import yaml diff --git a/src/raythena/utils/logging.py b/src/raythena/utils/logging.py index 7716827..dc2123d 100644 --- a/src/raythena/utils/logging.py +++ b/src/raythena/utils/logging.py @@ -1,7 +1,6 @@ import logging import sys from time import gmtime - from raythena.utils.config import Config _initialized = False diff --git a/src/raythena/utils/ray.py b/src/raythena/utils/ray.py index f3fa1bd..297021f 100644 --- a/src/raythena/utils/ray.py +++ b/src/raythena/utils/ray.py @@ -1,8 +1,6 @@ from collections.abc import Mapping from typing import Any - import ray - from raythena.utils.config import Config diff --git a/src/raythena/utils/timing.py b/src/raythena/utils/timing.py index 62f9e0b..6d87663 100644 --- a/src/raythena/utils/timing.py +++ b/src/raythena/utils/timing.py @@ -2,9 +2,7 @@ import time from threading import Event from typing import Any, Union - import psutil - from raythena.utils.exception import ExThread diff --git a/tests/conftest.py b/tests/conftest.py index c1c3577..a17b52d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,5 @@ import hashlib import time - import pytest from raythena.utils.config import Config from raythena.utils.ray import setup_ray, shutdown_ray diff --git a/tests/harvester/conftest.py b/tests/harvester/conftest.py index 301f18d..65c7ecb 100644 --- a/tests/harvester/conftest.py +++ b/tests/harvester/conftest.py @@ -1,6 +1,5 @@ import os import queue - import pytest from raythena.drivers.communicators.harvesterFileMessenger import ( HarvesterFileCommunicator, diff --git a/tests/harvester/test_harvesterFileMessenger.py b/tests/harvester/test_harvesterFileMessenger.py index 85469b6..a377e9d 100644 --- a/tests/harvester/test_harvesterFileMessenger.py +++ b/tests/harvester/test_harvesterFileMessenger.py @@ -1,7 +1,6 @@ import json 
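The blank-line removals in this commit all come from the isort setting added to pyproject.toml: with no-lines-before covering the standard-library, third-party and first-party sections, ruff keeps the import groups adjacent instead of separating them. A short sketch of the resulting layout (module order is illustrative):

    import os
    import time
    import ray
    from raythena.utils.config import Config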
import os import time - from raythena.utils.eventservice import EventRangeRequest, PandaJobRequest diff --git a/tests/test_pilothttp.py b/tests/test_pilothttp.py index f2c5316..5c1c4b3 100644 --- a/tests/test_pilothttp.py +++ b/tests/test_pilothttp.py @@ -1,6 +1,5 @@ import os import time - import pytest import requests from raythena.actors.payloads.eventservice.pilothttp import PilotHttpPayload diff --git a/tests/test_ray_utils.py b/tests/test_ray_utils.py index 34e1f1b..3c70875 100644 --- a/tests/test_ray_utils.py +++ b/tests/test_ray_utils.py @@ -1,5 +1,4 @@ import socket - import pytest from raythena.utils.ray import ( build_nodes_resource_list, From 77ff2f5755cb5d71010154c93ac8d3abbc960e00 Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 15:34:59 -0700 Subject: [PATCH 09/14] fix line size --- .flake8 | 6 -- pyproject.toml | 10 +-- src/raythena/actors/esworker.py | 6 +- .../actors/payloads/eventservice/pilothttp.py | 11 ++-- src/raythena/drivers/esdriver.py | 64 +++++++++++-------- src/raythena/utils/bookkeeper.py | 58 ++++++++++------- tests/conftest.py | 18 ++++-- 7 files changed, 101 insertions(+), 72 deletions(-) delete mode 100644 .flake8 diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 5f6aab9..0000000 --- a/.flake8 +++ /dev/null @@ -1,6 +0,0 @@ -[flake8] -ignore = C901,E262,E266,N804,W504,E251,ANN101,ANN002,ANN003,ANN201,ANN204 -show-source = true -max-line-length = 160 -select=A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z -max-complexity = 15 diff --git a/pyproject.toml b/pyproject.toml index ea3c1ca..0eef6f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ dependencies = [ [tool.ruff] -line-length = 80 +line-length = 120 indent-width = 4 [tool.ruff.lint] @@ -65,10 +65,10 @@ select = [ "I" ] -ignore = [ - # pycodestyle - "E501", -] +# ignore = [ +# # pycodestyle +# "E501", +# ] [tool.ruff.lint.isort] no-lines-before = ["third-party", "first-party", "standard-library"] diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py index a2987cc..55c7ac1 100644 --- a/src/raythena/actors/esworker.py +++ b/src/raythena/actors/esworker.py @@ -228,7 +228,8 @@ def modify_job(self, job: PandaJob) -> PandaJob: f"--inputEVNTFile={in_files} -", cmd, ) - # convert args of the form --outputHITSFile=HITS.30737678._[011001,...].pool.root to --outputHITSFile=HITS.30737678._011001.pool.root + # convert args of the form --outputHITSFile=HITS.30737678._[011001,...].pool.root + # to --outputHITSFile=HITS.30737678._011001.pool.root match = re.findall( r"--outputHITSFile=([0-9A-Z._]+)\[([0-9,]+)\](.pool.root)", cmd ) @@ -275,7 +276,8 @@ def stagein(self) -> None: Postconditions: - The worker is in the READY_FOR_EVENTS state. Raises: - StageInFailed: If creating / moving to the work directory fails or the call to the payload stage-in raises an exception. + StageInFailed: If creating / moving to the work directory fails or the call to the payload + stage-in raises an exception. 
""" self.payload_job_dir = os.path.join(self.workdir, self.job["PandaID"]) if not os.path.isdir(self.payload_job_dir): diff --git a/src/raythena/actors/payloads/eventservice/pilothttp.py b/src/raythena/actors/payloads/eventservice/pilothttp.py index d7115d0..f4d9d22 100644 --- a/src/raythena/actors/payloads/eventservice/pilothttp.py +++ b/src/raythena/actors/payloads/eventservice/pilothttp.py @@ -196,12 +196,15 @@ def _build_pilot_command(self) -> str: raise FailedPayload(self.worker_id) queue_escaped = shlex.quote(self.config.payload["pandaqueue"]) - cmd += f"{shlex.quote(pilotwrapper_bin)} --localpy --piloturl local -q {queue_escaped} -r {queue_escaped} -s {queue_escaped} " + cmd += (f"{shlex.quote(pilotwrapper_bin)} --localpy --piloturl local " + f"-q {queue_escaped} -r {queue_escaped} -s {queue_escaped} " + ) cmd += "--pilotversion 3 --pythonversion 3 " cmd += ( - f"-i PR -j {prod_source_label} --container --mute --pilot-user=atlas -t -u --es-executor-type=raythena -v 1 " + f"-i PR -j {prod_source_label} --container --mute --pilot-user=atlas -t -u " + f"--es-executor-type=raythena -v 1 " f"-d --cleanup=False -w generic --use-https False --allow-same-user=False --resource-type MCORE " f"--hpc-resource {shlex.quote(self.config.payload['hpcresource'])};" ) @@ -377,8 +380,8 @@ def fetch_ranges_update(self) -> Optional[Mapping[str, str]]: def should_request_more_ranges(self) -> bool: """ Checks if the payload is ready to receive more event ranges. If false is returned, then the payload is - not expecting to have more ranges assigned to it by calling submit_new_ranges. If this method ever returns false, - then any future to it will return false as well. + not expecting to have more ranges assigned to it by calling submit_new_ranges. + If this method ever returns false, then any future to it will return false as well. Event ranges submitted after this method returns false will be ignored and never sent to the pilot process. Returns: diff --git a/src/raythena/drivers/esdriver.py b/src/raythena/drivers/esdriver.py index 238bf6e..686de51 100644 --- a/src/raythena/drivers/esdriver.py +++ b/src/raythena/drivers/esdriver.py @@ -52,11 +52,13 @@ class ESDriver(BaseDriver): """ The driver is managing all the ray workers and handling the communication with Harvester. It keeps tracks of - which event ranges is assigned to which actor using a BookKeeper instance which provides the interface to read and update the status of each event range. + which event ranges is assigned to which actor using a BookKeeper instance which provides the interface to read + and update the status of each event range. - It will also send requests for jobs, event ranges or update of produced output to harvester by using a communicator instance. - The communicator uses the shared file system to communicate with Harvester and does I/O in a separate thread, - communication between the driver and the communicator is done by message passing using a queue. + It will also send requests for jobs, event ranges or update of produced output to harvester by + using a communicator instance. The communicator uses the shared file system to communicate with + Harvester and does I/O in a separate thread, communication between the driver and the communicator + is done by message passing using a queue. 
The driver is starting one actor per node in the ray cluster except for the ray head node which doesn't execute any worker @@ -217,7 +219,8 @@ def __getitem__(self, key: str) -> ESWorker: def start_actors(self) -> None: """ - Initialize actor communication by performing the first call to get_message() and add the future to the future list. + Initialize actor communication by performing the first call to get_message() and + add the future to the future list. Returns: None @@ -227,8 +230,8 @@ def start_actors(self) -> None: def create_actors(self) -> None: """ - Create actors on each node. Before creating an actor, the driver tries to assign it a job and an initial batch of - event ranges. This avoid having all actors requesting jobs and event ranges at the start of the job. + Create actors on each node. Before creating an actor, the driver tries to assign it a job and an initial + batch of event ranges. This avoid having all actors requesting jobs and event ranges at the start of the job. Returns: None @@ -270,7 +273,8 @@ def retrieve_actors_messages( ) -> Iterator[WorkerResponse]: """ Given a list of ready futures from actors, unwrap them and return an interable over the result of each future. - In case one of the futures raised an exception, the exception is handled by this function and not propagated to the caller. + In case one of the futures raised an exception, the exception is handled by this function and + not propagated to the caller. Args: ready: a list of read futures @@ -281,7 +285,8 @@ def retrieve_actors_messages( try: messages = ray.get(ready) except Exception: - # if any of the future raised an exception, we need to handle them one by one to know which one produced the exception. + # if any of the future raised an exception, we need to handle them one by one + # to know which one produced the exception. for r in ready: try: actor_id, message, data = ray.get(r) @@ -305,7 +310,8 @@ def enqueue_actor_call(self, actor_id: str, future: ObjectRef): def handle_actors(self) -> None: """ - Main function handling messages from all ray actors and dispatching to the appropriate handling function according to the message returned by the actor, + Main function handling messages from all ray actors and dispatching to the appropriate handling + function according to the message returned by the actor, Returns: None @@ -343,16 +349,14 @@ def wait_on_messages(self) -> tuple[list[ObjectRef], list[ObjectRef]]: """ Wait on part of the pending futures to complete. Wait for 1 second trying to fetch half of the pending futures. If no futures are ready, then wait another second to fetch a tenth of the pending futures. - If there are still no futures ready, then wait for the timeout interval or until one future is ready. If this is the beginning of the job, i.e. no - events have finished processing yet, then wait forever until one future is ready instead of only timeout interval. + If there are still no futures ready, then wait for the timeout interval or until one future is ready. + If this is the beginning of the job, i.e. no events have finished processing yet, then wait forever until + one future is ready instead of only timeout interval. 
Returns: tuple of a list of completed futures and a list of pending futures, respectively """ - if self.bookKeeper.have_finished_events(): - timeoutinterval = self.timeoutinterval - else: - timeoutinterval = None + timeoutinterval = self.timeoutinterval if self.bookKeeper.have_finished_events() else None messages, queue = ray.wait( self.actors_message_queue, @@ -435,9 +439,10 @@ def handle_request_event_ranges( the number of events returned in a single request is capped to the number of local events divided by the number of actors. This cap is updated every time new events are retrieved from Harvester. - If the driver doesn't have enough events to send to the actor, then it will initiate or wait on a pending event request to Harvester to get more events. - It will only return less events than the request number (or cap) if Harvester returns no events. - Requests to Harvester are skipped if it was flagged as not having any events left for the current actor's job. + If the driver doesn't have enough events to send to the actor, then it will initiate or wait on a pending + event request to Harvester to get more events. It will only return less events than the request number (or cap) + if Harvester returns no events. Requests to Harvester are skipped if it was flagged as not having any events + left for the current actor's job. Args: actor_id: worker sending the event ranges update @@ -624,7 +629,8 @@ def run(self) -> None: Method used to start the driver, initializing actors, retrieving initial job and event ranges, creates job subdir then handle actors until they are all done or stop() has been called This function will also create a directory in config.ray.workdir for the retrieved job - with the directory name being the PandaID. Workers will then each create their own subdirectory in that job directory. + with the directory name being the PandaID. Workers will then each create their own + subdirectory in that job directory. Returns: None @@ -841,8 +847,8 @@ def stop(self) -> None: def handle_actor_exception(self, actor_id: str, ex: Exception) -> None: """ Handle exception that occurred in an actor process. Log the exception and count the number of exceptions - that were produced by the same actor. If the number of exceptions is greater than the threshold, the driver will simply drop the actor - by no longer calling remote functions on it. + that were produced by the same actor. If the number of exceptions is greater than the threshold, + the driver will simply drop the actor by no longer calling remote functions on it. Args: actor_id: the actor that raised the exception @@ -890,10 +896,12 @@ def get_output_file_guid(self, job_report_file) -> Optional[str]: def handle_merge_transforms(self, wait_for_completion=False) -> bool: """ Checks if the bookkeeper has files ready to be merged. If so, subprocesses for merge tasks are started. - After starting any subprocess, go through all the running subprocess and poll then to check if any completed and report status to the bookkeepr. + After starting any subprocess, go through all the running subprocess and poll then to check + if any completed and report status to the bookkeepr. 
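The wait_on_messages change folds the four-line branch into a single conditional expression; both forms assign the same value. A sketch with placeholder names:

    finished_events = False  # placeholder flag
    default_timeout = 300.0  # placeholder value, in seconds

    # Before: explicit if/else.
    if finished_events:
        timeout = default_timeout
    else:
        timeout = None

    # After: one conditional expression.
    timeout = default_timeout if finished_events else None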
Args: - wait_for_completion: Wait for all the subprocesses (including those started by this call) to finish before returning + wait_for_completion: Wait for all the subprocesses + (including those started by this call) to finish before returning Returns: True if new merge jobs were created @@ -1020,7 +1028,10 @@ def hits_merge_transform( if self.config.payload["containerextrasetup"].strip().endswith(";") else ";" ) - container_script = f"{self.config.payload['containerextrasetup']}{endtoken}{self.merge_transform} {transform_params}" + container_script = ( + f"{self.config.payload['containerextrasetup']}{endtoken}" + f"{self.merge_transform} {transform_params}" + ) merge_script_path = os.path.join(tmp_dir, "merge_transform.sh") with open(merge_script_path, "w") as f: f.write(container_script) @@ -1062,7 +1073,8 @@ def hits_merge_transform( ) cmd += ( f"{self.config.payload['containerextraargs']}{endtoken}" - f"source ${{ATLAS_LOCAL_ROOT_BASE}}/user/atlasLocalSetup.sh --swtype {self.config.payload['containerengine']}" + f"source ${{ATLAS_LOCAL_ROOT_BASE}}/user/atlasLocalSetup.sh" + f" --swtype {self.config.payload['containerengine']}" f" -c $thePlatform -s /srv/release_setup.sh -r /srv/merge_transform.sh -e \"{self.container_options}\";" f"RETURN_VAL=$?;if [ \"$RETURN_VAL\" -eq 0 ]; then cp jobReport.json {job_report_name};fi;exit $RETURN_VAL;" ) diff --git a/src/raythena/utils/bookkeeper.py b/src/raythena/utils/bookkeeper.py index b5bc3d1..a84a47c 100644 --- a/src/raythena/utils/bookkeeper.py +++ b/src/raythena/utils/bookkeeper.py @@ -30,15 +30,19 @@ class TaskStatus: Utility class which manages the persistancy to file of the progress on a given Panda task. All operations (set_eventrange_simulated, set_eventrange_failed, set_file_merged) are lazy. - They will only enqueue a message which will only be processed just before writting the status to disk in save_status. - The reason for this design is that save_status and the update operations are supposed to be called by different threads and would - therefore add synchronization overhead and latency for the main driver thread responsible for polling actors. Having a single thread - updating and serializing the dictionary eliminate the need for synchronization however it also means that other thread reading the dictionary - (e.g. from get_nsimulated) will get out of date information as there will most likely be update pending in the queue at any point in time + They will only enqueue a message which will only be processed just before writting + the status to disk in save_status. The reason for this design is that save_status and the update + operations are supposed to be called by different threads and would therefore add synchronization overhead + and latency for the main driver thread responsible for polling actors. Having a single thread updating and + serializing the dictionary eliminate the need for synchronization, however it also means that other thread + reading the dictionary (e.g. 
from get_nsimulated) will get out of date information as there will most + likely be update pending in the queue at any point in time Keys set relation of each sub-dictionnary (simulated, merged, failed, merging): - - merged and merging key sets are disjoints -- when a file has been fully merged, its entry is removed from merging and moved into merged - - merged and simulated key sets are disjoints -- when a file has been fully merged, it is no longer necessary to keep track of individual event ranges; + - merged and merging key sets are disjoints -- when a file has been fully merged, + its entry is removed from merging and moved into merged + - merged and simulated key sets are disjoints -- when a file has been fully merged, + it is no longer necessary to keep track of individual event ranges; they are removed from simulated - merging is a subset of simulated -- it is possible for events from a given file to have been simulated but no merge job has completed for that specific file. @@ -123,8 +127,9 @@ def _restore_status(self): def save_status(self, write_to_tmp=True, force_update=False): """ - Save the current status to a json file. Before saving to file, the update queue will be drained, actually carrying out the operations to the dictionary - that will be written to file. + Save the current status to a json file. Before saving to file, + the update queue will be drained, actually carrying out + the operations to the dictionary that will be written to file. Args: write_to_tmp: if true, the json data will be written to a temporary file then renamed to the final file @@ -323,7 +328,8 @@ def get_nsimulated(self, filename=None) -> int: Total number of event ranges that have been simulated but not yet merged. Args: - filename: if none, returns the total number of simulated events. If specified, returns the number of events simulated for that specific file + filename: if none, returns the total number of simulated events. + If specified, returns the number of events simulated for that specific file Returns: the number of events simulated @@ -357,7 +363,8 @@ def get_nfailed(self, filename=None) -> int: Total number of event ranges that have failed. Args: - filename: if none, returns the total number of failed events. If specified, returns the number of events failed for that specific file + filename: if none, returns the total number of failed events. + If specified, returns the number of events failed for that specific file Returns: the number of events failed @@ -423,7 +430,8 @@ def __init__(self, config: Config) -> None: ) # Event ranges for a given input file which have been simulated and a ready to be merged self.ranges_to_merge: dict[str, list[tuple[str, EventRange]]] = dict() - # Accumulate event ranges of different input files into the same output file until we have enough to produce a merged file + # Accumulate event ranges of different input files into the same output file until + # we have enough to produce a merged file. 
# Only used when multiple input files are merged in a single output (n-1) to pool input files together self.output_merge_queue: dict[str, list[tuple[str, EventRange]]] = ( dict() @@ -489,8 +497,8 @@ def save_status(self): def check_mergeable_files(self): """ - Goes through the current task status, checks if a file has been entierly processed (event ranges all simulated or failed) and - if so adds the file to self.files_ready_to_merge + Goes through the current task status, checks if a file has been entierly processed + (event ranges all simulated or failed) and if so adds the file to self.files_ready_to_merge """ if self._hits_per_file >= self._events_per_file: self._check_mergeable_files_n_1() @@ -553,8 +561,9 @@ def start_threads(self): def add_jobs(self, jobs: Mapping[str, JobDef], start_threads=True) -> None: """ - Register new jobs. Event service jobs will not be assigned to worker until event ranges are added to the job. - This will also automatically start the thread responsible for saving the task status to file if the parameter start_save_thread is True. + Register new jobs. Event service jobs will not be assigned to worker until + event ranges are added to the job. This will also automatically start the thread + responsible for saving the task status to file if the parameter start_save_thread is True. If the thread is started, it must be stopped with stop_save_thread before exiting the application Args: @@ -735,7 +744,8 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): # Second pass handling only merged, simulated and not processed files for file, guid in zip(files, guids): - # if all the event ranges in the input file have been merge, or the file was declared as failed in the first pass, move to the next + # if all the event ranges in the input file have been merge, + # or the file was declared as failed in the first pass, move to the next if file in merged_files or file in failed_input_files: continue file_simulated_ranges = simulated_ranges.get(file) @@ -845,9 +855,10 @@ def assign_job_to_actor(self, actor_id: str) -> Optional[PandaJob]: def fetch_event_ranges(self, actor_id: str, n: int) -> list[EventRange]: """ - Retrieve event ranges for an actor. The specified actor should have a job assigned from assign_job_to_actor() or an empty list will be returned. - If the job assigned to the actor doesn't have enough range currently available, it will assign all of its remaining anges - to the worker without trying to get new ranges from harvester. + Retrieve event ranges for an actor. The specified actor should have + a job assigned from assign_job_to_actor() or an empty list will be returned. + If the job assigned to the actor doesn't have enough range currently available, + it will assign all of its remaining ranges to the worker without trying to get new ranges from harvester. Args: actor_id: actor requesting event ranges @@ -911,8 +922,8 @@ def process_event_ranges_update( ], ): """ - Process the event ranges update sent by the worker. This will update the status of event ranges in the update as well as building - the list of event ranges to be tarred up for each input file. + Process the event ranges update sent by the worker. This will update the status of event ranges + in the update as well as building the list of event ranges to be tarred up for each input file. 
Args: actor_id: actor worker_id that sent the update @@ -1051,7 +1062,8 @@ def is_flagged_no_more_events(self, panda_id: str) -> bool: """ Checks if a job can still receive more event ranges from harvester. This function returning Trued doesn't guarantee that Harvester has more events available, - only that it may or may not have more events available. If false is returned, Harvester doesn't have more events available + only that it may or may not have more events available. + If false is returned, Harvester doesn't have more events available Args: panda_id: job worker_id to check diff --git a/tests/conftest.py b/tests/conftest.py index a17b52d..2497928 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -222,12 +222,15 @@ def sample_multijobs( "destinationDblock": job_name, "dispatchDBlockToken": "NULL", "jobPars": ( - f'--eventService={str(is_eventservice)} --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' + f"--eventService={str(is_eventservice)} --skipEvents=0 --firstEvent=1 " + '--preExec "from AthenaCommon.DetFlags ' "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " - "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py " - "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " + "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py," + "SimulationJobOptions/preInclude.BeamPipeKill.py " + "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT " + "--randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " f"--maxEvents=-1 --inputEvgenFile {inFiles} --outputHitsFile HITS_{outFilesShort}.pool.root)" ), "attemptNr": 0, @@ -313,12 +316,15 @@ def sample_job( "destinationDblock": job_name, "dispatchDBlockToken": "NULL", "jobPars": ( - f'--eventService={str(is_eventservice)} --skipEvents=0 --firstEvent=1 --preExec "from AthenaCommon.DetFlags ' + f"--eventService={str(is_eventservice)} --skipEvents=0 --firstEvent=1 " + '--preExec "from AthenaCommon.DetFlags ' "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' "--athenaopts=--preloadlib=${ATLASMKLLIBDIR_PRELOAD}/libimf.so " - "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py " - "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT --randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " + "--preInclude sim:SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py," + "SimulationJobOptions/preInclude.BeamPipeKill.py " + "--geometryVersion ATLAS-R2-2016-01-00-00_VALIDATION --physicsList QGSP_BERT " + "--randomSeed 1234 --conditionsTag OFLCOND-MC12-SIM-00 " f"--maxEvents=-1 --inputEvgenFile {inFiles} --outputHitsFile HITS_{outFilesShort}.pool.root)" ), "attemptNr": 0, From f18b61da496afee085d909af2245e19ffb68d8e6 Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 15:36:14 -0700 Subject: [PATCH 10/14] format --- bin/validate-raythena-job.py | 19 +- example/standalone_ray_test_hello_world.py | 20 +- src/raythena/actors/esworker.py | 127 ++------ .../actors/payloads/eventservice/esPayload.py | 4 +- .../actors/payloads/eventservice/pilothttp.py | 115 ++----- 
.../drivers/communicators/baseCommunicator.py | 4 +- .../communicators/harvesterFileMessenger.py | 65 +--- .../drivers/communicators/harvesterMock.py | 12 +- .../communicators/harvesterMock2205.py | 4 +- src/raythena/drivers/esdriver.py | 283 +++++------------- src/raythena/scripts/raythena.py | 28 +- src/raythena/utils/bookkeeper.py | 213 ++++--------- src/raythena/utils/config.py | 12 +- src/raythena/utils/eventservice.py | 74 ++--- src/raythena/utils/exception.py | 12 +- src/raythena/utils/logging.py | 4 +- src/raythena/utils/ray.py | 8 +- src/raythena/utils/timing.py | 12 +- tests/conftest.py | 15 +- tests/harvester/conftest.py | 8 +- .../harvester/test_harvesterFileMessenger.py | 33 +- tests/harvester/test_harvesterMock.py | 8 +- tests/test_bookkeeper.py | 35 +-- tests/test_eventservice.py | 94 ++---- tests/test_pilothttp.py | 39 +-- tests/test_taskstatus.py | 48 +-- 26 files changed, 311 insertions(+), 985 deletions(-) diff --git a/bin/validate-raythena-job.py b/bin/validate-raythena-job.py index 0f1cf92..5a373e0 100644 --- a/bin/validate-raythena-job.py +++ b/bin/validate-raythena-job.py @@ -24,18 +24,12 @@ def validate_job(job_dir, job_state_file): with open(job_state_file) as f: job_state = json.load(f) merged_input_files = job_state["merged"] - merged_output_files = set( - [list(x.keys())[0] for x in merged_input_files.values()] - ) + merged_output_files = set([list(x.keys())[0] for x in merged_input_files.values()]) event_numbers = set() for output_file in merged_output_files: output_file_abs = path.join(job_dir, "final", output_file) if not path.isfile(output_file_abs): - print( - "Expected file " - + output_file_abs - + " to be present in the job directory" - ) + print("Expected file " + output_file_abs + " to be present in the job directory") exit(1) current_event_numbers = get_event_numbers(output_file_abs) @@ -45,16 +39,11 @@ def validate_job(job_dir, job_state_file): "Duplicate events in file " + output_file + "(" - + str( - len(current_event_numbers) - - len(unique_current_event_numbers) - ) + + str(len(current_event_numbers) - len(unique_current_event_numbers)) + "): " ) exit(1) - print( - str(len(current_event_numbers)) + " events in file " + output_file - ) + print(str(len(current_event_numbers)) + " events in file " + output_file) if not unique_current_event_numbers.isdisjoint(event_numbers): print( "Found duplicate events in file " diff --git a/example/standalone_ray_test_hello_world.py b/example/standalone_ray_test_hello_world.py index 2c4e2ea..ee827a1 100755 --- a/example/standalone_ray_test_hello_world.py +++ b/example/standalone_ray_test_hello_world.py @@ -32,9 +32,7 @@ def __init__(self) -> None: self.pid = os.getpid() self.hostname = platform.node() self.ip = ray._private.services.get_node_ip_address() - print( - f"Initial message from PID - {self.pid} Running on host - {self.hostname} {self.ip}" - ) + print(f"Initial message from PID - {self.pid} Running on host - {self.hostname} {self.ip}") def ping(self): print(f"{self.pid} {self.hostname} {self.ip} - ping") @@ -78,18 +76,10 @@ def main(redis_ip: str, redis_port: str, redis_password: str): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Wait on ray head node or workers to connect" - ) - parser.add_argument( - "--redis-ip", default="{}".format(os.environ["RAYTHENA_RAY_HEAD_IP"]) - ) - parser.add_argument( - "--redis-port", default="{}".format(os.environ["RAYTHENA_RAY_REDIS_PORT"]) - ) - parser.add_argument( - "--redis-password", default=os.environ["RAYTHENA_RAY_REDIS_PASSWORD"] - 
) + parser = argparse.ArgumentParser(description="Wait on ray head node or workers to connect") + parser.add_argument("--redis-ip", default="{}".format(os.environ["RAYTHENA_RAY_HEAD_IP"])) + parser.add_argument("--redis-port", default="{}".format(os.environ["RAYTHENA_RAY_REDIS_PORT"])) + parser.add_argument("--redis-password", default=os.environ["RAYTHENA_RAY_REDIS_PASSWORD"]) args = parser.parse_args() print(f"args : {args}") main(args.redis_ip, args.redis_port, args.redis_password) diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py index 55c7ac1..60bbfaf 100644 --- a/src/raythena/actors/esworker.py +++ b/src/raythena/actors/esworker.py @@ -10,7 +10,6 @@ from time import sleep from typing import Any, Optional, Union import ray -# from raythena.utils.timing import CPUMonitor from raythena.actors.payloads.basePayload import BasePayload from raythena.actors.payloads.eventservice.esPayload import ESPayload from raythena.actors.payloads.eventservice.pilothttp import PilotHttpPayload @@ -56,18 +55,12 @@ class ESWorker: """ READY_FOR_JOB = 0 # initial state, before the first job request - JOB_REQUESTED = ( - 1 # job has been requested to the driver, waiting for result - ) + JOB_REQUESTED = 1 # job has been requested to the driver, waiting for result READY_FOR_EVENTS = 2 # ready to request new events for the current job - EVENT_RANGES_REQUESTED = ( - 3 # event ranges have been requested to the driver, waiting for result - ) + EVENT_RANGES_REQUESTED = 3 # event ranges have been requested to the driver, waiting for result FINISHING_LOCAL_RANGES = 4 # do not request additional ranges, will move to STAGE_OUT once local cache is empty PROCESSING = 5 # currently processing event ranges - FINISHING = ( - 6 # Performing cleanup of resources, preparing final server update - ) + FINISHING = 6 # Performing cleanup of resources, preparing final server update DONE = 7 # Actor has finished processing job STAGE_IN = 8 # Staging-in data. 
STAGE_OUT = 9 # Staging-out data @@ -134,24 +127,16 @@ def __init__( self.payload_actor_process_dir = None self.actor_ray_logs_dir = None self.cpu_monitor = None - self.workdir = os.path.expandvars( - self.config.ray.get("workdir", os.getcwd()) - ) + self.workdir = os.path.expandvars(self.config.ray.get("workdir", os.getcwd())) if not os.path.isdir(self.workdir): self.workdir = os.getcwd() self.output_dir = self.config.ray.get("outputdir") - self.pilot_kill_file = os.path.expandvars( - self.config.payload.get("pilotkillfile", "pilot_kill_payload") - ) + self.pilot_kill_file = os.path.expandvars(self.config.payload.get("pilotkillfile", "pilot_kill_payload")) self.pilot_kill_time = self.config.payload.get("pilotkilltime", 600) self.time_monitor_file = os.path.expandvars( - self.config.payload.get( - "timemonitorfile", "RaythenaTimeMonitor.txt" - ) - ) - self.payload: Union[BasePayload, ESPayload] = PilotHttpPayload( - self.id, self.config + self.config.payload.get("timemonitorfile", "RaythenaTimeMonitor.txt") ) + self.payload: Union[BasePayload, ESPayload] = PilotHttpPayload(self.id, self.config) self.start_time = -1 self.time_limit = -1 self.elapsed = 1 @@ -170,12 +155,7 @@ def check_time(self) -> None: """ while True: curtime = datetime.datetime.now() - time_elapsed = ( - curtime.hour * 3600 - + curtime.minute * 60 - + curtime.second - - self.start_time - ) + time_elapsed = curtime.hour * 3600 + curtime.minute * 60 + curtime.second - self.start_time if time_elapsed <= 0: time_elapsed = 24 * 3600 + time_elapsed if time_elapsed // 300 >= self.elapsed: @@ -184,13 +164,9 @@ def check_time(self) -> None: if self.config.logging.get("copyraylogs", False): if os.path.isdir(self.actor_ray_logs_dir): shutil.rmtree(self.actor_ray_logs_dir) - shutil.copytree( - self.session_log_dir, self.actor_ray_logs_dir - ) + shutil.copytree(self.session_log_dir, self.actor_ray_logs_dir) except Exception as e: - self._logger.warning( - f"Failed to copy ray logs to actor directory: {e}" - ) + self._logger.warning(f"Failed to copy ray logs to actor directory: {e}") if time_elapsed > self.time_limit - self.pilot_kill_time: with open(self.pilot_kill_file, "w") as f: f.write("KILL") @@ -217,9 +193,7 @@ def modify_job(self, job: PandaJob) -> PandaJob: if len(input_evnt_file) != 1: return job in_files = [ - os.path.join( - os.path.expandvars(self.config.harvester["endpoint"]), x - ) + os.path.join(os.path.expandvars(self.config.harvester["endpoint"]), x) for x in input_evnt_file[0].split(",") ] in_files = ",".join(in_files[0:1]) @@ -230,9 +204,7 @@ def modify_job(self, job: PandaJob) -> PandaJob: ) # convert args of the form --outputHITSFile=HITS.30737678._[011001,...].pool.root # to --outputHITSFile=HITS.30737678._011001.pool.root - match = re.findall( - r"--outputHITSFile=([0-9A-Z._]+)\[([0-9,]+)\](.pool.root)", cmd - ) + match = re.findall(r"--outputHITSFile=([0-9A-Z._]+)\[([0-9,]+)\](.pool.root)", cmd) if match: match_tuple = match[0] prefix = match_tuple[0] @@ -245,15 +217,9 @@ def modify_job(self, job: PandaJob) -> PandaJob: cmd, ) - job_number = ( - max(int(job["attemptNr"]) - 1, 0) * self.actor_count - + self.actor_no - + 1 - ) + job_number = max(int(job["attemptNr"]) - 1, 0) * self.actor_count + self.actor_no + 1 if "--jobNumber=" in cmd: - cmd = re.sub( - r"--jobNumber=[0-9]+", f"--jobNumber={job_number}", cmd - ) + cmd = re.sub(r"--jobNumber=[0-9]+", f"--jobNumber={job_number}", cmd) else: cmd = f"{cmd} --jobNumber={job_number} " @@ -281,40 +247,22 @@ def stagein(self) -> None: """ self.payload_job_dir = 
os.path.join(self.workdir, self.job["PandaID"]) if not os.path.isdir(self.payload_job_dir): - self._logger.warning( - f"Specified path {self.payload_job_dir} does not exist. Using cwd {os.getcwd()}" - ) + self._logger.warning(f"Specified path {self.payload_job_dir} does not exist. Using cwd {os.getcwd()}") self.payload_job_dir = self.workdir subdir = f"{self.id}" - self.payload_actor_process_dir = os.path.join( - self.payload_job_dir, subdir - ) - self.payload_actor_output_dir = os.path.join( - self.payload_job_dir, subdir, "esOutput" - ) - self.actor_ray_logs_dir = os.path.join( - self.payload_actor_process_dir, "ray_logs" - ) + self.payload_actor_process_dir = os.path.join(self.payload_job_dir, subdir) + self.payload_actor_output_dir = os.path.join(self.payload_job_dir, subdir, "esOutput") + self.actor_ray_logs_dir = os.path.join(self.payload_actor_process_dir, "ray_logs") try: with open(os.path.join(self.workdir, self.time_monitor_file)) as time_limit_monitor: start_time = time_limit_monitor.readline().split(":") - self.start_time = ( - int(start_time[0]) * 3600 - + int(start_time[1]) * 60 - + int(start_time[2]) - ) + self.start_time = int(start_time[0]) * 3600 + int(start_time[1]) * 60 + int(start_time[2]) time_limit = time_limit_monitor.readline().split(":") if len(time_limit) < 3: time_limit = ["0"] + time_limit - self.time_limit = ( - int(time_limit[0]) * 3600 - + int(time_limit[1]) * 60 - + int(time_limit[2]) - ) - timer_thread = threading.Thread( - name="timer", target=self.check_time, daemon=True - ) + self.time_limit = int(time_limit[0]) * 3600 + int(time_limit[1]) * 60 + int(time_limit[2]) + timer_thread = threading.Thread(name="timer", target=self.check_time, daemon=True) timer_thread.start() except Exception as e: self._logger.warning(f"Failed to setup timer thread: {e}") @@ -348,11 +296,7 @@ def stagein(self) -> None: except Exception as e: self._logger.warning(f"Failed to stagein payload: {e}") raise StageInFailed(self.id) from e - self.transition_state( - ESWorker.READY_FOR_EVENTS - if self.is_event_service_job() - else ESWorker.PROCESSING - ) + self.transition_state(ESWorker.READY_FOR_EVENTS if self.is_event_service_job() else ESWorker.PROCESSING) def stageout(self) -> None: """ @@ -447,9 +391,7 @@ def mark_new_job(self) -> WorkerResponse: self.transition_state(ESWorker.JOB_REQUESTED) return self.return_message(Messages.REQUEST_NEW_JOB) - def receive_event_ranges( - self, reply: int, event_ranges: Sequence[EventRange] - ) -> WorkerResponse: + def receive_event_ranges(self, reply: int, event_ranges: Sequence[EventRange]) -> WorkerResponse: """ Sends event ranges to be processed by the worker. Update the PFN of event ranges to an absolute path if it is a relative path. If no ranges are provided, the worker will not expect any more ranges in the future and @@ -540,9 +482,7 @@ def should_request_ranges(self) -> bool: self.transition_state(ESWorker.READY_FOR_EVENTS) return res - def stageout_event_service_files( - self, ranges_update: Mapping[str, str] - ) -> Optional[EventRangeUpdate]: + def stageout_event_service_files(self, ranges_update: Mapping[str, str]) -> Optional[EventRangeUpdate]: """ Move the HITS files reported by the pilot payload. Files are moved from the Athena work directory to the worker-specific output directory. 
@@ -576,15 +516,11 @@ def stageout_event_service_files( try: os.replace(cfile, dst) except OSError as e: - self._logger.error( - f"Failed to move file {cfile} to {dst}: errno {e.errno}: {e.strerror}" - ) + self._logger.error(f"Failed to move file {cfile} to {dst}: errno {e.errno}: {e.strerror}") raise StageOutFailed(self.id) from e range_update[cfile_key] = dst else: - self._logger.warning( - f"Couldn't stageout file {cfile} as it doesn't exist" - ) + self._logger.warning(f"Couldn't stageout file {cfile} as it doesn't exist") raise StageOutFailed(self.id) return ranges @@ -599,9 +535,7 @@ def get_payload_message(self) -> Optional[WorkerResponse]: ranges_update = self.payload.fetch_ranges_update() if ranges_update: ranges_update = self.stageout_event_service_files(ranges_update) - return self.return_message( - Messages.UPDATE_EVENT_RANGES, ranges_update - ) + return self.return_message(Messages.UPDATE_EVENT_RANGES, ranges_update) job_update = self.payload.fetch_job_update() if job_update: @@ -644,8 +578,7 @@ def get_message(self) -> WorkerResponse: self.stageout() return self.return_message(Messages.PROCESS_DONE) elif self.is_event_service_job() and ( - self.state == ESWorker.READY_FOR_EVENTS - or self.should_request_ranges() + self.state == ESWorker.READY_FOR_EVENTS or self.should_request_ranges() ): req = EventRangeRequest() req.add_event_request( @@ -655,9 +588,7 @@ def get_message(self) -> WorkerResponse: self.job["jobsetID"], ) self.transition_state(ESWorker.EVENT_RANGES_REQUESTED) - return self.return_message( - Messages.REQUEST_EVENT_RANGES, req - ) + return self.return_message(Messages.REQUEST_EVENT_RANGES, req) elif self.state == ESWorker.DONE: return self.return_message(Messages.PROCESS_DONE) else: diff --git a/src/raythena/actors/payloads/eventservice/esPayload.py b/src/raythena/actors/payloads/eventservice/esPayload.py index 4ac4328..2de9d17 100644 --- a/src/raythena/actors/payloads/eventservice/esPayload.py +++ b/src/raythena/actors/payloads/eventservice/esPayload.py @@ -22,9 +22,7 @@ def __init__(self, worker_id: str, config: Config): super().__init__(worker_id, config) @abstractmethod - def submit_new_ranges( - self, event_ranges: Optional[Sequence[EventRange]] - ) -> None: + def submit_new_ranges(self, event_ranges: Optional[Sequence[EventRange]]) -> None: """ Submit a new list of event ranges to the payload. 
The event ranges should be saved until is can be processed diff --git a/src/raythena/actors/payloads/eventservice/pilothttp.py b/src/raythena/actors/payloads/eventservice/pilothttp.py index f4d9d22..98f8679 100644 --- a/src/raythena/actors/payloads/eventservice/pilothttp.py +++ b/src/raythena/actors/payloads/eventservice/pilothttp.py @@ -115,22 +115,12 @@ def __init__(self, worker_id: str, config: Config) -> None: self.router.register("/", self.handle_get_job) self.router.register("/server/panda/getJob", self.handle_get_job) self.router.register("/server/panda/updateJob", self.handle_update_job) - self.router.register( - "/server/panda/updateWorkerPilotStatus", self.handle_update_job - ) - self.router.register( - "/server/panda/updateJobsInBulk", self.handle_update_jobs_in_bulk - ) + self.router.register("/server/panda/updateWorkerPilotStatus", self.handle_update_job) + self.router.register("/server/panda/updateJobsInBulk", self.handle_update_jobs_in_bulk) self.router.register("/server/panda/getStatus", self.handle_get_status) - self.router.register( - "/server/panda/getEventRanges", self.handle_get_event_ranges - ) - self.router.register( - "/server/panda/updateEventRanges", self.handle_update_event_ranges - ) - self.router.register( - "/server/panda/getKeyPair", self.handle_get_key_pair - ) + self.router.register("/server/panda/getEventRanges", self.handle_get_event_ranges) + self.router.register("/server/panda/updateEventRanges", self.handle_update_event_ranges) + self.router.register("/server/panda/getKeyPair", self.handle_get_key_pair) def _start_payload(self) -> None: """ @@ -149,9 +139,7 @@ def _start_payload(self) -> None: shell=True, close_fds=True, ) - self._logger.info( - f"Pilot payload started with PID {self.pilot_process.pid}" - ) + self._logger.info(f"Pilot payload started with PID {self.pilot_process.pid}") def _build_pilot_command(self) -> str: """ @@ -169,22 +157,16 @@ def _build_pilot_command(self) -> str: extra_setup = self.config.payload.get("extrasetup", None) if extra_setup is not None: - cmd += ( - f"{extra_setup}{';' if not extra_setup.endswith(';') else ''}" - ) + cmd += f"{extra_setup}{';' if not extra_setup.endswith(';') else ''}" pilot_base = "pilot3" pilot_version = self.config.payload.get("pilotversion", "latest") - pilot_src = ( - f"/cvmfs/atlas.cern.ch/repo/sw/PandaPilot/pilot3/{pilot_version}" - ) + pilot_src = f"/cvmfs/atlas.cern.ch/repo/sw/PandaPilot/pilot3/{pilot_version}" if not os.path.isdir(pilot_src): - raise FailedPayload( - self.worker_id, f"Pilot release {pilot_src} not found" - ) + raise FailedPayload(self.worker_id, f"Pilot release {pilot_src} not found") cmd += f"ln -s {pilot_src} {os.path.join(os.getcwd(), pilot_base)};" @@ -196,8 +178,9 @@ def _build_pilot_command(self) -> str: raise FailedPayload(self.worker_id) queue_escaped = shlex.quote(self.config.payload["pandaqueue"]) - cmd += (f"{shlex.quote(pilotwrapper_bin)} --localpy --piloturl local " - f"-q {queue_escaped} -r {queue_escaped} -s {queue_escaped} " + cmd += ( + f"{shlex.quote(pilotwrapper_bin)} --localpy --piloturl local " + f"-q {queue_escaped} -r {queue_escaped} -s {queue_escaped} " ) cmd += "--pilotversion 3 --pythonversion 3 " @@ -211,21 +194,14 @@ def _build_pilot_command(self) -> str: extra_script = self.config.payload.get("extrapostpayload", None) if extra_script is not None: - cmd += ( - f"{extra_script}{';' if not extra_script.endswith(';') else ''}" - ) + cmd += f"{extra_script}{';' if not extra_script.endswith(';') else ''}" cmd_script = os.path.join(os.getcwd(), 
"payload.sh") with open(cmd_script, "w") as f: f.write(cmd) st = os.stat(cmd_script) os.chmod(cmd_script, st.st_mode | stat.S_IEXEC) - payload_log = shlex.quote( - self.config.payload.get("logfilename", "wrapper") - ) - return ( - f"/bin/bash {cmd_script} " - f"> {payload_log} 2> {payload_log}.stderr" - ) + payload_log = shlex.quote(self.config.payload.get("logfilename", "wrapper")) + return f"/bin/bash {cmd_script} " f"> {payload_log} 2> {payload_log}.stderr" def stagein(self) -> None: """ @@ -234,25 +210,15 @@ def stagein(self) -> None: """ cwd = os.getcwd() - ddm_endpoints_file = ( - "/cvmfs/atlas.cern.ch/repo/sw/local/etc/cric_ddmendpoints.json" - ) + ddm_endpoints_file = "/cvmfs/atlas.cern.ch/repo/sw/local/etc/cric_ddmendpoints.json" if os.path.isfile(ddm_endpoints_file): - os.symlink( - ddm_endpoints_file, os.path.join(cwd, "cric_ddmendpoints.json") - ) + os.symlink(ddm_endpoints_file, os.path.join(cwd, "cric_ddmendpoints.json")) - pandaqueues_file = ( - "/cvmfs/atlas.cern.ch/repo/sw/local/etc/cric_pandaqueues.json" - ) + pandaqueues_file = "/cvmfs/atlas.cern.ch/repo/sw/local/etc/cric_pandaqueues.json" if os.path.isfile(pandaqueues_file): - os.symlink( - pandaqueues_file, os.path.join(cwd, "cric_pandaqueues.json") - ) + os.symlink(pandaqueues_file, os.path.join(cwd, "cric_pandaqueues.json")) - queue_escaped = ( - "/cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_schedconf.json" - ) + queue_escaped = "/cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_schedconf.json" if os.path.isfile(queue_escaped): os.symlink(queue_escaped, os.path.join(cwd, "queuedata.json")) @@ -270,10 +236,7 @@ def is_complete(self) -> bool: Returns: False if the payload has not finished yet, True otherwise """ - return ( - self.pilot_process is not None - and self.pilot_process.poll() is not None - ) + return self.pilot_process is not None and self.pilot_process.poll() is not None def return_code(self) -> Optional[int]: """ @@ -313,27 +276,19 @@ def stop(self) -> None: self.pilot_process.terminate() pexit = self.pilot_process.wait() self._logger.debug(f"Payload return code: {pexit}") - asyncio.run_coroutine_threadsafe( - self.notify_stop_server_task(), self.loop - ) + asyncio.run_coroutine_threadsafe(self.notify_stop_server_task(), self.loop) self.server_thread.join() - def submit_new_range( - self, event_range: Optional[EventRange] - ) -> asyncio.Future: + def submit_new_range(self, event_range: Optional[EventRange]) -> asyncio.Future: """ Submits a new evnet range to the payload thread by adding it to the event ranges queue. Args: event_range: range to forward to pilot """ - return asyncio.run_coroutine_threadsafe( - self.ranges_queue.put(event_range), self.loop - ) + return asyncio.run_coroutine_threadsafe(self.ranges_queue.put(event_range), self.loop) - def submit_new_ranges( - self, event_ranges: Optional[Iterable[EventRange]] - ) -> None: + def submit_new_ranges(self, event_ranges: Optional[Iterable[EventRange]]) -> None: """ Wrapper for submit_new_range that accepts an iterable of event ranges. 
@@ -406,9 +361,7 @@ async def http_handler(self, request: web.BaseRequest) -> web.Response: try: return await self.router.route(request.path, request=request) except Exception: - return web.json_response( - {"StatusCode": 500}, dumps=self.json_encoder - ) + return web.json_response({"StatusCode": 500}, dumps=self.json_encoder) @staticmethod async def parse_qs_body(request: web.BaseRequest) -> dict[str, list[str]]: @@ -459,9 +412,7 @@ async def handle_update_job(self, request: web.BaseRequest) -> web.Response: # self._logger.debug(f"job update queue size is {self.job_update.qsize()}") return web.json_response(res, dumps=self.json_encoder) - async def handle_get_event_ranges( - self, request: web.BaseRequest - ) -> web.Response: + async def handle_get_event_ranges(self, request: web.BaseRequest) -> web.Response: """ Handler for getEventRanges call, retrieve event ranges from the queue and returns ranges to pilot. If not enough event ranges are available yet, wait until more ranges become available or a message indicating @@ -493,9 +444,7 @@ async def handle_get_event_ranges( # self._logger.info(f"{len(res['eventRanges'])} ranges sent to pilot") return web.json_response(res, dumps=self.json_encoder) - async def handle_update_event_ranges( - self, request: web.BaseRequest - ) -> web.Response: + async def handle_update_event_ranges(self, request: web.BaseRequest) -> web.Response: """ Handler for updateEventRanges call, adds the event ranges update to a queue to be retrieved by the worker @@ -511,9 +460,7 @@ async def handle_update_event_ranges( # self._logger.debug(f"event ranges queue size is {self.ranges_update.qsize()}") return web.json_response(res, dumps=self.json_encoder) - async def handle_update_jobs_in_bulk( - self, request: web.BaseRequest - ) -> web.Response: + async def handle_update_jobs_in_bulk(self, request: web.BaseRequest) -> web.Response: """ Not used by pilot in the current workflow @@ -543,9 +490,7 @@ async def handle_get_status(self, request: web.BaseRequest) -> web.Response: """ raise NotImplementedError(f"{request.path} handler not implemented") - async def handle_get_key_pair( - self, request: web.BaseRequest - ) -> web.Response: + async def handle_get_key_pair(self, request: web.BaseRequest) -> web.Response: """ Not used by pilot in the current workflow diff --git a/src/raythena/drivers/communicators/baseCommunicator.py b/src/raythena/drivers/communicators/baseCommunicator.py index 257ec05..effdf1a 100644 --- a/src/raythena/drivers/communicators/baseCommunicator.py +++ b/src/raythena/drivers/communicators/baseCommunicator.py @@ -46,9 +46,7 @@ def __init__( """ self.requests_queue: Queue[RequestData] = requests_queue self.job_queue: Queue[Mapping[str, JobDef]] = job_queue - self.event_ranges_queue: Queue[ - Mapping[str, Sequence[EventRangeDef]] - ] = event_ranges_queue + self.event_ranges_queue: Queue[Mapping[str, Sequence[EventRangeDef]]] = event_ranges_queue self.config = config @abstractmethod diff --git a/src/raythena/drivers/communicators/harvesterFileMessenger.py b/src/raythena/drivers/communicators/harvesterFileMessenger.py index 5f57e0e..dc2bbbd 100644 --- a/src/raythena/drivers/communicators/harvesterFileMessenger.py +++ b/src/raythena/drivers/communicators/harvesterFileMessenger.py @@ -44,18 +44,14 @@ def __init__( config: app config """ super().__init__(requests_queue, job_queue, event_ranges_queue, config) - self.harvester_workdir = os.path.expandvars( - self.config.harvester["endpoint"] - ) + self.harvester_workdir = 
os.path.expandvars(self.config.harvester["endpoint"]) self.ranges_requests_count = 0 self._parse_harvester_config() self.id = "HarvesterCommunicator" self._logger = make_logger(self.config, self.id) self.event_ranges_update_buffer = EventRangeUpdate() self.event_ranges_update_interval = 5 * 60 - self.communicator_thread = ExThread( - target=self.run, name="communicator-thread" - ) + self.communicator_thread = ExThread(target=self.run, name="communicator-thread") def _parse_harvester_config(self) -> None: """ @@ -72,9 +68,7 @@ def _parse_harvester_config(self) -> None: Raises: FileNotFoundError if the harvester config file doesn't exist """ - self.harvester_conf_file = os.path.expandvars( - self.config.harvester["harvesterconf"] - ) + self.harvester_conf_file = os.path.expandvars(self.config.harvester["harvesterconf"]) if not os.path.isfile(self.harvester_conf_file): raise FileNotFoundError("Harvester config file not found") self.harvester_conf = configparser.ConfigParser() @@ -152,23 +146,17 @@ def request_event_ranges(self, request: EventRangeRequest) -> None: Returns: None """ - if not os.path.isfile(self.eventrangesfile) and not os.path.exists( - self.eventrequestfile - ): + if not os.path.isfile(self.eventrangesfile) and not os.path.exists(self.eventrequestfile): event_request_file_tmp = f"{self.eventrequestfile}.tmp" with open(event_request_file_tmp, "w") as f: json.dump(request.request, f) shutil.move(event_request_file_tmp, self.eventrequestfile) - self._logger.debug( - f"request_event_ranges: created new {self.eventrequestfile} file" - ) + self._logger.debug(f"request_event_ranges: created new {self.eventrequestfile} file") while not os.path.isfile(self.eventrangesfile): time.sleep(1) - self._logger.debug( - f"request_event_ranges: found a {self.eventrangesfile} file" - ) + self._logger.debug(f"request_event_ranges: found a {self.eventrangesfile} file") while os.path.isfile(self.eventrangesfile): try: with open(self.eventrangesfile) as f: @@ -180,9 +168,7 @@ def request_event_ranges(self, request: EventRangeRequest) -> None: ) except Exception: time.sleep(5) - if os.path.exists( - f"{self.eventrangesfile}-{self.ranges_requests_count}" - ): + if os.path.exists(f"{self.eventrangesfile}-{self.ranges_requests_count}"): self.ranges_requests_count += 1 with contextlib.suppress(FileNotFoundError): @@ -245,25 +231,17 @@ def update_events(self, request: EventRangeUpdate) -> None: try: shutil.move(tmp_status_dump_file, self.eventstatusdumpjsonfile) except Exception as e: - self._logger.critical( - f"Failed to move temporary event status file to harvester dump file: {e}" - ) + self._logger.critical(f"Failed to move temporary event status file to harvester dump file: {e}") - def merge_write_dump_file( - self, request: EventRangeUpdate, tmp_status_dump_file: str - ) -> None: + def merge_write_dump_file(self, request: EventRangeUpdate, tmp_status_dump_file: str) -> None: if os.path.isfile(self.eventstatusdumpjsonfile): - self._logger.debug( - "Dump file already exists, merge with upcoming update" - ) + self._logger.debug("Dump file already exists, merge with upcoming update") try: shutil.move(self.eventstatusdumpjsonfile, tmp_status_dump_file) with open(tmp_status_dump_file) as f: current_update = json.load(f) except Exception as e: - self._logger.error( - f"Failed to move and load existing dump file: {e} " - ) + self._logger.error(f"Failed to move and load existing dump file: {e} ") else: request.merge_update(EventRangeUpdate(current_update)) @@ -272,16 +250,12 @@ def merge_write_dump_file( 
with open(tmp_status_dump_file, "w") as f: json.dump(request.range_update, f) except Exception as e: - self._logger.error( - f"Failed to write event update to temporary file: {e}" - ) + self._logger.error(f"Failed to write event update to temporary file: {e}") def cleanup_tmp_files(self) -> None: tmp_status_dump_file = f"{self.eventstatusdumpjsonfile}.tmp" if os.path.isfile(tmp_status_dump_file): - self._logger.warning( - "About to quit with leftover temporary files... Last try to move it" - ) + self._logger.warning("About to quit with leftover temporary files... Last try to move it") try: with open(tmp_status_dump_file) as f: current_update = json.load(f) @@ -328,10 +302,7 @@ def run(self) -> None: elif isinstance(request, EventRangeUpdate): self.event_ranges_update_buffer.merge_update(request) now = time.time() - if ( - now - last_event_range_update - > self.event_ranges_update_interval - ): + if now - last_event_range_update > self.event_ranges_update_interval: self.update_events(self.event_ranges_update_buffer) last_event_range_update = now self.event_ranges_update_buffer = EventRangeUpdate() @@ -340,9 +311,7 @@ def run(self) -> None: else: # if any other request is received, stop the thread break except Exception as e: - self._logger.error( - f"Exception occured while handling request: {e}" - ) + self._logger.error(f"Exception occured while handling request: {e}") if self.event_ranges_update_buffer: self.update_events(self.event_ranges_update_buffer) @@ -369,6 +338,4 @@ def stop(self) -> None: if self.communicator_thread.is_alive(): self.requests_queue.put(None) self.communicator_thread.join() - self.communicator_thread = ExThread( - target=self.run, name="communicator-thread" - ) + self.communicator_thread = ExThread(target=self.run, name="communicator-thread") diff --git a/src/raythena/drivers/communicators/harvesterMock.py b/src/raythena/drivers/communicators/harvesterMock.py index cfc5b4a..f0beefc 100644 --- a/src/raythena/drivers/communicators/harvesterMock.py +++ b/src/raythena/drivers/communicators/harvesterMock.py @@ -35,9 +35,7 @@ def __init__( """ Initialize communicator thread, input files name, job worker_id, number of events to be distributed """ - self.communicator_thread = ExThread( - target=self.run, name="communicator-thread" - ) + self.communicator_thread = ExThread(target=self.run, name="communicator-thread") self.event_ranges = None self.pandaID = random.randint(0, 100) self.jobsetId = random.randint(0, 100) @@ -122,12 +120,8 @@ def request_event_ranges(self, request: EventRangeRequest) -> None: for pandaID in request: range_list = list() request_dict = request[pandaID] - nranges = min( - self.nevents - self.served_events, request_dict["nRanges"] - ) - for i in range( - self.served_events + 1, self.served_events + nranges + 1 - ): + nranges = min(self.nevents - self.served_events, request_dict["nRanges"]) + for i in range(self.served_events + 1, self.served_events + nranges + 1): file_idx = self.served_events // self.nevents_per_file range_id = f"Range-{i:05}" range_list.append( diff --git a/src/raythena/drivers/communicators/harvesterMock2205.py b/src/raythena/drivers/communicators/harvesterMock2205.py index b2bab50..0b99aee 100644 --- a/src/raythena/drivers/communicators/harvesterMock2205.py +++ b/src/raythena/drivers/communicators/harvesterMock2205.py @@ -25,9 +25,7 @@ def __init__( Initialize communicator thread, input files name, job worker_id, number of events to be distributed """ super().__init__(requests_queue, job_queue, event_ranges_queue, config) - 
self.communicator_thread = ExThread( - target=self.run, name="communicator-thread" - ) + self.communicator_thread = ExThread(target=self.run, name="communicator-thread") self.event_ranges = None self.pandaID = random.randint(0, 100) self.jobsetId = random.randint(0, 100) diff --git a/src/raythena/drivers/esdriver.py b/src/raythena/drivers/esdriver.py index 686de51..8dd861f 100644 --- a/src/raythena/drivers/esdriver.py +++ b/src/raythena/drivers/esdriver.py @@ -80,15 +80,11 @@ def __init__(self, config: Config, session_dir: str) -> None: self.id = "Driver" self._logger = make_logger(self.config, self.id) self.session_log_dir = os.path.join(self.session_dir, "logs") - self.nodes = build_nodes_resource_list( - self.config, run_actor_on_head=False - ) + self.nodes = build_nodes_resource_list(self.config, run_actor_on_head=False) self.requests_queue: Queue[RequestData] = Queue() self.jobs_queue: Queue[Mapping[str, JobDef]] = Queue() - self.event_ranges_queue: Queue[ - Mapping[str, Sequence[EventRangeDef]] - ] = Queue() + self.event_ranges_queue: Queue[Mapping[str, Sequence[EventRangeDef]]] = Queue() workdir = os.path.expandvars(self.config.ray.get("workdir")) if not workdir or not os.path.exists(workdir): @@ -103,9 +99,7 @@ def __init__(self, config: Config, session_dir: str) -> None: # TODO removing stdout on the root logger will also disable ray logging and collected stdout from actors disable_stdout_logging() - self._logger.debug( - f"Raythena v{__version__} initializing, running Ray {ray.__version__} on {gethostname()}" - ) + self._logger.debug(f"Raythena v{__version__} initializing, running Ray {ray.__version__} on {gethostname()}") self.task_workdir_path_file = f"{workdir}/task_workdir_path.txt" # self.cpu_monitor = CPUMonitor(os.path.join(workdir, "cpu_monitor_driver.json")) @@ -130,13 +124,9 @@ def __init__(self, config: Config, session_dir: str) -> None: self.first_event_range_request = True self.no_more_events = False self.cache_size_factor = self.config.ray.get("cachesizefactor", 3) - self.cores_per_node = self.config.resources.get( - "corepernode", os.cpu_count() - ) + self.cores_per_node = self.config.resources.get("corepernode", os.cpu_count()) self.n_actors = len(self.nodes) - self.events_cache_size = ( - self.cores_per_node * self.n_actors * self.cache_size_factor - ) + self.events_cache_size = self.cores_per_node * self.n_actors * self.cache_size_factor self.timeoutinterval = self.config.ray["timeoutinterval"] self.max_running_merge_transforms = self.config.ray["mergemaxprocesses"] self.panda_taskid = None @@ -148,9 +138,7 @@ def __init__(self, config: Config, session_dir: str) -> None: self.container_type = "" self.jobreport_name = "" if not os.path.isfile(harvester_config): - self._logger.warning( - f"Couldn't find harvester config file {harvester_config}" - ) + self._logger.warning(f"Couldn't find harvester config file {harvester_config}") else: parser.read(harvester_config) queuedata_config = [ @@ -160,36 +148,23 @@ def __init__(self, config: Config, session_dir: str) -> None: ] self.jobreport_name = parser["payload_interaction"]["jobReportFile"] if not queuedata_config: - self._logger.warning( - f"No queuedata config found for {self.pandaqueue}" - ) + self._logger.warning(f"No queuedata config found for {self.pandaqueue}") elif not os.path.isfile(queuedata_config[0]): - self._logger.warning( - f"cached queudata file not found: {queuedata_config[0]}" - ) + self._logger.warning(f"cached queudata file not found: {queuedata_config[0]}") else: self.queuedata_file = 
queuedata_config[0] with open(self.queuedata_file) as f: queuedata = json.load(f) self.container_options = queuedata["container_options"] - self.container_type = queuedata["container_type"].split( - ":" - )[0] - if ( - self.container_type - != self.config.payload["containerengine"] - ): + self.container_type = queuedata["container_type"].split(":")[0] + if self.container_type != self.config.payload["containerengine"]: self._logger.warning( "Mismatch between pandaqueue and raythena container type. Overriding raythena config" ) - self.config.payload["containerengine"] = ( - self.container_type - ) + self.config.payload["containerengine"] = self.container_type # {input_filename, {merged_output_filename, ([(event_range_id, EventRange)], subprocess handle)}} - self.running_merge_transforms: dict[ - str, tuple[list[tuple[str, EventRange]], Popen, str] - ] = dict() + self.running_merge_transforms: dict[str, tuple[list[tuple[str, EventRange]], Popen, str]] = dict() self.total_running_merge_transforms = 0 self.failed_actor_tasks_count = dict() self.available_events_per_actor = 0 @@ -236,9 +211,7 @@ def create_actors(self) -> None: Returns: None """ - events_per_actor = min( - self.available_events_per_actor, self.cores_per_node - ) + events_per_actor = min(self.available_events_per_actor, self.cores_per_node) for i, node in enumerate(self.nodes): nodeip = node["NodeManagerAddress"] node_constraint = f"node:{nodeip}" @@ -254,23 +227,17 @@ def create_actors(self) -> None: if job: job_remote = self.remote_jobdef_byid[job["PandaID"]] kwargs["job"] = job_remote - event_ranges = self.bookKeeper.fetch_event_ranges( - actor_id, events_per_actor - ) + event_ranges = self.bookKeeper.fetch_event_ranges(actor_id, events_per_actor) if event_ranges: kwargs["event_ranges"] = event_ranges self._logger.debug( f"Prefetched job {job['PandaID']} and {len(event_ranges)} event ranges for {actor_id}" ) - actor = ESWorker.options(resources={node_constraint: 1}).remote( - **kwargs - ) + actor = ESWorker.options(resources={node_constraint: 1}).remote(**kwargs) self.actors[actor_id] = actor - def retrieve_actors_messages( - self, ready: Sequence[ObjectRef] - ) -> Iterator[WorkerResponse]: + def retrieve_actors_messages(self, ready: Sequence[ObjectRef]) -> Iterator[WorkerResponse]: """ Given a list of ready futures from actors, unwrap them and return an interable over the result of each future. 
In case one of the futures raised an exception, the exception is handled by this function and @@ -319,19 +286,13 @@ def handle_actors(self) -> None: new_messages, self.actors_message_queue = self.wait_on_messages() total_sent = 0 while new_messages and self.running: - for actor_id, message, data in self.retrieve_actors_messages( - new_messages - ): + for actor_id, message, data in self.retrieve_actors_messages(new_messages): if message == Messages.IDLE or message == Messages.REPLY_OK: - self.enqueue_actor_call( - actor_id, self[actor_id].get_message.remote() - ) + self.enqueue_actor_call(actor_id, self[actor_id].get_message.remote()) elif message == Messages.REQUEST_NEW_JOB: self.handle_job_request(actor_id) elif message == Messages.REQUEST_EVENT_RANGES: - total_sent = self.handle_request_event_ranges( - actor_id, data, total_sent - ) + total_sent = self.handle_request_event_ranges(actor_id, data, total_sent) elif message == Messages.UPDATE_JOB: self.handle_update_job(actor_id, data) elif message == Messages.UPDATE_EVENT_RANGES: @@ -341,9 +302,7 @@ def handle_actors(self) -> None: self.on_tick() new_messages, self.actors_message_queue = self.wait_on_messages() - self._logger.debug( - "Finished handling the Actors. Raythena will shutdown now." - ) + self._logger.debug("Finished handling the Actors. Raythena will shutdown now.") def wait_on_messages(self) -> tuple[list[ObjectRef], list[ObjectRef]]: """ @@ -392,9 +351,7 @@ def handle_actor_done(self, actor_id: str) -> bool: # TODO: Temporary hack has_jobs = False if has_jobs: - self.enqueue_actor_call( - actor_id, self[actor_id].mark_new_job.remote() - ) + self.enqueue_actor_call(actor_id, self[actor_id].mark_new_job.remote()) else: self.terminated.append(actor_id) self.bookKeeper.process_actor_end(actor_id) @@ -402,9 +359,7 @@ def handle_actor_done(self, actor_id: str) -> bool: # do not get new messages from this actor return has_jobs - def handle_update_event_ranges( - self, actor_id: str, data: EventRangeUpdate - ) -> None: + def handle_update_event_ranges(self, actor_id: str, data: EventRangeUpdate) -> None: """ Handle worker update event ranges @@ -431,9 +386,7 @@ def handle_update_job(self, actor_id: str, data: Any) -> None: """ self.enqueue_actor_call(actor_id, self[actor_id].get_message.remote()) - def handle_request_event_ranges( - self, actor_id: str, data: EventRangeRequest, total_sent: int - ) -> int: + def handle_request_event_ranges(self, actor_id: str, data: EventRangeRequest, total_sent: int) -> int: """ Handle event ranges request. 
Event ranges are distributed evenly amongst workers, the number of events returned in a single request is capped to the number of local events @@ -455,9 +408,7 @@ def handle_request_event_ranges( panda_id = self.bookKeeper.get_actor_job(actor_id) # get the min between requested ranges and what is available for each actor - n_ranges = min( - data[panda_id]["nRanges"], self.available_events_per_actor - ) + n_ranges = min(data[panda_id]["nRanges"], self.available_events_per_actor) evt_range = self.bookKeeper.fetch_event_ranges(actor_id, n_ranges) # did not fetch enough events and harvester might have more, needs to get more events now @@ -473,9 +424,7 @@ def handle_request_event_ranges( self.enqueue_actor_call( actor_id, self[actor_id].receive_event_ranges.remote( - Messages.REPLY_OK - if evt_range - else Messages.REPLY_NO_MORE_EVENT_RANGES, + Messages.REPLY_OK if evt_range else Messages.REPLY_NO_MORE_EVENT_RANGES, evt_range, ), ) @@ -538,9 +487,7 @@ def request_event_ranges(self, block: bool = False) -> None: ) if len(event_request) > 0: - self._logger.debug( - f"Sending event ranges request to harvester for {self.events_cache_size} events" - ) + self._logger.debug(f"Sending event ranges request to harvester for {self.events_cache_size} events") self.requests_queue.put(event_request) self.n_eventsrequest += 1 @@ -550,17 +497,13 @@ def request_event_ranges(self, block: bool = False) -> None: n_received_events = 0 for pandaID, ranges_list in ranges.items(): n_received_events += len(ranges_list) - self._logger.debug( - f"got event ranges for job {pandaID}: {len(ranges_list)}" - ) + self._logger.debug(f"got event ranges for job {pandaID}: {len(ranges_list)}") if self.first_event_range_request: self.first_event_range_request = False if n_received_events == 0: self.stop() self.bookKeeper.add_event_ranges(ranges) - self.available_events_per_actor = max( - 1, ceil(self.bookKeeper.n_ready(pandaID) / self.n_actors) - ) + self.available_events_per_actor = max(1, ceil(self.bookKeeper.n_ready(pandaID) / self.n_actors)) self.n_eventsrequest -= 1 except Empty: pass @@ -638,9 +581,7 @@ def run(self) -> None: # gets initial jobs and send an eventranges request for each jobs jobs = self.jobs_queue.get() if not jobs: - self._logger.critical( - "No jobs provided by communicator, stopping..." - ) + self._logger.critical("No jobs provided by communicator, stopping...") return if len(jobs) > 1: self._logger.critical("Raythena can only handle one job") @@ -661,9 +602,7 @@ def run(self) -> None: elif self.cmt_config: self.the_platform = self.cmt_config else: - self._logger.warning( - f"No container or CmtConfig found, using default platform {self.the_platform}" - ) + self._logger.warning(f"No container or CmtConfig found, using default platform {self.the_platform}") self.cmt_config = job["cmtConfig"] = self.the_platform self.setup_dirs() self._logger.debug("Adding job and generating event ranges...") @@ -676,17 +615,13 @@ def run(self) -> None: self.bookKeeper.stop_cleaner_thread() self.bookKeeper.stop_saver_thread() self.communicator.stop() - self._logger.critical( - "Couldn't fetch a job with event ranges, stopping..." 
- ) + self._logger.critical("Couldn't fetch a job with event ranges, stopping...") return job_id = self.bookKeeper.jobs.next_job_id_to_process() total_events = self.bookKeeper.n_ready(job_id) os.makedirs(os.path.join(self.config.ray["workdir"], job_id)) if total_events: - self.available_events_per_actor = max( - 1, ceil(total_events / self.n_actors) - ) + self.available_events_per_actor = max(1, ceil(total_events / self.n_actors)) for pandaID in self.bookKeeper.jobs: cjob = self.bookKeeper.jobs[pandaID] self.remote_jobdef_byid[pandaID] = ray.put(cjob) @@ -699,22 +634,16 @@ def run(self) -> None: self.handle_actors() except Exception as e: self._logger.error(f"{traceback.format_exc()}") - self._logger.error( - f"Error while handling actors: {e}. stopping..." - ) + self._logger.error(f"Error while handling actors: {e}. stopping...") if self.config.logging.get("copyraylogs", False): ray_logs = os.path.join(self.workdir, "ray_logs") try: shutil.copytree(self.session_log_dir, ray_logs) except Exception as e: - self._logger.error( - f"Failed to copy ray logs to workdir: {e}" - ) + self._logger.error(f"Failed to copy ray logs to workdir: {e}") else: - self._logger.info( - "No events to process, check for remaining merge jobs..." - ) + self._logger.info("No events to process, check for remaining merge jobs...") self._logger.debug("Waiting on merge transforms") # Workers might have sent event ranges update since last check, create possible merge jobs self.bookKeeper.stop_saver_thread() @@ -736,11 +665,7 @@ def run(self) -> None: # need to explicitely save as we stopped saver_thread self.bookKeeper.save_status() task_status = self.bookKeeper.taskstatus.get(self.panda_taskid, None) - if ( - task_status - and task_status.get_nmerged() + task_status.get_nfailed() - == task_status.total_events() - ): + if task_status and task_status.get_nmerged() + task_status.get_nfailed() == task_status.total_events(): assert job_id output_map = self.bookKeeper.remap_output_files(job_id) self.rename_output_files(output_map) @@ -759,13 +684,9 @@ def rename_output_files(self, output_map: dict[str, str]): new_filename = output_map[file] except KeyError: # read the commit log to recover the correct name. If we get another KeyError, we can't recover - new_filename = output_map.get( - self.bookKeeper.recover_outputfile_name(file) - ) + new_filename = output_map.get(self.bookKeeper.recover_outputfile_name(file)) if not new_filename: - self._logger.warning( - f"Couldn't find new name for {file}, will not be staged out correctly" - ) + self._logger.warning(f"Couldn't find new name for {file}, will not be staged out correctly") continue os.rename( os.path.join(self.merged_files_dir, file), @@ -792,9 +713,7 @@ def produce_final_report(self, output_map: dict[str, str]): new_filename = output_map[old_filename] except KeyError: # read the commit log to recover the correct name. 
If we get another KeyError, we can't recover - new_filename = output_map[ - self.bookKeeper.recover_outputfile_name(old_filename) - ] + new_filename = output_map[self.bookKeeper.recover_outputfile_name(old_filename)] output_file_entry["name"] = new_filename with open(os.path.join(self.job_reports_dir, files[0]), "w") as f: json.dump(final_report, f) @@ -803,24 +722,16 @@ def produce_final_report(self, output_map: dict[str, str]): current_file = os.path.join(self.job_reports_dir, file) with open(current_file) as f: current_report = json.load(f) - final_report_files["input"].append( - current_report["files"]["input"][0] - ) - output_file_entry = current_report["files"]["output"][0][ - "subFiles" - ][0] + final_report_files["input"].append(current_report["files"]["input"][0]) + output_file_entry = current_report["files"]["output"][0]["subFiles"][0] old_filename = output_file_entry["name"] try: new_filename = output_map[old_filename] except KeyError: # read the commit log to recover the correct name. If we get another KeyError, we can't recover - new_filename = output_map[ - self.bookKeeper.recover_outputfile_name(old_filename) - ] + new_filename = output_map[self.bookKeeper.recover_outputfile_name(old_filename)] output_file_entry["name"] = new_filename - final_report_files["output"][0]["subFiles"].append( - output_file_entry - ) + final_report_files["output"][0]["subFiles"].append(output_file_entry) with open(current_file, "w") as f: json.dump(current_report, f) @@ -862,20 +773,11 @@ def handle_actor_exception(self, actor_id: str, ex: Exception) -> None: self.failed_actor_tasks_count[actor_id] = 0 self.failed_actor_tasks_count[actor_id] += 1 - if ( - self.failed_actor_tasks_count[actor_id] - < self.max_retries_error_failed_tasks - ): - self.enqueue_actor_call( - actor_id, self[actor_id].get_message.remote() - ) - self._logger.warning( - f"{actor_id} failed {self.failed_actor_tasks_count[actor_id]} times. Retrying..." - ) + if self.failed_actor_tasks_count[actor_id] < self.max_retries_error_failed_tasks: + self.enqueue_actor_call(actor_id, self[actor_id].get_message.remote()) + self._logger.warning(f"{actor_id} failed {self.failed_actor_tasks_count[actor_id]} times. Retrying...") else: - self._logger.warning( - f"{actor_id} failed too many times. No longer fetching messages from it" - ) + self._logger.warning(f"{actor_id} failed too many times. No longer fetching messages from it") if actor_id not in self.terminated: self.terminated.append(actor_id) @@ -886,9 +788,7 @@ def get_output_file_guid(self, job_report_file) -> Optional[str]: with open(job_report_file) as f: job_report = json.load(f) try: - guid = job_report["files"]["output"][0]["subFiles"][0][ - "file_guid" - ] + guid = job_report["files"]["output"][0]["subFiles"][0]["file_guid"] except KeyError: guid = None return guid @@ -896,11 +796,11 @@ def get_output_file_guid(self, job_report_file) -> Optional[str]: def handle_merge_transforms(self, wait_for_completion=False) -> bool: """ Checks if the bookkeeper has files ready to be merged. If so, subprocesses for merge tasks are started. - After starting any subprocess, go through all the running subprocess and poll then to check + After starting any subprocess, go through all the running subprocess and poll then to check if any completed and report status to the bookkeepr. 
Args: - wait_for_completion: Wait for all the subprocesses + wait_for_completion: Wait for all the subprocesses (including those started by this call) to finish before returning Returns: @@ -908,10 +808,7 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: """ new_transforms = False - if ( - self.total_running_merge_transforms - < self.max_running_merge_transforms - ): + if self.total_running_merge_transforms < self.max_running_merge_transforms: self.bookKeeper.check_mergeable_files() merge_files = self.bookKeeper.get_file_to_merge() while merge_files: @@ -920,9 +817,7 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: (sub_process, job_report_file) = self.hits_merge_transform( [e[0] for e in event_ranges], output_filename ) - self._logger.debug( - f"Starting merge transform for {output_filename}" - ) + self._logger.debug(f"Starting merge transform for {output_filename}") self.running_merge_transforms[output_filename] = ( event_ranges, sub_process, @@ -930,10 +825,7 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: ) self.total_running_merge_transforms += 1 new_transforms = True - if ( - self.total_running_merge_transforms - >= self.max_running_merge_transforms - ): + if self.total_running_merge_transforms >= self.max_running_merge_transforms: break merge_files = self.bookKeeper.get_file_to_merge() @@ -950,16 +842,12 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: to_remove.append(output_filename) self.total_running_merge_transforms -= 1 if sub_process.returncode == 0: - self._logger.debug( - f"Merge transform for file {output_filename} finished." - ) + self._logger.debug(f"Merge transform for file {output_filename} finished.") event_ranges_map = {} guid = self.get_output_file_guid(job_report_file) for event_range_output, event_range in event_ranges: - event_ranges_map[event_range.eventRangeID] = ( - TaskStatus.build_eventrange_dict( - event_range, event_range_output - ) + event_ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict( + event_range, event_range_output ) self.bookKeeper.report_merged_file( self.panda_taskid, @@ -968,9 +856,7 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: guid, ) else: - self.bookKeeper.report_failed_merge_transform( - self.panda_taskid, output_filename - ) + self.bookKeeper.report_failed_merge_transform(self.panda_taskid, output_filename) self._logger.debug( f"Merge transform for {output_filename} failed with return code {sub_process.returncode}" ) @@ -978,9 +864,7 @@ def handle_merge_transforms(self, wait_for_completion=False) -> bool: del self.running_merge_transforms[o] return new_transforms - def hits_merge_transform( - self, input_files: Iterable[str], output_file: str - ) -> tuple[Popen, str]: + def hits_merge_transform(self, input_files: Iterable[str], output_file: str) -> tuple[Popen, str]: """ Prepare the shell command for the merging subprocess and starts it. 
@@ -995,9 +879,7 @@ def hits_merge_transform( return tmp_dir = tempfile.mkdtemp() file_list = "\n".join(input_files) - job_report_name = ( - os.path.join(self.job_reports_dir, output_file) + ".json" - ) + job_report_name = os.path.join(self.job_reports_dir, output_file) + ".json" output_file = os.path.join(self.merged_files_dir, output_file) file_list_path = os.path.join(tmp_dir, "file_list.txt") @@ -1009,41 +891,22 @@ def hits_merge_transform( f"@/srv/{os.path.basename(file_list_path)}", self.merge_transform_params, ) - transform_params = re.sub( - r"--inputHitsFile=", "--inputHitsFile ", transform_params - ) - transform_params = re.sub( - r"--inputHITSFile=", "--inputHITSFile ", transform_params - ) - transform_params = re.sub( - r"\$\{OUTPUT0\}", output_file, transform_params, count=1 - ) - transform_params = re.sub( - r"--autoConfiguration=everything", "", transform_params - ) + transform_params = re.sub(r"--inputHitsFile=", "--inputHitsFile ", transform_params) + transform_params = re.sub(r"--inputHITSFile=", "--inputHITSFile ", transform_params) + transform_params = re.sub(r"\$\{OUTPUT0\}", output_file, transform_params, count=1) + transform_params = re.sub(r"--autoConfiguration=everything", "", transform_params) transform_params = re.sub(r"--DBRelease=current", "", transform_params) - endtoken = ( - "" - if self.config.payload["containerextrasetup"].strip().endswith(";") - else ";" - ) + endtoken = "" if self.config.payload["containerextrasetup"].strip().endswith(";") else ";" container_script = ( - f"{self.config.payload['containerextrasetup']}{endtoken}" - f"{self.merge_transform} {transform_params}" + f"{self.config.payload['containerextrasetup']}{endtoken}" f"{self.merge_transform} {transform_params}" ) merge_script_path = os.path.join(tmp_dir, "merge_transform.sh") with open(merge_script_path, "w") as f: f.write(container_script) os.chmod( merge_script_path, - stat.S_IRUSR - | stat.S_IWUSR - | stat.S_IXUSR - | stat.S_IRGRP - | stat.S_IXGRP - | stat.S_IROTH - | stat.S_IXOTH, + stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH, ) setup_script_path = os.path.join(tmp_dir, "release_setup.sh") @@ -1053,24 +916,14 @@ def hits_merge_transform( f.write(setup_script) os.chmod( setup_script_path, - stat.S_IRUSR - | stat.S_IWUSR - | stat.S_IXUSR - | stat.S_IRGRP - | stat.S_IXGRP - | stat.S_IROTH - | stat.S_IXOTH, + stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH, ) cmd = "" cmd += "export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase;" cmd += f'export thePlatform="{self.the_platform}";' - endtoken = ( - "" - if self.config.payload["containerextraargs"].strip().endswith(";") - else ";" - ) + endtoken = "" if self.config.payload["containerextraargs"].strip().endswith(";") else ";" cmd += ( f"{self.config.payload['containerextraargs']}{endtoken}" f"source ${{ATLAS_LOCAL_ROOT_BASE}}/user/atlasLocalSetup.sh" diff --git a/src/raythena/scripts/raythena.py b/src/raythena/scripts/raythena.py index 2a1c49a..2550ddf 100755 --- a/src/raythena/scripts/raythena.py +++ b/src/raythena/scripts/raythena.py @@ -14,12 +14,8 @@ @click.option("--config", required=True, help="raythena configuration file.") @click.option("-d", "--debug", is_flag=True, help="Debug log level") @click.option("--ray-head-ip", help="IP address of ray head node") -@click.option( - "--ray-redis-port", help="Port of redis instance used by the ray cluster" -) -@click.option( - "--ray-redis-password", 
help="Redis password setup in the ray cluster" -) +@click.option("--ray-redis-port", help="Port of redis instance used by the ray cluster") +@click.option("--ray-redis-password", help="Redis password setup in the ray cluster") @click.option("--ray-workdir", help="Workdirectory for ray actors") @click.option( "--harvester-endpoint", @@ -45,21 +41,11 @@ def cli(*args, **kwargs): driver = ESDriver(config, cluster_config["session_dir"]) signal.signal(signal.SIGINT, functools.partial(cleanup, config, driver)) - signal.signal( - signal.SIGTERM, functools.partial(cleanup, config, driver) - ) - signal.signal( - signal.SIGQUIT, functools.partial(cleanup, config, driver) - ) - signal.signal( - signal.SIGSEGV, functools.partial(cleanup, config, driver) - ) - signal.signal( - signal.SIGXCPU, functools.partial(cleanup, config, driver) - ) - signal.signal( - signal.SIGUSR1, functools.partial(cleanup, config, driver) - ) + signal.signal(signal.SIGTERM, functools.partial(cleanup, config, driver)) + signal.signal(signal.SIGQUIT, functools.partial(cleanup, config, driver)) + signal.signal(signal.SIGSEGV, functools.partial(cleanup, config, driver)) + signal.signal(signal.SIGXCPU, functools.partial(cleanup, config, driver)) + signal.signal(signal.SIGUSR1, functools.partial(cleanup, config, driver)) signal.signal(signal.SIGBUS, functools.partial(cleanup, config, driver)) driver.run() except Exception as e: diff --git a/src/raythena/utils/bookkeeper.py b/src/raythena/utils/bookkeeper.py index a84a47c..6919a76 100644 --- a/src/raythena/utils/bookkeeper.py +++ b/src/raythena/utils/bookkeeper.py @@ -54,9 +54,7 @@ class TaskStatus: MERGING = "merging" FAILED = "failed" - def __init__( - self, job: PandaJob, merged_files_dir: str, config: Config - ) -> None: + def __init__(self, job: PandaJob, merged_files_dir: str, config: Config) -> None: self.config = config self.job = job self._logger = make_logger(self.config, "TaskStatus") @@ -68,21 +66,16 @@ def __init__( self._nfiles = len(job["inFiles"].split(",")) self._nevents = self._events_per_file * self._nfiles self._hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) - assert ( - (self._events_per_file % self._hits_per_file == 0) - or (self._hits_per_file % self._events_per_file == 0) + assert (self._events_per_file % self._hits_per_file == 0) or ( + self._hits_per_file % self._events_per_file == 0 ), "Expected number of events per input file to be a multiple of number of hits per merged file" # if _hits_per_file > _events_per_file, each input file has a single output file - self._n_output_per_input_file = max( - 1, self._events_per_file // self._hits_per_file - ) + self._n_output_per_input_file = max(1, self._events_per_file // self._hits_per_file) self._status: dict[ str, Union[dict[str, dict[str, dict[str, str]]], dict[str, list[str]]], ] = dict() - self._update_queue: deque[tuple[str, Union[EventRange, tuple]]] = ( - collections.deque() - ) + self._update_queue: deque[tuple[str, Union[EventRange, tuple]]] = collections.deque() self._restore_status() def _default_init_status(self): @@ -114,9 +107,7 @@ def _restore_status(self): self._status = json.load(f) except OSError as e: # failed to load status, try to read from a possible tmp file if it exists and not already done - if filename != self.tmpfilepath and os.path.isfile( - self.tmpfilepath - ): + if filename != self.tmpfilepath and os.path.isfile(self.tmpfilepath): try: with open(self.tmpfilepath) as f: self._status = json.load(f) @@ -168,9 +159,7 @@ def is_stale(self) -> bool: return 
len(self._update_queue) > 0 @staticmethod - def build_eventrange_dict( - eventrange: EventRange, output_file: str = None - ) -> dict[str, Any]: + def build_eventrange_dict(eventrange: EventRange, output_file: str = None) -> dict[str, Any]: """ Takes an EventRange object and returns the dict representation which should be saved in the state file @@ -188,9 +177,7 @@ def build_eventrange_dict( res["path"] = output_file return res - def set_eventrange_simulated( - self, eventrange: EventRange, simulation_output_file: str - ): + def set_eventrange_simulated(self, eventrange: EventRange, simulation_output_file: str): """ Enqueue a message indicating that an event range has been simulated @@ -198,13 +185,9 @@ def set_eventrange_simulated( eventrange: the event range simulation_output_file: produced file """ - self._update_queue.append( - (TaskStatus.SIMULATED, (eventrange, simulation_output_file)) - ) + self._update_queue.append((TaskStatus.SIMULATED, (eventrange, simulation_output_file))) - def _set_eventrange_simulated( - self, eventrange: EventRange, simulation_output_file: str - ): + def _set_eventrange_simulated(self, eventrange: EventRange, simulation_output_file: str): """ Performs the update of the internal dictionary of a simulated event range @@ -216,8 +199,8 @@ def _set_eventrange_simulated( simulated_dict = self._status[TaskStatus.SIMULATED] if filename not in simulated_dict: simulated_dict[filename] = dict() - simulated_dict[filename][eventrange.eventRangeID] = ( - TaskStatus.build_eventrange_dict(eventrange, simulation_output_file) + simulated_dict[filename][eventrange.eventRangeID] = TaskStatus.build_eventrange_dict( + eventrange, simulation_output_file ) def set_file_merged( @@ -235,9 +218,7 @@ def set_file_merged( outputfile: produced merged hits file event_ranges: event ranges merged in the outputfile.
Map of [event_range_id, [k, v]] """ - self._update_queue.append( - (TaskStatus.MERGING, (input_files, outputfile, event_ranges, guid)) - ) + self._update_queue.append((TaskStatus.MERGING, (input_files, outputfile, event_ranges, guid))) def _set_file_merged( self, @@ -264,36 +245,23 @@ def _set_file_merged( ), f"Expected {self._hits_per_file} hits in {outputfile}, got {len(event_ranges)}" for inputfile in input_files: if inputfile not in self._status[TaskStatus.MERGING]: - self._status[TaskStatus.MERGING][inputfile] = { - outputfile: event_ranges - } + self._status[TaskStatus.MERGING][inputfile] = {outputfile: event_ranges} else: - self._status[TaskStatus.MERGING][inputfile][outputfile] = ( - event_ranges - ) + self._status[TaskStatus.MERGING][inputfile][outputfile] = event_ranges - if ( - len(self._status[TaskStatus.MERGING][inputfile]) - == self._n_output_per_input_file - ): + if len(self._status[TaskStatus.MERGING][inputfile]) == self._n_output_per_input_file: merged_dict = dict() self._status[TaskStatus.MERGED][inputfile] = merged_dict - for merged_outputfile in self._status[TaskStatus.MERGING][ - inputfile - ]: + for merged_outputfile in self._status[TaskStatus.MERGING][inputfile]: merged_dict[merged_outputfile] = { - "path": os.path.join( - self.merged_files_dir, merged_outputfile - ), + "path": os.path.join(self.merged_files_dir, merged_outputfile), "guid": guid if guid else "", } del self._status[TaskStatus.MERGING][inputfile] del self._status[TaskStatus.SIMULATED][inputfile] else: for event_range_id in event_ranges: - del self._status[TaskStatus.SIMULATED][inputfile][ - event_range_id - ] + del self._status[TaskStatus.SIMULATED][inputfile][event_range_id] def set_eventrange_failed(self, eventrange: EventRange): """ @@ -315,12 +283,8 @@ def _set_eventrange_failed(self, eventrange: EventRange): failed_dict = self._status[TaskStatus.FAILED] if filename not in failed_dict: failed_dict[filename] = dict() - failed_dict[filename][eventrange.eventRangeID] = ( - TaskStatus.build_eventrange_dict(eventrange) - ) - if eventrange.eventRangeID in self._status[TaskStatus.SIMULATED].get( - filename, {} - ): + failed_dict[filename][eventrange.eventRangeID] = TaskStatus.build_eventrange_dict(eventrange) + if eventrange.eventRangeID in self._status[TaskStatus.SIMULATED].get(filename, {}): del self._status[TaskStatus.SIMULATED][eventrange.eventRangeID] def get_nsimulated(self, filename=None) -> int: @@ -339,14 +303,8 @@ def get_nsimulated(self, filename=None) -> int: if filename in self._status[TaskStatus.MERGED]: return merged elif filename in self._status[TaskStatus.MERGING]: - merged = ( - len(self._status[TaskStatus.MERGING][filename]) - * self._hits_per_file - ) - return ( - len(self._status[TaskStatus.SIMULATED].get(filename, [])) - - merged - ) + merged = len(self._status[TaskStatus.MERGING][filename]) * self._hits_per_file + return len(self._status[TaskStatus.SIMULATED].get(filename, [])) - merged return reduce( lambda acc, cur: acc + len(cur), @@ -391,13 +349,8 @@ def get_nmerged(self, filename=None) -> int: if filename in self._status[TaskStatus.MERGED]: return self._events_per_file elif filename in self._status[TaskStatus.MERGING]: - return ( - len(self._status[TaskStatus.MERGING][filename]) - * self._hits_per_file - ) - return len( - self._status[TaskStatus.MERGED] - ) * self._events_per_file + reduce( + return len(self._status[TaskStatus.MERGING][filename]) * self._hits_per_file + return len(self._status[TaskStatus.MERGED]) * self._events_per_file + reduce( lambda acc, cur: acc + len(cur) 
* self._hits_per_file, self._status[TaskStatus.MERGING].values(), 0, @@ -425,21 +378,15 @@ def __init__(self, config: Config) -> None: self.actors: dict[str, Optional[str]] = dict() self.rangesID_by_actor: dict[str, set[str]] = dict() #  Output files for which we are ready to launch a merge transform - self.files_ready_to_merge: dict[str, list[tuple[str, EventRange]]] = ( - dict() - ) + self.files_ready_to_merge: dict[str, list[tuple[str, EventRange]]] = dict() # Event ranges for a given input file which have been simulated and are ready to be merged self.ranges_to_merge: dict[str, list[tuple[str, EventRange]]] = dict() # Accumulate event ranges of different input files into the same output file until # we have enough to produce a merged file. # Only used when multiple input files are merged in a single output (n-1) to pool input files together - self.output_merge_queue: dict[str, list[tuple[str, EventRange]]] = ( - dict() - ) + self.output_merge_queue: dict[str, list[tuple[str, EventRange]]] = dict() # Keeps track of merge job definitions that have been distributed to the driver for which we expect an update - self.ditributed_merge_tasks: dict[str, list[tuple[str, EventRange]]] = ( - dict() - ) + self.ditributed_merge_tasks: dict[str, list[tuple[str, EventRange]]] = dict() self.files_guids: dict[str, str] = dict() self.last_status_print = time.time() self.taskstatus: dict[str, TaskStatus] = dict() @@ -447,12 +394,8 @@ def __init__(self, config: Config) -> None: self._output_input_mapping: dict[str, list[str]] = dict() self.stop_saver = threading.Event() self.stop_cleaner = threading.Event() - self.save_state_thread = ExThread( - target=self._saver_thead_run, name="status-saver-thread" - ) - self.cleaner_thread = ExThread( - target=self._cleaner_thead_run, name="cleaner-thread" - ) + self.save_state_thread = ExThread(target=self._saver_thead_run, name="status-saver-thread") + self.cleaner_thread = ExThread(target=self._cleaner_thead_run, name="cleaner-thread") def _cleaner_thead_run(self): """ @@ -463,18 +406,14 @@ def _cleaner_thead_run(self): if os.path.isdir(self.output_dir): files = set(os.listdir(self.output_dir)) for task_status in self.taskstatus.values(): - for merged_file in task_status._status[ - TaskStatus.MERGED - ]: + for merged_file in task_status._status[TaskStatus.MERGED]: if self.stop_cleaner.is_set(): break for temp_file in files: if self.stop_cleaner.is_set(): break if merged_file in temp_file: - os.remove( - os.path.join(self.output_dir, temp_file) - ) + os.remove(os.path.join(self.output_dir, temp_file)) removed.add(temp_file) files -= removed removed.clear() @@ -523,30 +462,21 @@ def _check_mergeable_files_n_1(self): self.output_merge_queue[output_filename] = [] self.output_merge_queue[output_filename].extend(event_ranges) event_ranges.clear() - if ( - len(self.output_merge_queue[output_filename]) - == self._hits_per_file - ): - self.files_ready_to_merge[output_filename] = ( - self.output_merge_queue[output_filename] - ) + if len(self.output_merge_queue[output_filename]) == self._hits_per_file: + self.files_ready_to_merge[output_filename] = self.output_merge_queue[output_filename] del self.output_merge_queue[output_filename] def stop_saver_thread(self): if self.save_state_thread.is_alive(): self.stop_saver.set() self.save_state_thread.join_with_ex() - self.save_state_thread = ExThread( - target=self._saver_thead_run, name="status-saver-thread" - ) + self.save_state_thread = ExThread(target=self._saver_thead_run, name="status-saver-thread") def stop_cleaner_thread(self):
if self.cleaner_thread.is_alive(): self.stop_cleaner.set() self.cleaner_thread.join_with_ex() - self.cleaner_thread = ExThread( - target=self._cleaner_thead_run, name="cleaner-thread" - ) + self.cleaner_thread = ExThread(target=self._cleaner_thead_run, name="cleaner-thread") def start_threads(self): """ @@ -593,9 +523,7 @@ def _generate_input_output_mapping(self, job: PandaJob): Goes through the list of input and ouput file names and matches expected output files for a given input file """ # Filter out potential log files, only interested in HITS files - output_files = [ - e for e in job["outFiles"].split(",") if e.startswith("HITS") - ] + output_files = [e for e in job["outFiles"].split(",") if e.startswith("HITS")] input_files = job["inFiles"].split(",") events_per_file = int(job["nEventsPerInputFile"]) hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) @@ -618,9 +546,7 @@ def _generate_input_output_mapping(self, job: PandaJob): assert events_per_file % hits_per_file == 0 n = events_per_file // hits_per_file assert len(input_files) * n == len(output_files) - for i, j in zip( - range(len(input_files)), range(0, len(output_files), n) - ): + for i, j in zip(range(len(input_files)), range(0, len(output_files), n)): input_output_mapping[input_files[i]] = output_files[j : (j + n)] for output_file in output_files[j : (j + n)]: output_input_mapping[output_file] = [input_files[i]] @@ -647,14 +573,9 @@ def remap_output_files(self, panda_id: str) -> dict[str, str]: merged_output_files = merged_files[input_file] assert isinstance(merged_output_files, dict) assert len(merged_output_files) == len(output_files) - for merged_file, new_file in zip( - merged_output_files, output_files - ): + for merged_file, new_file in zip(merged_output_files, output_files): if merged_file in previous_to_current_output_lookup: - assert ( - new_file - == previous_to_current_output_lookup[merged_file] - ) + assert new_file == previous_to_current_output_lookup[merged_file] continue previous_to_current_output_lookup[merged_file] = new_file f.write(f"rename_output {merged_file} {new_file}\n") @@ -734,9 +655,7 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): guid = self.files_guids[file] for i in range(1, self._events_per_file + 1): range_id = BookKeeper.generate_event_range_id(file, i) - event_range = EventRange( - range_id, i, i, file, guid, scope - ) + event_range = EventRange(range_id, i, i, file, guid, scope) event_range.status = EventRange.FAILED failed_event_ranges.append(event_range) task_status.set_eventrange_failed(event_range) @@ -764,10 +683,7 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): skip_event = False continue # event range hasn't been merged but already simulated, add it as ready to be merged - if ( - file_simulated_ranges is not None - and range_id in file_simulated_ranges - ): + if file_simulated_ranges is not None and range_id in file_simulated_ranges: item = ( file_simulated_ranges[range_id]["path"], event_range, @@ -777,10 +693,7 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): else: self.ranges_to_merge[event_range.PFN].append(item) # only for 1-to-n jobs, failure in n-t-1 have been handled in the 1st pass - elif ( - file_failed_ranges is not None - and range_id in file_failed_ranges - ): + elif file_failed_ranges is not None and range_id in file_failed_ranges: event_range.status = EventRange.FAILED job.event_ranges_queue.append(event_range) # event range hasn't been simulated, add it to the event range 
queue @@ -789,9 +702,7 @@ def _generate_event_ranges(self, job: PandaJob, task_status: TaskStatus): self._logger.debug(f"Generated {len(event_ranges)} event ranges") job.event_ranges_queue.add_new_event_ranges(event_ranges) - def add_event_ranges( - self, event_ranges: Mapping[str, Sequence[EventRangeDef]] - ) -> None: + def add_event_ranges(self, event_ranges: Mapping[str, Sequence[EventRangeDef]]) -> None: """ Assign event ranges to the jobs in queue. @@ -871,12 +782,8 @@ def fetch_event_ranges(self, actor_id: str, n: int) -> list[EventRange]: return list() if actor_id not in self.rangesID_by_actor: self.rangesID_by_actor[actor_id] = set() - ranges = self.jobs.get_event_ranges( - self.actors[actor_id] - ).get_next_ranges(n) - self.rangesID_by_actor[actor_id].update( - map(lambda e: e.eventRangeID, ranges) - ) + ranges = self.jobs.get_event_ranges(self.actors[actor_id]).get_next_ranges(n) + self.rangesID_by_actor[actor_id].update(map(lambda e: e.eventRangeID, ranges)) return ranges def get_file_to_merge( @@ -907,9 +814,7 @@ def report_merged_file( guid, ) - def report_failed_merge_transform( - self, taskID: str, merged_output_file: str - ): + def report_failed_merge_transform(self, taskID: str, merged_output_file: str): assert merged_output_file in self.ditributed_merge_tasks old_task = self.ditributed_merge_tasks.pop(merged_output_file) self.files_ready_to_merge[merged_output_file] = old_task @@ -917,9 +822,7 @@ def report_failed_merge_transform( def process_event_ranges_update( self, actor_id: str, - event_ranges_update: Union[ - Sequence[PilotEventRangeUpdateDef], EventRangeUpdate - ], + event_ranges_update: Union[Sequence[PilotEventRangeUpdateDef], EventRangeUpdate], ): """ Process the event ranges update sent by the worker. This will update the status of event ranges @@ -938,9 +841,7 @@ def process_event_ranges_update( return if not isinstance(event_ranges_update, EventRangeUpdate): - event_ranges_update = EventRangeUpdate.build_from_dict( - panda_id, event_ranges_update - ) + event_ranges_update = EventRangeUpdate.build_from_dict(panda_id, event_ranges_update) self.jobs.process_event_ranges_update(event_ranges_update) task_status = self.taskstatus[self.jobs[panda_id]["taskID"]] job_ranges = self.jobs.get_event_ranges(panda_id) @@ -959,9 +860,7 @@ def process_event_ranges_update( evnt_range = job_ranges[r["eventRangeID"]] if evnt_range.PFN in failed_files: continue - failed_files.extend( - self.get_files_to_merge_with(evnt_range.PFN) - ) + failed_files.extend(self.get_files_to_merge_with(evnt_range.PFN)) for file in failed_files: for i in range(1, self._events_per_file + 1): @@ -980,9 +879,7 @@ def process_event_ranges_update( task_status.set_eventrange_simulated(evnt_range, r["path"]) if evnt_range.PFN not in self.ranges_to_merge: self.ranges_to_merge[evnt_range.PFN] = list() - self.ranges_to_merge[evnt_range.PFN].append( - (r["path"], evnt_range) - ) + self.ranges_to_merge[evnt_range.PFN].append((r["path"], evnt_range)) now = time.time() if now - self.last_status_print > 60: self.last_status_print = now @@ -1024,13 +921,9 @@ def process_actor_end(self, actor_id: str) -> None: actor_ranges = self.rangesID_by_actor.get(actor_id, None) if not actor_ranges: return - self._logger.info( - f"{actor_id} finished with {len(actor_ranges)} events remaining to process" - ) + self._logger.info(f"{actor_id} finished with {len(actor_ranges)} events remaining to process") for rangeID in actor_ranges: - self.jobs.get_event_ranges(panda_id).update_range_state( - rangeID, EventRange.READY - ) + 
self.jobs.get_event_ranges(panda_id).update_range_state(rangeID, EventRange.READY) actor_ranges.clear() self.actors[actor_id] = None diff --git a/src/raythena/utils/config.py b/src/raythena/utils/config.py index 13170e2..2dc54b2 100644 --- a/src/raythena/utils/config.py +++ b/src/raythena/utils/config.py @@ -158,9 +158,7 @@ def _validate_section( """ for name, value in template_params.items(): if name not in section_params: - raise Exception( - f"Param '{name}' not found in conf section '{template_section_name}'" - ) + raise Exception(f"Param '{name}' not found in conf section '{template_section_name}'") if isinstance(value, dict): self._validate_section( f"{template_section_name}.{name}", @@ -185,9 +183,5 @@ def _validate(self) -> None: ) in Config.required_conf_settings.items(): section_params = getattr(self, template_section, None) if section_params is None: - raise Exception( - f"Malformed configuration file: section '{template_section}' not found" - ) - self._validate_section( - template_section, section_params, template_params - ) + raise Exception(f"Malformed configuration file: section '{template_section}' not found") + self._validate_section(template_section, section_params, template_params) diff --git a/src/raythena/utils/eventservice.py b/src/raythena/utils/eventservice.py index 8ecbea3..dcce653 100644 --- a/src/raythena/utils/eventservice.py +++ b/src/raythena/utils/eventservice.py @@ -12,14 +12,10 @@ JobDef = dict[str, Builtin] EventRangeDef = MutableMapping[str, Builtin] FileInfo = Mapping[str, Builtin] -PilotEventRangeUpdateDef = Mapping[ - str, Union[Builtin, FileInfo, Sequence[EventRangeDef]] -] +PilotEventRangeUpdateDef = Mapping[str, Union[Builtin, FileInfo, Sequence[EventRangeDef]]] HarvesterEventRangeUpdateDef = Sequence[MutableMapping[str, Builtin]] -EventRangeUpdateDef = Union[ - Sequence[PilotEventRangeUpdateDef], HarvesterEventRangeUpdateDef -] +EventRangeUpdateDef = Union[Sequence[PilotEventRangeUpdateDef], HarvesterEventRangeUpdateDef] EventRangeRequestDef = Mapping[str, Mapping[str, Builtin]] @@ -187,9 +183,7 @@ def get_event_ranges(self, panda_id: str) -> "EventRangeQueue": if panda_id in self.jobs: return self[panda_id].event_ranges_queue - def process_event_ranges_update( - self, ranges_update: "EventRangeUpdate" - ) -> None: + def process_event_ranges_update(self, ranges_update: "EventRangeUpdate") -> None: """ Update the range status Args: @@ -201,9 +195,7 @@ def process_event_ranges_update( for pandaID in ranges_update: self.get_event_ranges(pandaID).update_ranges(ranges_update[pandaID]) - def process_event_ranges_reply( - self, reply: Mapping[str, HarvesterEventRangeUpdateDef] - ) -> None: + def process_event_ranges_reply(self, reply: Mapping[str, HarvesterEventRangeUpdateDef]) -> None: """ Process an event ranges reply from harvester by adding ranges to each corresponding job already present in the queue. 
If an empty event list is received for a job, assume that no more events will be provided for this job @@ -220,10 +212,7 @@ def process_event_ranges_reply( if not ranges: self[pandaID].no_more_ranges = True else: - ranges_obj = [ - EventRange.build_from_dict(range_dict) - for range_dict in ranges - ] + ranges_obj = [EventRange.build_from_dict(range_dict) for range_dict in ranges] self.get_event_ranges(pandaID).add_new_event_ranges(ranges_obj) @staticmethod @@ -367,9 +356,7 @@ def __setitem__(self, k: str, v: "EventRange") -> None: if not isinstance(v, EventRange): raise ValueError(f"{v} should be of type {EventRange}") if k != v.eventRangeID: - raise KeyError( - f"Specified key '{k}' should be equals to the event range id '{v.eventRangeID}' " - ) + raise KeyError(f"Specified key '{k}' should be equals to the event range id '{v.eventRangeID}' ") if k in self.event_ranges_by_id: self.rangesID_by_state[v.status].remove(k) if v.PFN in self.rangesID_by_file: @@ -413,15 +400,10 @@ def update_range_state(self, range_id: str, new_state: str) -> "EventRange": the updated event range """ if range_id not in self.event_ranges_by_id: - raise KeyError( - f"Trying to update non-existing eventrange {range_id}" - ) + raise KeyError(f"Trying to update non-existing eventrange {range_id}") event_range = self.event_ranges_by_id[range_id] - if ( - new_state != EventRange.READY - and event_range.status == EventRange.READY - ): + if new_state != EventRange.READY and event_range.status == EventRange.READY: self.rangesID_by_file[event_range.PFN].remove(range_id) elif new_state == EventRange.READY: self.rangesID_by_file[event_range.PFN].add(range_id) @@ -490,9 +472,7 @@ def nranges_remaining(self) -> int: Returns: Number of event ranges which are not finished or failed """ - return len(self.event_ranges_by_id) - ( - self.nranges_done() + self.nranges_failed() - ) + return len(self.event_ranges_by_id) - (self.nranges_done() + self.nranges_failed()) def nranges_available(self) -> int: """ @@ -553,9 +533,7 @@ def append(self, event_range: Union[EventRangeDef, "EventRange"]) -> None: def add_new_event_ranges(self, ranges: Sequence["EventRange"]) -> None: # PRE: all ranges in the list are in state ready - self.rangesID_by_state[EventRange.READY].update( - map(lambda e: e.eventRangeID, ranges) - ) + self.rangesID_by_state[EventRange.READY].update(map(lambda e: e.eventRangeID, ranges)) self.event_ranges_count[EventRange.READY] += len(ranges) for r in ranges: self.event_ranges_by_id[r.eventRangeID] = r @@ -563,9 +541,7 @@ def add_new_event_ranges(self, ranges: Sequence["EventRange"]) -> None: self.rangesID_by_file[r.PFN] = set() self.rangesID_by_file[r.PFN].add(r.eventRangeID) - def concat( - self, ranges: Sequence[Union[EventRangeDef, "EventRange"]] - ) -> None: + def concat(self, ranges: Sequence[Union[EventRangeDef, "EventRange"]]) -> None: """ Concatenate a list of event ranges to the queue @@ -700,9 +676,7 @@ class EventRangeUpdate: def __init__( self, - range_update: dict[ - str, list[MutableMapping[str, Union[str, int]]] - ] = None, + range_update: dict[str, list[MutableMapping[str, Union[str, int]]]] = None, ) -> None: """ Wraps the range update dict in an object. The range update should be in the harvester-supported format. 
@@ -716,9 +690,7 @@ def __init__( for v in range_update.values(): if not isinstance(v, list): raise ValueError(f"Expecting type list for element {v}") - self.range_update: dict[str, HarvesterEventRangeUpdateDef] = ( - range_update - ) + self.range_update: dict[str, HarvesterEventRangeUpdateDef] = range_update def __len__(self) -> int: return len(self.range_update) @@ -745,9 +717,7 @@ def merge_update(self, other: "EventRangeUpdate") -> None: self[pandaID] = other[pandaID] @staticmethod - def build_from_dict( - panda_id: str, range_update: Sequence[PilotEventRangeUpdateDef] - ) -> "EventRangeUpdate": + def build_from_dict(panda_id: str, range_update: Sequence[PilotEventRangeUpdateDef]) -> "EventRangeUpdate": """ Parses a range_update dict to a format adapted to be sent to harvester. @@ -766,9 +736,7 @@ def build_from_dict( and "esOutput" not in range_update and "eventRangeID" not in range_update ): - range_update: Sequence[PilotEventRangeUpdateDef] = json.loads( - range_update["eventRanges"][0] - ) + range_update: Sequence[PilotEventRangeUpdateDef] = json.loads(range_update["eventRanges"][0]) for range_elt in range_update: if "zipFile" in range_elt and range_elt["zipFile"]: @@ -780,9 +748,7 @@ def build_from_dict( else: range_update_type = None file_info: None = None - ranges_info: Sequence[EventRangeDef] = range_elt.get( - "eventRanges", None - ) + ranges_info: Sequence[EventRangeDef] = range_elt.get("eventRanges", None) file_data = dict() if file_info: @@ -901,9 +867,7 @@ def __getitem__(self, k: str) -> dict[str, Builtin]: def __str__(self) -> str: return json.dumps(self.request) - def add_event_request( - self, panda_id: str, n_ranges: int, task_id: str, jobset_id: str - ) -> None: + def add_event_request(self, panda_id: str, n_ranges: int, task_id: str, jobset_id: str) -> None: """ Adds a job for which event ranges should be requested to the request object @@ -1237,9 +1201,7 @@ class JobReport: """ - def __init__( - self, exitCode: int = 0, exitMsg: str = None, exitMsgExtra: str = None - ) -> None: + def __init__(self, exitCode: int = 0, exitMsg: str = None, exitMsgExtra: str = None) -> None: self.exitCode = exitCode self.exitMsg = exitMsg self.exitMsgExtra = exitMsgExtra diff --git a/src/raythena/utils/exception.py b/src/raythena/utils/exception.py index 4029114..0b63202 100644 --- a/src/raythena/utils/exception.py +++ b/src/raythena/utils/exception.py @@ -105,9 +105,7 @@ class BaseRaythenaException(Exception): Base class for raythena exception """ - def __init__( - self, worker_id: str, error_code: int, message: str = None - ) -> None: + def __init__(self, worker_id: str, error_code: int, message: str = None) -> None: """ Initialize worker_id, error code and message @@ -118,9 +116,7 @@ def __init__( """ self.worker_id = worker_id self.error_code = error_code - self.message = ( - message if message else ErrorCodes.get_error_message(error_code) - ) + self.message = message if message else ErrorCodes.get_error_message(error_code) super().__init__(self.message) def __reduce__(self): @@ -204,9 +200,7 @@ class WrappedException(BaseRaythenaException): """ def __init__(self, worker_id: str, e: Exception) -> None: - super().__init__( - worker_id, ErrorCodes.UNKNOWN, f"Wrapped exception {e}" - ) + super().__init__(worker_id, ErrorCodes.UNKNOWN, f"Wrapped exception {e}") self.wrapped_exception = e def __reduce__(self): diff --git a/src/raythena/utils/logging.py b/src/raythena/utils/logging.py index dc2123d..86b6bb5 100644 --- a/src/raythena/utils/logging.py +++ b/src/raythena/utils/logging.py 
@@ -6,9 +6,7 @@ _initialized = False -def make_logger( - config: Config, name: str, filepath: str = None -) -> logging.Logger: +def make_logger(config: Config, name: str, filepath: str = None) -> logging.Logger: global _initialized if not _initialized: configure_logger(config, filepath) diff --git a/src/raythena/utils/ray.py b/src/raythena/utils/ray.py index 297021f..ee7c6cf 100644 --- a/src/raythena/utils/ray.py +++ b/src/raythena/utils/ray.py @@ -4,9 +4,7 @@ from raythena.utils.config import Config -def build_nodes_resource_list( - config: Config, run_actor_on_head: bool = False -) -> list[Mapping[str, Any]]: +def build_nodes_resource_list(config: Config, run_actor_on_head: bool = False) -> list[Mapping[str, Any]]: """ Build and setup ray custom resources. Actors should then be instantiated by requiring one of the resource in the returned list. @@ -55,9 +53,7 @@ def is_external_cluster(config: Config) -> bool: Returns: True if raythena is connecting to an existing cluster, False otherwise """ - return ( - config.ray["headip"] is not None and config.ray["redisport"] is not None - ) + return config.ray["headip"] is not None and config.ray["redisport"] is not None def setup_ray(config: Config) -> Any: diff --git a/src/raythena/utils/timing.py b/src/raythena/utils/timing.py index 6d87663..b29ba15 100644 --- a/src/raythena/utils/timing.py +++ b/src/raythena/utils/timing.py @@ -15,9 +15,7 @@ def __init__(self, log_file: str, pid: Any = None) -> None: self.process = psutil.Process(pid) self.log_file = log_file self.stop_event = Event() - self.monitor_thread = ExThread( - target=self.monitor_cpu, name="cpu_monitor" - ) + self.monitor_thread = ExThread(target=self.monitor_cpu, name="cpu_monitor") self.write_interval = 10 * 60 self.time_step = 1 @@ -41,14 +39,10 @@ def stop(self) -> None: if not self.stop_event.is_set(): self.stop_event.set() self.monitor_thread.join() - self.monitor_thread = ExThread( - target=self.monitor_cpu, name="cpu_monitor" - ) + self.monitor_thread = ExThread(target=self.monitor_cpu, name="cpu_monitor") self.stop_event = Event() - def _log_to_file( - self, data: dict[str, Union[dict[str, list], list, int]] - ) -> None: + def _log_to_file(self, data: dict[str, Union[dict[str, list], list, int]]) -> None: """ Write data to log file diff --git a/tests/conftest.py b/tests/conftest.py index 2497928..bf53046 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -81,11 +81,7 @@ def nhits_per_file(nevents_per_file): @pytest.fixture def range_ids(nfiles, nevents_per_file): - return [ - f"EVNT_{file}.pool.root.1-{event}" - for event in range(1, nevents_per_file + 1) - for file in range(nfiles) - ] + return [f"EVNT_{file}.pool.root.1-{event}" for event in range(1, nevents_per_file + 1) for file in range(nfiles)] @pytest.fixture @@ -124,10 +120,7 @@ def sample_rangeupdate(range_ids): "fsize": 860160, "pathConvention": 1000, }, - "eventRanges": [ - {"eventRangeID": r, "eventStatus": "finished"} - for r in range_ids - ], + "eventRanges": [{"eventRangeID": r, "eventStatus": "finished"} for r in range_ids], } ] @@ -254,9 +247,7 @@ def sample_multijobs( @pytest.fixture -def sample_job( - is_eventservice, input_output_file_list, nhits_per_file, nevents_per_file -): +def sample_job(is_eventservice, input_output_file_list, nhits_per_file, nevents_per_file): hash = hashlib.md5() (input_files, output_files) = input_output_file_list hash.update(str(time.time()).encode("utf-8")) diff --git a/tests/harvester/conftest.py b/tests/harvester/conftest.py index 65c7ecb..7941e8b 100644 --- 
a/tests/harvester/conftest.py +++ b/tests/harvester/conftest.py @@ -37,13 +37,9 @@ def clean_files(files): @pytest.fixture -def harvester_file_communicator( - tmpdir, config, request_queue, jobs_queue, ranges_queue -): +def harvester_file_communicator(tmpdir, config, request_queue, jobs_queue, ranges_queue): config.harvester["endpoint"] = str(tmpdir) - communicator = HarvesterFileCommunicator( - request_queue, jobs_queue, ranges_queue, config - ) + communicator = HarvesterFileCommunicator(request_queue, jobs_queue, ranges_queue, config) yield communicator communicator.stop() clean_files( diff --git a/tests/harvester/test_harvesterFileMessenger.py b/tests/harvester/test_harvesterFileMessenger.py index a377e9d..ec8831d 100644 --- a/tests/harvester/test_harvesterFileMessenger.py +++ b/tests/harvester/test_harvesterFileMessenger.py @@ -11,9 +11,7 @@ def check_job(self, jobs, sample_jobs): for sample_ID, jobID in zip(sample_jobs, jobs): assert sample_ID == jobID - def test_get_job( - self, harvester_file_communicator, sample_job, request_queue, jobs_queue - ): + def test_get_job(self, harvester_file_communicator, sample_job, request_queue, jobs_queue): with open(harvester_file_communicator.jobspecfile, "w") as f: json.dump(sample_job, f) @@ -22,9 +20,7 @@ def test_get_job( job_communicator = jobs_queue.get(timeout=5) self.check_job(job_communicator, sample_job) - def test_get_job_request( - self, harvester_file_communicator, sample_job, request_queue, jobs_queue - ): + def test_get_job_request(self, harvester_file_communicator, sample_job, request_queue, jobs_queue): harvester_file_communicator.start() request_queue.put(PandaJobRequest()) @@ -67,9 +63,7 @@ def test_get_event_ranges( n_events = 3 evnt_request = EventRangeRequest() for pandaID, job in sample_job.items(): - evnt_request.add_event_request( - pandaID, n_events, job["taskID"], job["jobsetID"] - ) + evnt_request.add_event_request(pandaID, n_events, job["taskID"], job["jobsetID"]) request_queue.put(evnt_request) while not os.path.isfile(harvester_file_communicator.eventrequestfile): @@ -78,14 +72,9 @@ def test_get_event_ranges( ranges_res = {} with open(harvester_file_communicator.eventrequestfile) as f: communicator_request = json.load(f) - for pandaIDSent, pandaIDCom in zip( - evnt_request, communicator_request - ): + for pandaIDSent, pandaIDCom in zip(evnt_request, communicator_request): assert pandaIDSent == pandaIDCom - assert ( - evnt_request[pandaIDSent]["nRanges"] - == communicator_request[pandaIDSent]["nRanges"] - ) + assert evnt_request[pandaIDSent]["nRanges"] == communicator_request[pandaIDSent]["nRanges"] ranges_res[pandaIDSent] = [ { "lastEvent": 0, @@ -102,11 +91,7 @@ def test_get_event_ranges( for pandaIDSent, pandaIDCom in zip(ranges_res, ranges_com): assert pandaIDSent == pandaIDCom - assert ( - len(ranges_res[pandaIDSent]) - == len(ranges_com[pandaIDSent]) - == n_events - ) + assert len(ranges_res[pandaIDSent]) == len(ranges_com[pandaIDSent]) == n_events assert not os.path.isfile(harvester_file_communicator.eventrequestfile) assert not os.path.isfile(harvester_file_communicator.eventrangesfile) @@ -122,8 +107,4 @@ def test_get_event_ranges( ranges_com = ranges_queue.get(timeout=5) for pandaIDSent, pandaIDCom in zip(ranges_res, ranges_com): assert pandaIDSent == pandaIDCom - assert ( - len(ranges_res[pandaIDSent]) - == len(ranges_com[pandaIDSent]) - == 0 - ) + assert len(ranges_res[pandaIDSent]) == len(ranges_com[pandaIDSent]) == 0 diff --git a/tests/harvester/test_harvesterMock.py 
b/tests/harvester/test_harvesterMock.py index 7f28a31..0164908 100644 --- a/tests/harvester/test_harvesterMock.py +++ b/tests/harvester/test_harvesterMock.py @@ -8,9 +8,7 @@ def test_get_job(self, harvester_mock, request_queue, jobs_queue): job = jobs_queue.get(timeout=5) assert job is not None and isinstance(job, dict) - def test_get_ranges( - self, harvester_mock, request_queue, jobs_queue, ranges_queue - ): + def test_get_ranges(self, harvester_mock, request_queue, jobs_queue, ranges_queue): harvester_mock.start() request_queue.put(PandaJobRequest()) jobs = jobs_queue.get(timeout=5) @@ -18,9 +16,7 @@ def test_get_ranges( n_events = harvester_mock.nevents evnt_request = EventRangeRequest() for pandaID, job in jobs.items(): - evnt_request.add_event_request( - pandaID, n_events, job["taskID"], job["jobsetID"] - ) + evnt_request.add_event_request(pandaID, n_events, job["taskID"], job["jobsetID"]) request_queue.put(evnt_request) ranges = ranges_queue.get(timeout=5) assert ranges is not None diff --git a/tests/test_bookkeeper.py b/tests/test_bookkeeper.py index 868e75f..fd903b4 100644 --- a/tests/test_bookkeeper.py +++ b/tests/test_bookkeeper.py @@ -45,10 +45,7 @@ def test_assign_job_to_actor( assert job["PandaID"] == job_tmp["PandaID"] job = job_tmp bookKeeper.fetch_event_ranges(actor_id, nevents) - assert ( - bookKeeper.assign_job_to_actor(actor_id)["PandaID"] - == job["PandaID"] - ) + assert bookKeeper.assign_job_to_actor(actor_id)["PandaID"] == job["PandaID"] def test_add_event_ranges( self, @@ -70,9 +67,7 @@ def test_add_event_ranges( assert bookKeeper.has_jobs_ready() for pandaID in sample_multijobs: - print( - bookKeeper.jobs[pandaID].event_ranges_queue.event_ranges_by_id - ) + print(bookKeeper.jobs[pandaID].event_ranges_queue.event_ranges_by_id) assert bookKeeper.n_ready(pandaID) == nevents def test_fetch_event_ranges( @@ -101,9 +96,7 @@ def test_fetch_event_ranges( for wid in assigned_workers: job = bookKeeper.assign_job_to_actor(wid) assert job["PandaID"] in sample_multijobs - ranges = bookKeeper.fetch_event_ranges( - wid, int(nevents / len(assigned_workers)) - ) + ranges = bookKeeper.fetch_event_ranges(wid, int(nevents / len(assigned_workers))) assert ranges assert not bookKeeper.fetch_event_ranges(wid[0], 1) @@ -152,37 +145,25 @@ def __inner__(range_update, failed=False): bookKeeper.add_jobs(sample_multijobs, False) for _ in range(njobs): job = bookKeeper.assign_job_to_actor(actor_id) - print( - bookKeeper.jobs.get_event_ranges( - job.get_id() - ).event_ranges_count - ) + print(bookKeeper.jobs.get_event_ranges(job.get_id()).event_ranges_count) ranges = bookKeeper.fetch_event_ranges(actor_id, nevents) assert len(ranges) == nevents assert bookKeeper.rangesID_by_actor[actor_id] - bookKeeper.process_event_ranges_update( - actor_id, sample_failed_rangeupdate - ) + bookKeeper.process_event_ranges_update(actor_id, sample_failed_rangeupdate) assert not bookKeeper.rangesID_by_actor[actor_id] assert job.event_ranges_queue.nranges_failed() == nevents assert not bookKeeper.rangesID_by_actor[actor_id] n_success = len(sample_rangeupdate[0]["eventRanges"]) // 2 - sample_rangeupdate[0]["eventRanges"] = sample_rangeupdate[0][ - "eventRanges" - ][:n_success] - bookKeeper.process_event_ranges_update( - actor_id, sample_rangeupdate[0]["eventRanges"] - ) + sample_rangeupdate[0]["eventRanges"] = sample_rangeupdate[0]["eventRanges"][:n_success] + bookKeeper.process_event_ranges_update(actor_id, sample_rangeupdate[0]["eventRanges"]) assert not bookKeeper.rangesID_by_actor[actor_id] assert 
job.event_ranges_queue.nranges_done() == n_success events = bookKeeper.fetch_event_ranges(actor_id, nevents) assert not bookKeeper.rangesID_by_actor[actor_id] assert not events - assert ( - job.event_ranges_queue.nranges_failed() == nevents - n_success - ) + assert job.event_ranges_queue.nranges_failed() == nevents - n_success assert job.event_ranges_queue.nranges_done() == n_success print(job.event_ranges_queue.rangesID_by_state) print(bookKeeper.rangesID_by_actor) diff --git a/tests/test_eventservice.py b/tests/test_eventservice.py index 0134f90..703fcb9 100644 --- a/tests/test_eventservice.py +++ b/tests/test_eventservice.py @@ -31,30 +31,16 @@ def test_from_dict_init(self): ranges_request = EventRangeRequest.build_from_dict(request_dict) ranges_request_init = EventRangeRequest() for pandaID, req in request_dict.items(): - ranges_request_init.add_event_request( - pandaID, req["nRanges"], req["taskID"], req["jobsetID"] - ) - assert ( - len(request_dict) == len(ranges_request) == len(ranges_request_init) - ) - for id1, id2, id3 in zip( - ranges_request, ranges_request_init, request_dict - ): - assert ( - ranges_request[id1]["pandaID"] - == ranges_request_init[id2]["pandaID"] - == request_dict[id3]["pandaID"] - ) + ranges_request_init.add_event_request(pandaID, req["nRanges"], req["taskID"], req["jobsetID"]) + assert len(request_dict) == len(ranges_request) == len(ranges_request_init) + for id1, id2, id3 in zip(ranges_request, ranges_request_init, request_dict): + assert ranges_request[id1]["pandaID"] == ranges_request_init[id2]["pandaID"] == request_dict[id3]["pandaID"] class TestEventRangeUpdate: - def test_build_range_update( - self, nevents, sample_rangeupdate, sample_failed_rangeupdate - ): + def test_build_range_update(self, nevents, sample_rangeupdate, sample_failed_rangeupdate): pandaID = "0" - ranges_update = EventRangeUpdate.build_from_dict( - pandaID, sample_rangeupdate - ) + ranges_update = EventRangeUpdate.build_from_dict(pandaID, sample_rangeupdate) assert pandaID in ranges_update ranges = ranges_update[pandaID] assert len(ranges) == nevents @@ -69,9 +55,7 @@ def test_build_range_update( and "fsize" in r ) - ranges_update = EventRangeUpdate.build_from_dict( - pandaID, sample_failed_rangeupdate - ) + ranges_update = EventRangeUpdate.build_from_dict(pandaID, sample_failed_rangeupdate) assert pandaID in ranges_update ranges = ranges_update[pandaID] assert len(ranges) == nevents @@ -108,12 +92,7 @@ def test_new(self, nevents, sample_job, sample_ranges): == ranges_queue.nranges_remaining() == nevents ) - assert ( - ranges_queue.nranges_assigned() - == ranges_queue.nranges_done() - == ranges_queue.nranges_failed() - == 0 - ) + assert ranges_queue.nranges_assigned() == ranges_queue.nranges_done() == ranges_queue.nranges_failed() == 0 with pytest.raises(ValueError): ranges_queue["key"] = None @@ -134,16 +113,8 @@ def test_concat(self, nevents, sample_job, sample_ranges): == ranges_queue.nranges_remaining() == nevents ) - assert ( - ranges_queue.nranges_assigned() - == ranges_queue.nranges_done() - == ranges_queue.nranges_failed() - == 0 - ) - assert ( - ranges_queue[ranges[0]["eventRangeID"]].eventRangeID - == ranges[0]["eventRangeID"] - ) + assert ranges_queue.nranges_assigned() == ranges_queue.nranges_done() == ranges_queue.nranges_failed() == 0 + assert ranges_queue[ranges[0]["eventRangeID"]].eventRangeID == ranges[0]["eventRangeID"] for r in ranges: assert r["eventRangeID"] in ranges_queue @@ -160,12 +131,8 @@ def test_update( ranges_queue = 
EventRangeQueue.build_from_list(ranges) nsuccess = int(nevents / 2) - ranges_update = EventRangeUpdate.build_from_dict( - pandaID, sample_rangeupdate - )[pandaID][:nsuccess] - failed_ranges_update = EventRangeUpdate.build_from_dict( - pandaID, sample_failed_rangeupdate - )[pandaID][nsuccess:] + ranges_update = EventRangeUpdate.build_from_dict(pandaID, sample_rangeupdate)[pandaID][:nsuccess] + failed_ranges_update = EventRangeUpdate.build_from_dict(pandaID, sample_failed_rangeupdate)[pandaID][nsuccess:] ranges_queue.get_next_ranges(nevents) ranges_queue.update_ranges(ranges_update) @@ -195,19 +162,12 @@ def test_get_next(self, sample_job, sample_ranges): assert ranges_queue.nranges_remaining() == nranges assert ranges_queue.nranges_available() == nranges - nranges_requested for requested_range in requested_ranges: - assert ( - ranges_queue[requested_range.eventRangeID].status - == EventRange.ASSIGNED - ) + assert ranges_queue[requested_range.eventRangeID].status == EventRange.ASSIGNED requested_ranges = ranges_queue.get_next_ranges(nranges) assert len(requested_ranges) == nranges - nranges_requested assert ranges_queue.nranges_available() == 0 - assert ( - ranges_queue.nranges_assigned() - == ranges_queue.nranges_remaining() - == nranges - ) + assert ranges_queue.nranges_assigned() == ranges_queue.nranges_remaining() == nranges assert len(ranges_queue.get_next_ranges(1)) == 0 @@ -252,14 +212,10 @@ def test_build_from_dict(self): class TestPandaJobQueue: - def test_build_pandajob_queue( - self, is_eventservice, njobs, sample_multijobs - ): + def test_build_pandajob_queue(self, is_eventservice, njobs, sample_multijobs): assert len(sample_multijobs) == njobs pandajob_queue = PandaJobQueue() - pandajob_queue_fromdict = PandaJobQueue.build_from_dict( - sample_multijobs - ) + pandajob_queue_fromdict = PandaJobQueue.build_from_dict(sample_multijobs) assert len(pandajob_queue) == 0 assert not pandajob_queue.next_job_to_process() @@ -288,9 +244,7 @@ def test_build_pandajob_queue( pandajob_queue_2["key"] = job assert "key" in pandajob_queue_2 - def test_pandajob_process_event_ranges_reply( - self, is_eventservice, njobs, sample_multijobs, sample_ranges - ): + def test_pandajob_process_event_ranges_reply(self, is_eventservice, njobs, sample_multijobs, sample_ranges): if not is_eventservice: pytest.skip("Not eventservice jobs") pandajob_queue = PandaJobQueue(sample_multijobs) @@ -332,20 +286,14 @@ def test_process_event_ranges_update( job = pandajob_queue.next_job_to_process() assert job == pandajob_queue.next_job_to_process() - ranges_update = EventRangeUpdate.build_from_dict( - job["PandaID"], sample_rangeupdate - ) + ranges_update = EventRangeUpdate.build_from_dict(job["PandaID"], sample_rangeupdate) ranges_queue = pandajob_queue.get_event_ranges(job["PandaID"]) _ = job.get_next_ranges(nevents) pandajob_queue.process_event_ranges_update(ranges_update) assert not job.no_more_ranges assert ranges_queue.nranges_done() == nevents - assert ( - ranges_queue.nranges_remaining() - == ranges_queue.nranges_available() - == 0 - ) + assert ranges_queue.nranges_remaining() == ranges_queue.nranges_available() == 0 job_2 = pandajob_queue.next_job_to_process() assert job["PandaID"] == job_2["PandaID"] @@ -379,9 +327,7 @@ def test_build_pandajob_request(self): jobrequest = PandaJobRequest(**request_dict) assert jobrequest.diskSpace == request_dict["disk_space"] assert jobrequest.mem == request_dict["mem"] - assert ( - jobrequest.allowOtherCountry == request_dict["allow_other_country"] - ) + assert 
jobrequest.allowOtherCountry == request_dict["allow_other_country"] class TestPandaJobUpdate: diff --git a/tests/test_pilothttp.py b/tests/test_pilothttp.py index 5c1c4b3..68b24d7 100644 --- a/tests/test_pilothttp.py +++ b/tests/test_pilothttp.py @@ -63,22 +63,14 @@ def test_getjob(self, payload, is_eventservice, config, sample_job): res = requests.post("http://127.0.0.1:8080/server/panda/getJob").json() assert job["PandaID"] == PandaJob(res)["PandaID"] - assert ( - requests.post("http://127.0.0.1:8080/unknown").json()["StatusCode"] - == 500 - ) + assert requests.post("http://127.0.0.1:8080/unknown").json()["StatusCode"] == 500 payload.stop() assert payload.is_complete() assert payload.return_code() == payload.pilot_process.returncode def endpoint_not_implemented(self, endpoint): - assert ( - requests.post( - f"http://127.0.0.1:8080/server/panda/{endpoint}" - ).json()["StatusCode"] - == 500 - ) + assert requests.post(f"http://127.0.0.1:8080/server/panda/{endpoint}").json()["StatusCode"] == 500 @pytest.mark.usefixtures("payload") def test_updateJobsInBulk(self): @@ -98,9 +90,7 @@ def test_jobUpdate(self, payload, config, is_eventservice): assert not payload.fetch_job_update() data = {"pilotErrorCode": "0"} - res = requests.post( - "http://127.0.0.1:8080/server/panda/updateJob", data=data - ).json() + res = requests.post("http://127.0.0.1:8080/server/panda/updateJob", data=data).json() assert res["StatusCode"] == 0 # Disabled as job update are currently not forwarded to the driver # job_update = payload.fetch_job_update() @@ -120,9 +110,7 @@ def test_rangesUpdate( assert not payload.fetch_ranges_update() data = {"pilotErrorCode": 0} - res = requests.post( - "http://127.0.0.1:8080/server/panda/updateEventRanges", data=data - ).json() + res = requests.post("http://127.0.0.1:8080/server/panda/updateEventRanges", data=data).json() assert res["StatusCode"] == 0 def test_getranges( @@ -146,9 +134,7 @@ def test_getranges( "jobsetID": job["jobsetID"], "taskID": job["taskID"], } - res = requests.post( - "http://127.0.0.1:8080/server/panda/getEventRanges" - ).json() + res = requests.post("http://127.0.0.1:8080/server/panda/getEventRanges").json() assert res["StatusCode"] == 500 assert payload.should_request_more_ranges() ranges = list() @@ -157,22 +143,13 @@ def test_getranges( payload.submit_new_ranges(ranges) payload.submit_new_ranges(None) - res = requests.post( - "http://127.0.0.1:8080/server/panda/getEventRanges", data=data - ).json() + res = requests.post("http://127.0.0.1:8080/server/panda/getEventRanges", data=data).json() assert res["StatusCode"] == 0 assert len(res["eventRanges"]) == nevents - res = requests.post( - "http://127.0.0.1:8080/server/panda/getEventRanges", data=data - ).json() + res = requests.post("http://127.0.0.1:8080/server/panda/getEventRanges", data=data).json() assert res["StatusCode"] == 0 assert len(res["eventRanges"]) == 0 assert not payload.should_request_more_ranges() data["pandaID"] = "None" - assert ( - requests.post( - "http://127.0.0.1:8080/server/panda/getEventRanges", data=data - ).json()["StatusCode"] - == -1 - ) + assert requests.post("http://127.0.0.1:8080/server/panda/getEventRanges", data=data).json()["StatusCode"] == -1 diff --git a/tests/test_taskstatus.py b/tests/test_taskstatus.py index 9196c41..18722c3 100644 --- a/tests/test_taskstatus.py +++ b/tests/test_taskstatus.py @@ -3,9 +3,7 @@ class TestTaskStatus: - def test_save_restore_status( - self, nfiles, tmp_path, config, sample_job, sample_ranges - ): + def test_save_restore_status(self, nfiles, 
tmp_path, config, sample_job, sample_ranges): config.ray["outputdir"] = tmp_path job = PandaJob(list(sample_job.values())[0]) ts = TaskStatus(job, tmp_path, config) @@ -19,18 +17,14 @@ def test_save_restore_status( for i in range(0, n_output_per_input_file): ranges_list = [] for j in range(hits_per_file): - ranges_list.append( - ranges[file_no + (i * offset) + (i + j) * offset] - ) + ranges_list.append(ranges[file_no + (i * offset) + (i + j) * offset]) ranges_map = {} arbitrary_range = EventRange.build_from_dict(ranges_list[0]) fname = arbitrary_range.PFN outputfile = f"{fname}-MERGED-{arbitrary_range.eventRangeID}" for r in ranges_list: event_range = EventRange.build_from_dict(r) - ranges_map[event_range.eventRangeID] = ( - TaskStatus.build_eventrange_dict(event_range, fname) - ) + ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict(event_range, fname) ts.set_eventrange_simulated(event_range, "outputfile") ts.set_file_merged([fname], outputfile, ranges_map, "guid") @@ -39,18 +33,14 @@ def test_save_restore_status( print(ts._status) assert ts._status == ts2._status - def test_set_processed( - self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges - ): + def test_set_processed(self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges): config.ray["outputdir"] = tmp_path job = PandaJob(list(sample_job.values())[0]) ts = TaskStatus(job, tmp_path, config) ranges_list = list(sample_ranges.values())[0] for r in ranges_list: - ts.set_eventrange_simulated( - EventRange.build_from_dict(r), "outputfile" - ) + ts.set_eventrange_simulated(EventRange.build_from_dict(r), "outputfile") # need to save as set_event_range_simulated is lazy ts.save_status() @@ -58,9 +48,7 @@ def test_set_processed( assert ts.get_nsimulated() == nevents assert ts.get_nmerged() == 0 - def test_set_failed( - self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges - ): + def test_set_failed(self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges): config.ray["outputdir"] = tmp_path job = PandaJob(list(sample_job.values())[0]) ts = TaskStatus(job, tmp_path, config) @@ -75,9 +63,7 @@ def test_set_failed( assert ts.get_nfailed() == nevents assert ts.get_nmerged() == 0 - def test_set_merged( - self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges - ): + def test_set_merged(self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges): config.ray["outputdir"] = tmp_path job = PandaJob(list(sample_job.values())[0]) ts = TaskStatus(job, tmp_path, config) @@ -96,19 +82,15 @@ def test_set_merged( for i in range(0, n_output_per_input_file): ranges_list = [] for j in range(hits_per_file): - ranges_list.append( - ranges[file_no + (i * offset) + (i + j) * offset] - ) + ranges_list.append(ranges[file_no + (i * offset) + (i + j) * offset]) arbitrary_range = EventRange.build_from_dict(ranges_list[0]) fname = arbitrary_range.PFN outputfile = f"{fname}-MERGED-{arbitrary_range.eventRangeID}" ranges_map = {} for r in ranges_list: event_range = EventRange.build_from_dict(r) - ranges_map[event_range.eventRangeID] = ( - TaskStatus.build_eventrange_dict( - event_range, f"outputfile-{event_range.eventRangeID}" - ) + ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict( + event_range, f"outputfile-{event_range.eventRangeID}" ) ts.set_file_merged([fname], outputfile, ranges_map, "guid") @@ -122,19 +104,15 @@ def test_set_merged( for i in range(0, n_output_per_input_file): ranges_list = [] for j in range(hits_per_file): - ranges_list.append( - 
ranges[file_no + (i * offset) + (i + j) * offset] - ) + ranges_list.append(ranges[file_no + (i * offset) + (i + j) * offset]) arbitrary_range = EventRange.build_from_dict(ranges_list[0]) fname = arbitrary_range.PFN outputfile = f"{fname}-MERGED-{arbitrary_range.eventRangeID}" ranges_map = {} for r in ranges_list: event_range = EventRange.build_from_dict(r) - ranges_map[event_range.eventRangeID] = ( - TaskStatus.build_eventrange_dict( - event_range, f"outputfile-{event_range.eventRangeID}" - ) + ranges_map[event_range.eventRangeID] = TaskStatus.build_eventrange_dict( + event_range, f"outputfile-{event_range.eventRangeID}" ) ts.set_file_merged([fname], outputfile, ranges_map, "guid") ts.save_status() From b90e102156439fe62f617c41a580bb2976a23ecd Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 15:48:38 -0700 Subject: [PATCH 11/14] remove comments --- pyproject.toml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0eef6f4..985b501 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,10 +65,5 @@ select = [ "I" ] -# ignore = [ -# # pycodestyle -# "E501", -# ] - [tool.ruff.lint.isort] no-lines-before = ["third-party", "first-party", "standard-library"] From bdaa98122747eb68c4465a5257cb54be4d0e029d Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 16:09:00 -0700 Subject: [PATCH 12/14] add ruff checks --- bin/validate-raythena-job.py | 2 +- pyproject.toml | 6 +++++- src/raythena/actors/esworker.py | 6 +++--- src/raythena/drivers/esdriver.py | 2 +- src/raythena/utils/bookkeeper.py | 4 ++-- src/raythena/utils/eventservice.py | 30 +++++++++++++++--------------- src/raythena/utils/exception.py | 13 +++++++------ src/raythena/utils/logging.py | 3 ++- tests/conftest.py | 4 ++-- tests/test_eventservice.py | 12 ++++++------ tests/test_pilothttp.py | 8 ++++---- tests/test_taskstatus.py | 16 ++++++++-------- 12 files changed, 56 insertions(+), 50 deletions(-) diff --git a/bin/validate-raythena-job.py b/bin/validate-raythena-job.py index 5a373e0..435c9c0 100644 --- a/bin/validate-raythena-job.py +++ b/bin/validate-raythena-job.py @@ -24,7 +24,7 @@ def validate_job(job_dir, job_state_file): with open(job_state_file) as f: job_state = json.load(f) merged_input_files = job_state["merged"] - merged_output_files = set([list(x.keys())[0] for x in merged_input_files.values()]) + merged_output_files = set([next(iter(x.keys())) for x in merged_input_files.values()]) event_numbers = set() for output_file in merged_output_files: output_file_abs = path.join(job_dir, "final", output_file) diff --git a/pyproject.toml b/pyproject.toml index 985b501..11cbe51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,8 @@ dependencies = [ line-length = 120 indent-width = 4 +target-version = "py39" + [tool.ruff.lint] select = [ @@ -62,7 +64,9 @@ select = [ # flake8-simplify "SIM", # isort - "I" + "I", + # Ruff + "RUF", ] [tool.ruff.lint.isort] diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py index 60bbfaf..959d73b 100644 --- a/src/raythena/actors/esworker.py +++ b/src/raythena/actors/esworker.py @@ -100,7 +100,7 @@ def __init__( actor_no: int, actor_count: int, job: PandaJob = None, - event_ranges: Sequence[EventRange] = None, + event_ranges: Optional[Sequence[EventRange]] = None, ) -> None: """ Initialize attributes, instantiate a payload and setup the workdir @@ -260,7 +260,7 @@ def stagein(self) -> None: self.start_time = int(start_time[0]) * 3600 + int(start_time[1]) * 60 + int(start_time[2]) time_limit = 
time_limit_monitor.readline().split(":") if len(time_limit) < 3: - time_limit = ["0"] + time_limit + time_limit = ["0", *time_limit] self.time_limit = int(time_limit[0]) * 3600 + int(time_limit[1]) * 60 + int(time_limit[2]) timer_thread = threading.Thread(name="timer", target=self.check_time, daemon=True) timer_thread.start() @@ -502,7 +502,7 @@ def stageout_event_service_files(self, ranges_update: Mapping[str, str]) -> Opti if range_update["eventStatus"] == "failed": self._logger.warning("event range failed, will not stage-out") continue - if "path" in range_update and range_update["path"]: + if range_update.get("path"): cfile_key = "path" else: raise StageOutFailed(self.id) diff --git a/src/raythena/drivers/esdriver.py b/src/raythena/drivers/esdriver.py index 8dd861f..fc44142 100644 --- a/src/raythena/drivers/esdriver.py +++ b/src/raythena/drivers/esdriver.py @@ -586,7 +586,7 @@ def run(self) -> None: if len(jobs) > 1: self._logger.critical("Raythena can only handle one job") return - job = list(jobs.values())[0] + job = next(iter(jobs.values())) job["eventService"] = "true" job["jobPars"] = f"--eventService=True {job['jobPars']}" self.panda_taskid = job["taskID"] diff --git a/src/raythena/utils/bookkeeper.py b/src/raythena/utils/bookkeeper.py index 6919a76..be2004c 100644 --- a/src/raythena/utils/bookkeeper.py +++ b/src/raythena/utils/bookkeeper.py @@ -159,7 +159,7 @@ def is_stale(self) -> bool: return len(self._update_queue) > 0 @staticmethod - def build_eventrange_dict(eventrange: EventRange, output_file: str = None) -> dict[str, Any]: + def build_eventrange_dict(eventrange: EventRange, output_file: Optional[str] = None) -> dict[str, Any]: """ Takes an EventRange object and retuns the dict representation which should be saved in the state file @@ -377,7 +377,7 @@ def __init__(self, config: Config) -> None: self._logger = make_logger(self.config, "BookKeeper") self.actors: dict[str, Optional[str]] = dict() self.rangesID_by_actor: dict[str, set[str]] = dict() - #  Output files for which we are ready to launch a merge transform + # Output files for which we are ready to launch a merge transform self.files_ready_to_merge: dict[str, list[tuple[str, EventRange]]] = dict() # Event ranges for a given input file which have been simulated and a ready to be merged self.ranges_to_merge: dict[str, list[tuple[str, EventRange]]] = dict() diff --git a/src/raythena/utils/eventservice.py b/src/raythena/utils/eventservice.py index dcce653..beefa1f 100644 --- a/src/raythena/utils/eventservice.py +++ b/src/raythena/utils/eventservice.py @@ -91,7 +91,7 @@ class PandaJobQueue: See PandaJob doc for the format """ - def __init__(self, jobs: Mapping[str, JobDef] = None) -> None: + def __init__(self, jobs: Optional[Mapping[str, JobDef]] = None) -> None: self.jobs: dict[str, PandaJob] = dict() self.distributed_jobs_ids = list() @@ -676,7 +676,7 @@ class EventRangeUpdate: def __init__( self, - range_update: dict[str, list[MutableMapping[str, Union[str, int]]]] = None, + range_update: Optional[dict[str, list[MutableMapping[str, Union[str, int]]]]] = None, ) -> None: """ Wraps the range update dict in an object. The range update should be in the harvester-supported format. 
@@ -739,10 +739,10 @@ def build_from_dict(panda_id: str, range_update: Sequence[PilotEventRangeUpdateD range_update: Sequence[PilotEventRangeUpdateDef] = json.loads(range_update["eventRanges"][0]) for range_elt in range_update: - if "zipFile" in range_elt and range_elt["zipFile"]: + if range_elt.get("zipFile"): range_update_type = "zipFile" file_info: FileInfo = range_elt.get("zipFile", None) - elif "esOutput" in range_elt and range_elt["esOutput"]: + elif range_elt.get("esOutput"): range_update_type = "esOutput" file_info: FileInfo = range_elt.get("esOutput", None) else: @@ -808,16 +808,16 @@ class PandaJobRequest: def __init__( self, - node: str = None, - disk_space: str = None, - working_group: str = None, - prod_source_label: str = None, - computing_element: str = None, - site_name: str = None, - resource_type: str = None, - mem: str = None, - cpu: str = None, - allow_other_country: str = None, + node: str = "", + disk_space: str = "", + working_group: str = "", + prod_source_label: str = "", + computing_element: str = "", + site_name: str = "", + resource_type: str = "", + mem: str = "", + cpu: str = "", + allow_other_country: str = "", ) -> None: self.node = node self.diskSpace = disk_space @@ -1201,7 +1201,7 @@ class JobReport: """ - def __init__(self, exitCode: int = 0, exitMsg: str = None, exitMsgExtra: str = None) -> None: + def __init__(self, exitCode: int = 0, exitMsg: Optional[str] = None, exitMsgExtra: Optional[str] = None) -> None: self.exitCode = exitCode self.exitMsg = exitMsg self.exitMsgExtra = exitMsgExtra diff --git a/src/raythena/utils/exception.py b/src/raythena/utils/exception.py index 0b63202..e6f4f4a 100644 --- a/src/raythena/utils/exception.py +++ b/src/raythena/utils/exception.py @@ -1,5 +1,6 @@ import threading from queue import Empty, Queue +from typing import Optional class ErrorCodes: @@ -105,7 +106,7 @@ class BaseRaythenaException(Exception): Base class for raythena exception """ - def __init__(self, worker_id: str, error_code: int, message: str = None) -> None: + def __init__(self, worker_id: str, error_code: int, message: Optional[str] = None) -> None: """ Initialize worker_id, error code and message @@ -133,7 +134,7 @@ def __init__( worker_id: str, src_state: str, dst_state: str, - message: str = None, + message: str = "", ) -> None: super().__init__(worker_id, ErrorCodes.ILLEGAL_WORKER_STATE, message) self.src_state = src_state @@ -151,7 +152,7 @@ class StageInFailed(BaseRaythenaException): Raised when the worker was unable to stage-in data """ - def __init__(self, worker_id: str, message: str = None) -> None: + def __init__(self, worker_id: str, message: Optional[str] = None) -> None: super().__init__(worker_id, ErrorCodes.STAGEIN_FAILED, message) def __reduce__(self): @@ -163,7 +164,7 @@ class StageOutFailed(BaseRaythenaException): Raised when the worker was unable to stage-out data """ - def __init__(self, worker_id: str, message: str = None) -> None: + def __init__(self, worker_id: str, message: Optional[str] = None) -> None: super().__init__(worker_id, ErrorCodes.STAGEOUT_FAILED, message) def __reduce__(self): @@ -175,7 +176,7 @@ class FailedPayload(BaseRaythenaException): Raised when the worker payload failed """ - def __init__(self, worker_id: str, message: str = None) -> None: + def __init__(self, worker_id: str, message: Optional[str] = None) -> None: super().__init__(worker_id, ErrorCodes.PAYLOAD_FAILED, message) def __reduce__(self): @@ -187,7 +188,7 @@ class UnknownException(BaseRaythenaException): Raised when no other exception type applies 
""" - def __init__(self, worker_id: str, message: str = None) -> None: + def __init__(self, worker_id: str, message: Optional[str] = None) -> None: super().__init__(worker_id, ErrorCodes.UNKNOWN, message) def __reduce__(self): diff --git a/src/raythena/utils/logging.py b/src/raythena/utils/logging.py index 86b6bb5..895f45c 100644 --- a/src/raythena/utils/logging.py +++ b/src/raythena/utils/logging.py @@ -1,12 +1,13 @@ import logging import sys from time import gmtime +from typing import Optional from raythena.utils.config import Config _initialized = False -def make_logger(config: Config, name: str, filepath: str = None) -> logging.Logger: +def make_logger(config: Config, name: str, filepath: Optional[str] = None) -> logging.Logger: global _initialized if not _initialized: configure_logger(config, filepath) diff --git a/tests/conftest.py b/tests/conftest.py index bf53046..f4a98e0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -215,7 +215,7 @@ def sample_multijobs( "destinationDblock": job_name, "dispatchDBlockToken": "NULL", "jobPars": ( - f"--eventService={str(is_eventservice)} --skipEvents=0 --firstEvent=1 " + f"--eventService={is_eventservice!s} --skipEvents=0 --firstEvent=1 " '--preExec "from AthenaCommon.DetFlags ' "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' @@ -307,7 +307,7 @@ def sample_job(is_eventservice, input_output_file_list, nhits_per_file, nevents_ "destinationDblock": job_name, "dispatchDBlockToken": "NULL", "jobPars": ( - f"--eventService={str(is_eventservice)} --skipEvents=0 --firstEvent=1 " + f"--eventService={is_eventservice!s} --skipEvents=0 --firstEvent=1 " '--preExec "from AthenaCommon.DetFlags ' "import DetFlags;DetFlags.ID_setOn();DetFlags.Calo_setOff();" 'DetFlags.Muon_setOff();DetFlags.Lucid_setOff();DetFlags.Truth_setOff() "' diff --git a/tests/test_eventservice.py b/tests/test_eventservice.py index 703fcb9..5244ffc 100644 --- a/tests/test_eventservice.py +++ b/tests/test_eventservice.py @@ -83,7 +83,7 @@ class TestEventRangeQueue: def test_new(self, nevents, sample_job, sample_ranges): ranges_queue = EventRangeQueue() assert len(ranges_queue) == 0 - ranges = list(sample_ranges.values())[0] + ranges = next(iter(sample_ranges.values())) ranges_queue = EventRangeQueue.build_from_list(ranges) assert ( len(ranges) @@ -104,7 +104,7 @@ def test_new(self, nevents, sample_job, sample_ranges): def test_concat(self, nevents, sample_job, sample_ranges): ranges_queue = EventRangeQueue() - ranges = list(sample_ranges.values())[0] + ranges = next(iter(sample_ranges.values())) ranges_queue.concat(ranges) assert ( len(ranges) @@ -127,7 +127,7 @@ def test_update( sample_failed_rangeupdate, ): pandaID = "0" - ranges = list(sample_ranges.values())[0] + ranges = next(iter(sample_ranges.values())) ranges_queue = EventRangeQueue.build_from_list(ranges) nsuccess = int(nevents / 2) @@ -152,7 +152,7 @@ def test_update( def test_get_next(self, sample_job, sample_ranges): ranges_queue = EventRangeQueue() assert not ranges_queue.get_next_ranges(10) - ranges = list(sample_ranges.values())[0] + ranges = next(iter(sample_ranges.values())) ranges_queue.concat(ranges) nranges = len(ranges_queue) nranges_requested = max(1, int(nranges / 3)) @@ -300,12 +300,12 @@ def test_process_event_ranges_update( class TestPandaJob: def test_build_pandajob(self, sample_job): - job_dict = list(sample_job.values())[0] + job_dict = next(iter(sample_job.values())) job = PandaJob(job_dict) for k in job_dict: 
assert k in job assert job_dict[k] == job[k] - assert job.get_id() == list(sample_job.keys())[0] + assert job.get_id() == next(iter(sample_job.keys())) job["k"] = "v" assert job["k"] == "v" diff --git a/tests/test_pilothttp.py b/tests/test_pilothttp.py index 68b24d7..60b974d 100644 --- a/tests/test_pilothttp.py +++ b/tests/test_pilothttp.py @@ -46,7 +46,7 @@ def payload(self, tmpdir, config, sample_job): cwd = os.getcwd() config.ray["workdir"] = str(tmpdir) os.chdir(tmpdir) - job_dict = list(sample_job.values())[0] + job_dict = next(iter(sample_job.values())) job = PandaJob(job_dict) payload = self.setup_payload(config) payload.start(job) @@ -58,7 +58,7 @@ def payload(self, tmpdir, config, sample_job): def test_getjob(self, payload, is_eventservice, config, sample_job): if not is_eventservice: pytest.skip() - job_dict = list(sample_job.values())[0] + job_dict = next(iter(sample_job.values())) job = PandaJob(job_dict) res = requests.post("http://127.0.0.1:8080/server/panda/getJob").json() assert job["PandaID"] == PandaJob(res)["PandaID"] @@ -125,7 +125,7 @@ def test_getranges( if not is_eventservice: pytest.skip() - job_dict = list(sample_job.values())[0] + job_dict = next(iter(sample_job.values())) job = PandaJob(job_dict) data = { @@ -138,7 +138,7 @@ def test_getranges( assert res["StatusCode"] == 500 assert payload.should_request_more_ranges() ranges = list() - for r in list(sample_ranges.values())[0]: + for r in next(iter(sample_ranges.values())): ranges.append(EventRange.build_from_dict(r)) payload.submit_new_ranges(ranges) payload.submit_new_ranges(None) diff --git a/tests/test_taskstatus.py b/tests/test_taskstatus.py index 18722c3..0a9aca5 100644 --- a/tests/test_taskstatus.py +++ b/tests/test_taskstatus.py @@ -5,9 +5,9 @@ class TestTaskStatus: def test_save_restore_status(self, nfiles, tmp_path, config, sample_job, sample_ranges): config.ray["outputdir"] = tmp_path - job = PandaJob(list(sample_job.values())[0]) + job = PandaJob(next(iter(sample_job.values()))) ts = TaskStatus(job, tmp_path, config) - ranges = list(sample_ranges.values())[0] + ranges = next(iter(sample_ranges.values())) hits_per_file = int(job["esmergeSpec"]["nEventsPerOutputFile"]) events_per_file = int(job["nEventsPerInputFile"]) assert events_per_file % hits_per_file == 0 @@ -35,10 +35,10 @@ def test_save_restore_status(self, nfiles, tmp_path, config, sample_job, sample_ def test_set_processed(self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges): config.ray["outputdir"] = tmp_path - job = PandaJob(list(sample_job.values())[0]) + job = PandaJob(next(iter(sample_job.values()))) ts = TaskStatus(job, tmp_path, config) - ranges_list = list(sample_ranges.values())[0] + ranges_list = next(iter(sample_ranges.values())) for r in ranges_list: ts.set_eventrange_simulated(EventRange.build_from_dict(r), "outputfile") @@ -50,10 +50,10 @@ def test_set_processed(self, nfiles, nevents, tmp_path, config, sample_job, samp def test_set_failed(self, nfiles, nevents, tmp_path, config, sample_job, sample_ranges): config.ray["outputdir"] = tmp_path - job = PandaJob(list(sample_job.values())[0]) + job = PandaJob(next(iter(sample_job.values()))) ts = TaskStatus(job, tmp_path, config) - ranges_list = list(sample_ranges.values())[0] + ranges_list = next(iter(sample_ranges.values())) for r in ranges_list: ts.set_eventrange_failed(EventRange.build_from_dict(r)) @@ -65,10 +65,10 @@ def test_set_failed(self, nfiles, nevents, tmp_path, config, sample_job, sample_ def test_set_merged(self, nfiles, nevents, tmp_path, config, 
sample_job, sample_ranges): config.ray["outputdir"] = tmp_path - job = PandaJob(list(sample_job.values())[0]) + job = PandaJob(next(iter(sample_job.values()))) ts = TaskStatus(job, tmp_path, config) - ranges = list(sample_ranges.values())[0] + ranges = next(iter(sample_ranges.values())) for e in ranges: er = EventRange.build_from_dict(e) ts.set_eventrange_simulated(er, f"outputfile-{er.eventRangeID}") From 56ec25b15068d7e10cc2487743e1cbb8203c6d0c Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 4 Oct 2024 16:15:51 -0700 Subject: [PATCH 13/14] fix ruff checks --- src/raythena/actors/esworker.py | 127 ++++++++++++++--------------- src/raythena/utils/config.py | 83 ++++++++++--------- src/raythena/utils/eventservice.py | 4 +- src/raythena/utils/exception.py | 25 +++--- 4 files changed, 118 insertions(+), 121 deletions(-) diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py index 959d73b..0156efa 100644 --- a/src/raythena/actors/esworker.py +++ b/src/raythena/actors/esworker.py @@ -38,6 +38,43 @@ # Type returned by the worker methods to the driver WorkerResponse = tuple[str, int, Any] +READY_FOR_JOB = 0 # initial state, before the first job request +JOB_REQUESTED = 1 # job has been requested to the driver, waiting for result +READY_FOR_EVENTS = 2 # ready to request new events for the current job +EVENT_RANGES_REQUESTED = 3 # event ranges have been requested to the driver, waiting for result +FINISHING_LOCAL_RANGES = 4 # do not request additional ranges, will move to STAGE_OUT once local cache is empty +PROCESSING = 5 # currently processing event ranges +FINISHING = 6 # Performing cleanup of resources, preparing final server update +DONE = 7 # Actor has finished processing job +STAGE_IN = 8 # Staging-in data. +STAGE_OUT = 9 # Staging-out data + +STATES_NAME = { + READY_FOR_JOB: "READY_FOR_JOB", + JOB_REQUESTED: "JOB_REQUESTED", + READY_FOR_EVENTS: "READY_FOR_EVENTS", + EVENT_RANGES_REQUESTED: "EVENT_RANGES_REQUESTED", + FINISHING_LOCAL_RANGES: "FINISHING_LOCAL_RANGES", + PROCESSING: "PROCESSING", + FINISHING: "FINISHING", + DONE: "DONE", + STAGE_IN: "STAGE_IN", + STAGE_OUT: "STAGE_OUT", +} + +# authorize state transition from x to y if y in TRANSITION[X] +TRANSITIONS = { + READY_FOR_JOB: [JOB_REQUESTED], + JOB_REQUESTED: [STAGE_IN, DONE], + STAGE_IN: [READY_FOR_EVENTS], + READY_FOR_EVENTS: [EVENT_RANGES_REQUESTED, STAGE_OUT], + EVENT_RANGES_REQUESTED: [FINISHING_LOCAL_RANGES, PROCESSING, STAGE_OUT], + FINISHING_LOCAL_RANGES: [STAGE_OUT], + PROCESSING: [READY_FOR_EVENTS, STAGE_OUT], + STAGE_OUT: [FINISHING], + FINISHING: [DONE], + DONE: [READY_FOR_JOB], +} @ray.remote(num_cpus=1, max_restarts=1, max_task_retries=3) class ESWorker: @@ -47,51 +84,13 @@ class ESWorker: A worker instance is a stateful object which basically transitions from job request -> stage-in -> processing <-> ranges request -> stage-out -> done - Allowed transition are defined by ESWorker.TRANSITIONS + Allowed transition are defined by TRANSITIONS The current state defines what message will be sent to the driver when it requests the worker state using get_message(). The driver needs to frequently call get_message() and process requests from the worker, allowing the worker to progress in the job processing. 
""" - READY_FOR_JOB = 0 # initial state, before the first job request - JOB_REQUESTED = 1 # job has been requested to the driver, waiting for result - READY_FOR_EVENTS = 2 # ready to request new events for the current job - EVENT_RANGES_REQUESTED = 3 # event ranges have been requested to the driver, waiting for result - FINISHING_LOCAL_RANGES = 4 # do not request additional ranges, will move to STAGE_OUT once local cache is empty - PROCESSING = 5 # currently processing event ranges - FINISHING = 6 # Performing cleanup of resources, preparing final server update - DONE = 7 # Actor has finished processing job - STAGE_IN = 8 # Staging-in data. - STAGE_OUT = 9 # Staging-out data - - STATES_NAME = { - READY_FOR_JOB: "READY_FOR_JOB", - JOB_REQUESTED: "JOB_REQUESTED", - READY_FOR_EVENTS: "READY_FOR_EVENTS", - EVENT_RANGES_REQUESTED: "EVENT_RANGES_REQUESTED", - FINISHING_LOCAL_RANGES: "FINISHING_LOCAL_RANGES", - PROCESSING: "PROCESSING", - FINISHING: "FINISHING", - DONE: "DONE", - STAGE_IN: "STAGE_IN", - STAGE_OUT: "STAGE_OUT", - } - - # authorize state transition from x to y if y in TRANSITION[X] - TRANSITIONS = { - READY_FOR_JOB: [JOB_REQUESTED], - JOB_REQUESTED: [STAGE_IN, DONE], - STAGE_IN: [READY_FOR_EVENTS], - READY_FOR_EVENTS: [EVENT_RANGES_REQUESTED, STAGE_OUT], - EVENT_RANGES_REQUESTED: [FINISHING_LOCAL_RANGES, PROCESSING, STAGE_OUT], - FINISHING_LOCAL_RANGES: [STAGE_OUT], - PROCESSING: [READY_FOR_EVENTS, STAGE_OUT], - STAGE_OUT: [FINISHING], - FINISHING: [DONE], - DONE: [READY_FOR_JOB], - } - def __init__( self, actor_id: str, @@ -119,9 +118,9 @@ def __init__( self._logger = make_logger(self.config, self.id) self.session_log_dir = session_log_dir self.job = None - self.transitions = ESWorker.TRANSITIONS + self.transitions = TRANSITIONS self.node_ip = get_node_ip() - self.state = ESWorker.READY_FOR_JOB + self.state = READY_FOR_JOB self.payload_job_dir = None self.payload_actor_output_dir = None self.payload_actor_process_dir = None @@ -141,10 +140,10 @@ def __init__( self.time_limit = -1 self.elapsed = 1 if job: - self.transition_state(ESWorker.JOB_REQUESTED) + self.transition_state(JOB_REQUESTED) self.receive_job(Messages.REPLY_OK, job) if event_ranges: - self.transition_state(ESWorker.EVENT_RANGES_REQUESTED) + self.transition_state(EVENT_RANGES_REQUESTED) self.receive_event_ranges(Messages.REPLY_OK, event_ranges) def check_time(self) -> None: @@ -296,7 +295,7 @@ def stagein(self) -> None: except Exception as e: self._logger.warning(f"Failed to stagein payload: {e}") raise StageInFailed(self.id) from e - self.transition_state(ESWorker.READY_FOR_EVENTS if self.is_event_service_job() else ESWorker.PROCESSING) + self.transition_state(READY_FOR_EVENTS if self.is_event_service_job() else PROCESSING) def stageout(self) -> None: """ @@ -310,7 +309,7 @@ def stageout(self) -> None: - The worker is in the DONE state. 
""" self.payload.stageout() - self.transition_state(ESWorker.FINISHING) + self.transition_state(FINISHING) self.terminate_actor() def transition_state(self, dest: int) -> None: @@ -325,12 +324,12 @@ def transition_state(self, dest: int) -> None: """ if dest not in self.transitions[self.state]: self._logger.error( - f"Illegal transition from {ESWorker.STATES_NAME[self.state]} to {ESWorker.STATES_NAME[dest]}" + f"Illegal transition from {STATES_NAME[self.state]} to {STATES_NAME[dest]}" ) raise IllegalWorkerState( worker_id=self.id, - src_state=ESWorker.STATES_NAME[self.state], - dst_state=ESWorker.STATES_NAME[dest], + src_state=STATES_NAME[self.state], + dst_state=STATES_NAME[dest], ) self.state = dest @@ -362,7 +361,7 @@ def receive_job(self, reply: int, job: PandaJob) -> WorkerResponse: """ self.job = job if reply == Messages.REPLY_OK and self.job: - self.transition_state(ESWorker.STAGE_IN) + self.transition_state(STAGE_IN) try: self.stagein() except BaseRaythenaException: @@ -370,7 +369,7 @@ def receive_job(self, reply: int, job: PandaJob) -> WorkerResponse: except Exception as e: raise WrappedException(self.id, e) from e else: - self.transition_state(ESWorker.DONE) + self.transition_state(DONE) self._logger.error("Could not fetch job. Set state to done.") return self.return_message(Messages.REPLY_OK) @@ -387,8 +386,8 @@ def mark_new_job(self) -> WorkerResponse: """ # TODO: either remove this functionality (event service workers will only ever have one job) # TODO: or finish the implementation by also cleaning up the filesystem - self.transition_state(ESWorker.READY_FOR_JOB) - self.transition_state(ESWorker.JOB_REQUESTED) + self.transition_state(READY_FOR_JOB) + self.transition_state(JOB_REQUESTED) return self.return_message(Messages.REQUEST_NEW_JOB) def receive_event_ranges(self, reply: int, event_ranges: Sequence[EventRange]) -> WorkerResponse: @@ -414,7 +413,7 @@ def receive_event_ranges(self, reply: int, event_ranges: Sequence[EventRange]) - """ if reply == Messages.REPLY_NO_MORE_EVENT_RANGES or not event_ranges: # no new ranges... 
finish processing local cache then terminate actor - self.transition_state(ESWorker.FINISHING_LOCAL_RANGES) + self.transition_state(FINISHING_LOCAL_RANGES) self.payload.submit_new_ranges(None) return self.return_message(Messages.REPLY_OK) for crange in event_ranges: @@ -425,7 +424,7 @@ def receive_event_ranges(self, reply: int, event_ranges: Sequence[EventRange]) - ) self.payload.submit_new_ranges(event_ranges) - self.transition_state(ESWorker.PROCESSING) + self.transition_state(PROCESSING) return self.return_message(Messages.REPLY_OK) def return_message(self, message: int, data: Any = None) -> WorkerResponse: @@ -460,7 +459,7 @@ def terminate_actor(self) -> None: """ self.payload.stop() # self.cpu_monitor.stop() - self.transition_state(ESWorker.DONE) + self.transition_state(DONE) def should_request_ranges(self) -> bool: """ @@ -474,12 +473,12 @@ def should_request_ranges(self) -> bool: True if more event ranges are needed by the payload """ # do not transition if not in a state allowing for event ranges request - if ESWorker.READY_FOR_EVENTS not in self.transitions[self.state]: + if READY_FOR_EVENTS not in self.transitions[self.state]: return False res = self.payload.should_request_more_ranges() if res: - self.transition_state(ESWorker.READY_FOR_EVENTS) + self.transition_state(READY_FOR_EVENTS) return res def stageout_event_service_files(self, ranges_update: Mapping[str, str]) -> Optional[EventRangeUpdate]: @@ -558,13 +557,13 @@ def get_message(self) -> WorkerResponse: to the worker or if the worker produced output data. """ try: - while self.state != ESWorker.DONE: + while self.state != DONE: payload_message = self.get_payload_message() if payload_message: return payload_message - elif self.state == ESWorker.READY_FOR_JOB: + elif self.state == READY_FOR_JOB: # ready to get a new job - self.transition_state(ESWorker.JOB_REQUESTED) + self.transition_state(JOB_REQUESTED) return self.return_message(Messages.REQUEST_NEW_JOB) elif self.payload.is_complete(): # check if there are any remaining message from the payload in queue. @@ -574,11 +573,11 @@ def get_message(self) -> WorkerResponse: return payload_message else: # if no more message, proceed to stage-out - self.transition_state(ESWorker.STAGE_OUT) + self.transition_state(STAGE_OUT) self.stageout() return self.return_message(Messages.PROCESS_DONE) elif self.is_event_service_job() and ( - self.state == ESWorker.READY_FOR_EVENTS or self.should_request_ranges() + self.state == READY_FOR_EVENTS or self.should_request_ranges() ): req = EventRangeRequest() req.add_event_request( @@ -587,9 +586,9 @@ def get_message(self) -> WorkerResponse: self.job["taskID"], self.job["jobsetID"], ) - self.transition_state(ESWorker.EVENT_RANGES_REQUESTED) + self.transition_state(EVENT_RANGES_REQUESTED) return self.return_message(Messages.REQUEST_EVENT_RANGES, req) - elif self.state == ESWorker.DONE: + elif self.state == DONE: return self.return_message(Messages.PROCESS_DONE) else: time.sleep(1) # Nothing to do, sleeping... 
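The esworker.py changes above only relocate the state constants, STATES_NAME and TRANSITIONS to module scope; the state machine itself is unchanged. As a rough, standalone sketch of how that table gates state changes, using only a subset of the states shown above (the check_transition helper is hypothetical and merely mirrors the "dest not in TRANSITIONS[state]" test performed by transition_state; it is not part of raythena):

    # Illustrative subset of the module-level state machine from esworker.py.
    READY_FOR_JOB = 0
    JOB_REQUESTED = 1
    READY_FOR_EVENTS = 2
    DONE = 7
    STAGE_IN = 8

    STATES_NAME = {
        READY_FOR_JOB: "READY_FOR_JOB",
        JOB_REQUESTED: "JOB_REQUESTED",
        READY_FOR_EVENTS: "READY_FOR_EVENTS",
        DONE: "DONE",
        STAGE_IN: "STAGE_IN",
    }

    # y is a legal successor of x if and only if y is in TRANSITIONS[x]
    TRANSITIONS = {
        READY_FOR_JOB: [JOB_REQUESTED],
        JOB_REQUESTED: [STAGE_IN, DONE],
        STAGE_IN: [READY_FOR_EVENTS],
    }

    def check_transition(state: int, dest: int) -> int:
        """Hypothetical helper: return dest if the move is allowed, raise otherwise."""
        if dest not in TRANSITIONS.get(state, []):
            raise ValueError(f"Illegal transition from {STATES_NAME[state]} to {STATES_NAME[dest]}")
        return dest

    state = READY_FOR_JOB
    state = check_transition(state, JOB_REQUESTED)  # allowed: job requested
    state = check_transition(state, STAGE_IN)       # allowed: job received, staging in
    # check_transition(state, DONE) would raise: STAGE_IN only allows READY_FOR_EVENTS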
diff --git a/src/raythena/utils/config.py b/src/raythena/utils/config.py
index 2dc54b2..57b70cf 100644
--- a/src/raythena/utils/config.py
+++ b/src/raythena/utils/config.py
@@ -1,6 +1,45 @@
 import os
 
 import yaml
 
+required_conf_settings = {
+    "payload": {
+        "pandaqueue": str,
+        "logfilename": str,
+        "extrasetup": str,
+        "hpcresource": str,
+        "extrapostpayload": str,
+        "containerengine": str,
+        "containerextraargs": str,
+        "containerextrasetup": str,
+        "pilotkillfile": str,
+        "pilotversion": str,
+        "pilotkilltime": int,
+        "timemonitorfile": str,
+    },
+    "harvester": {
+        "endpoint": str,
+        "harvesterconf": str,
+    },
+    "ray": {
+        "workdir": str,
+        "taskprogressbasedir": str,
+        "headip": str,
+        "redisport": int,
+        "redispassword": str,
+        "timeoutinterval": int,
+        "mergemaxprocesses": int,
+        "cachesizefactor": int,
+    },
+    "resources": {
+        "corepernode": int,
+    },
+    "logging": {
+        "level": str,
+        "driverlogfile": str,
+        "workerlogfile": str,
+        "copyraylogs": bool,
+    },
+}
 
 class Config:
     """Class storing app configuration.
@@ -10,50 +49,10 @@ class Config:
     Note that not all arguments can be specified using cli or env variable, some of them can only be specified
     from the conf file. See the file for more information about which settings can be specified using cli.
 
     Any parameter can be specified in the config file, the only constraint checked being that
-    attributes in Config.required_conf_settings should be present in the config file. This allows to specify
+    attributes in required_conf_settings should be present in the config file. This allows specifying
     custom settings for plugins if necessary.
     """
 
-    required_conf_settings = {
-        "payload": {
-            "pandaqueue": str,
-            "logfilename": str,
-            "extrasetup": str,
-            "hpcresource": str,
-            "extrapostpayload": str,
-            "containerengine": str,
-            "containerextraargs": str,
-            "containerextrasetup": str,
-            "pilotkillfile": str,
-            "pilotversion": str,
-            "pilotkilltime": int,
-            "timemonitorfile": str,
-        },
-        "harvester": {
-            "endpoint": str,
-            "harvesterconf": str,
-        },
-        "ray": {
-            "workdir": str,
-            "taskprogressbasedir": str,
-            "headip": str,
-            "redisport": int,
-            "redispassword": str,
-            "timeoutinterval": int,
-            "mergemaxprocesses": int,
-            "cachesizefactor": int,
-        },
-        "resources": {
-            "corepernode": int,
-        },
-        "logging": {
-            "level": str,
-            "driverlogfile": str,
-            "workerlogfile": str,
-            "copyraylogs": bool,
-        },
-    }
-
     def __init__(self, config_path: str, *args, **kwargs) -> None:
         """Parse the config file to an object
@@ -180,7 +179,7 @@ def _validate(self) -> None:
         for (
             template_section,
             template_params,
-        ) in Config.required_conf_settings.items():
+        ) in required_conf_settings.items():
             section_params = getattr(self, template_section, None)
             if section_params is None:
                 raise Exception(f"Malformed configuration file: section '{template_section}' not found")
diff --git a/src/raythena/utils/eventservice.py b/src/raythena/utils/eventservice.py
index beefa1f..5c89904 100644
--- a/src/raythena/utils/eventservice.py
+++ b/src/raythena/utils/eventservice.py
@@ -1072,8 +1072,8 @@ class EventRange:
     DONE = "finished"
     FAILED = "failed"
     FATAL = "fatal"
-    STATES = [READY, ASSIGNED, DONE, FAILED, FATAL]
-
+    STATES = frozenset([READY, ASSIGNED, DONE, FAILED, FATAL])
+
     def __init__(
         self,
         event_range_id: str,
diff --git a/src/raythena/utils/exception.py b/src/raythena/utils/exception.py
index e6f4f4a..aca2ed4 100644
--- a/src/raythena/utils/exception.py
+++ b/src/raythena/utils/exception.py
@@ -2,19 +2,13 @@
 from queue import Empty, Queue
 from typing import Optional
 
+ILLEGAL_WORKER_STATE = 20
+STAGEIN_FAILED = 30
+STAGEOUT_FAILED = 40
+PAYLOAD_FAILED = 50
+UNKNOWN = 0
 
-class ErrorCodes:
-    """
-    Defines error codes constants and associated default error message for each error code
-    """
-
-    ILLEGAL_WORKER_STATE = 20
-    STAGEIN_FAILED = 30
-    STAGEOUT_FAILED = 40
-    PAYLOAD_FAILED = 50
-    UNKNOWN = 0
-
-    ERROR_CODES_GENRIC_MESSAGES = {
+ERROR_CODES_GENRIC_MESSAGES = {
         ILLEGAL_WORKER_STATE: "Illegal worker state transition",
         STAGEIN_FAILED: "Failed to stagein data",
         STAGEOUT_FAILED: "Failed to stageout data",
         PAYLOAD_FAILED: "Payload execution failed",
@@ -22,6 +16,11 @@ class ErrorCodes:
         UNKNOWN: "Unknown error",
     }
+class ErrorCodes:
+    """
+    Defines error codes constants and associated default error message for each error code
+    """
+
 
     @staticmethod
     def get_error_message(error_code: int) -> str:
         """
@@ -33,7 +32,7 @@ def get_error_message(error_code: int) -> str:
         Returns:
             The default error message
         """
-        return ErrorCodes.ERROR_CODES_GENRIC_MESSAGES.get(error_code, "")
+        return ERROR_CODES_GENRIC_MESSAGES.get(error_code, "")
 
 
 class ExThread(threading.Thread):

From 046646d075f6de725a269d84857f19224cbbdcc4 Mon Sep 17 00:00:00 2001
From: Julien Esseiva
Date: Fri, 4 Oct 2024 16:16:04 -0700
Subject: [PATCH 14/14] format

---
 src/raythena/actors/esworker.py    |  9 +++------
 src/raythena/utils/config.py       |  1 +
 src/raythena/utils/eventservice.py |  2 +-
 src/raythena/utils/exception.py    | 13 +++++++------
 4 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/src/raythena/actors/esworker.py b/src/raythena/actors/esworker.py
index 0156efa..f13e5f6 100644
--- a/src/raythena/actors/esworker.py
+++ b/src/raythena/actors/esworker.py
@@ -76,6 +76,7 @@
     DONE: [READY_FOR_JOB],
 }
+
 
 @ray.remote(num_cpus=1, max_restarts=1, max_task_retries=3)
 class ESWorker:
     """
@@ -323,9 +324,7 @@ def transition_state(self, dest: int) -> None:
             IllegalWorkerState if the transition isn't allowed
         """
         if dest not in self.transitions[self.state]:
-            self._logger.error(
-                f"Illegal transition from {STATES_NAME[self.state]} to {STATES_NAME[dest]}"
-            )
+            self._logger.error(f"Illegal transition from {STATES_NAME[self.state]} to {STATES_NAME[dest]}")
             raise IllegalWorkerState(
                 worker_id=self.id,
                 src_state=STATES_NAME[self.state],
@@ -576,9 +575,7 @@ def get_message(self) -> WorkerResponse:
                         self.transition_state(STAGE_OUT)
                         self.stageout()
                         return self.return_message(Messages.PROCESS_DONE)
-                elif self.is_event_service_job() and (
-                    self.state == READY_FOR_EVENTS or self.should_request_ranges()
-                ):
+                elif self.is_event_service_job() and (self.state == READY_FOR_EVENTS or self.should_request_ranges()):
                     req = EventRangeRequest()
                     req.add_event_request(
                         self.job["PandaID"],
diff --git a/src/raythena/utils/config.py b/src/raythena/utils/config.py
index 57b70cf..6a95457 100644
--- a/src/raythena/utils/config.py
+++ b/src/raythena/utils/config.py
@@ -41,6 +41,7 @@
     },
 }
+
 
 class Config:
     """Class storing app configuration.
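Both patches keep the module-level required_conf_settings template introduced earlier as the description of what a raythena configuration file must contain, and Config._validate walks that mapping and raises on a missing section. A minimal sketch of that idea, assuming a small subset of the sections shown in the template and placeholder values; the validate_sections helper, the per-setting check, and the sample dictionary are illustrative only, not raythena's actual API:

    # Illustrative template: section -> required settings, following required_conf_settings.
    required_template = {
        "harvester": {"endpoint": str, "harvesterconf": str},
        "resources": {"corepernode": int},
    }

    # Stand-in for a parsed YAML configuration (values are placeholders).
    parsed_config = {
        "harvester": {"endpoint": "/tmp/harvester", "harvesterconf": "/tmp/harvester.conf"},
        "resources": {"corepernode": 64},
    }

    def validate_sections(conf: dict, template: dict) -> None:
        """Hypothetical check: every required section and setting must be present."""
        for section, params in template.items():
            if section not in conf:
                raise ValueError(f"Malformed configuration file: section '{section}' not found")
            for setting in params:
                if setting not in conf[section]:
                    raise ValueError(f"Malformed configuration file: setting '{setting}' missing from section '{section}'")

    validate_sections(parsed_config, required_template)  # passes
    # Deleting parsed_config["resources"]["corepernode"] would make the call raise ValueError.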
diff --git a/src/raythena/utils/eventservice.py b/src/raythena/utils/eventservice.py
index 5c89904..4e14c5f 100644
--- a/src/raythena/utils/eventservice.py
+++ b/src/raythena/utils/eventservice.py
@@ -1073,7 +1073,7 @@ class EventRange:
     FAILED = "failed"
     FATAL = "fatal"
     STATES = frozenset([READY, ASSIGNED, DONE, FAILED, FATAL])
-
+
     def __init__(
         self,
         event_range_id: str,
diff --git a/src/raythena/utils/exception.py b/src/raythena/utils/exception.py
index aca2ed4..eff9a0c 100644
--- a/src/raythena/utils/exception.py
+++ b/src/raythena/utils/exception.py
@@ -9,12 +9,13 @@
 UNKNOWN = 0
 
 ERROR_CODES_GENRIC_MESSAGES = {
-        ILLEGAL_WORKER_STATE: "Illegal worker state transition",
-        STAGEIN_FAILED: "Failed to stagein data",
-        STAGEOUT_FAILED: "Failed to stageout data",
-        PAYLOAD_FAILED: "Payload execution failed",
-        UNKNOWN: "Unknown error",
-    }
+    ILLEGAL_WORKER_STATE: "Illegal worker state transition",
+    STAGEIN_FAILED: "Failed to stagein data",
+    STAGEOUT_FAILED: "Failed to stageout data",
+    PAYLOAD_FAILED: "Payload execution failed",
+    UNKNOWN: "Unknown error",
+}
+
 
 class ErrorCodes:
     """