From bf83dbedf574d8081dbdee81c77fac6eb935eb3b Mon Sep 17 00:00:00 2001 From: Kyle Conroy Date: Fri, 24 Jun 2022 15:00:33 -0400 Subject: [PATCH] RemoteSlurmJob: add support for passing addl_slurm_kwargs (0.2.0 release) (#4) * RemoteSlurm: implement addl_slurm_kwargs --- crimpl/common.py | 24 ++++++++++++++++-------- crimpl/remoteslurm.py | 19 +++++++++++++++++-- setup.py | 4 ++-- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/crimpl/common.py b/crimpl/common.py index 8f9e5bc..12e75e5 100644 --- a/crimpl/common.py +++ b/crimpl/common.py @@ -5,7 +5,7 @@ import json as _json from time import sleep as _sleep -__version__ = '0.1.0' +__version__ = '0.2.0' def _new_job_name(): return _datetime.now().strftime('%Y.%m.%d-%H.%M.%S') @@ -18,9 +18,9 @@ def _run_cmd(cmd, detach=False, log_cmd=True, allow_retries=True): while True: try: if detach: - ret = _subprocess.Popen(cmd, shell=True, stderr=_subprocess.DEVNULL) + ret = _subprocess.Popen(cmd, shell=True, stderr=_subprocess.STDOUT) else: - ret = _subprocess.check_output(cmd, shell=True, stderr=_subprocess.DEVNULL).decode('utf-8').strip() + ret = _subprocess.check_output(cmd, shell=True, stderr=_subprocess.STDOUT).decode('utf-8').strip() except _subprocess.CalledProcessError as err: # print("error output: {}".format(err.output)) if allow_retries and err.returncode == 255 and i < 5: @@ -315,10 +315,18 @@ def _submit_script_cmds(self, script, files, ignore_files, elif not self.conda_installed and conda_env is not False: raise ValueError("conda is not installed on the remote server. 
Install manually or call server.install_conda()") - _slurm_kwarg_to_prefix = {'nprocs': '-n ', - 'walltime': '-t ', - 'mail_type': '--mail-type=', - 'mail_user': '--mail-user='} + def _slurm_kwarg_to_prefix(k): + exceptions = {'nprocs': '-n ', + 'walltime': '-t ', + 'mail_type': '--mail-type=', + 'mail_user': '--mail-user='} + if k in exceptions.keys(): + return exceptions.get(k) + elif len(k) == 1: + return f"-{k} " + else: + return f"--{k}=" + create_env_cmd, conda_env_path = self._create_conda_env(conda_env, isolate_env, job_name=job_name, check_if_exists=True, run_cmd=False) @@ -336,7 +344,7 @@ def _submit_script_cmds(self, script, files, ignore_files, sched_script += ["#SBATCH -J {}".format(job_name)] for k,v in sched_kwargs.items(): if v is None: continue - prefix = _slurm_kwarg_to_prefix.get(k, False) + prefix = _slurm_kwarg_to_prefix(k) if prefix is False: raise NotImplementedError("slurm command for {} not implemented".format(k)) if k=='mail_type' and isinstance(v, list): diff --git a/crimpl/remoteslurm.py b/crimpl/remoteslurm.py index c88cf6b..e5e4bc6 100644 --- a/crimpl/remoteslurm.py +++ b/crimpl/remoteslurm.py @@ -258,6 +258,7 @@ def submit_script(self, script, files=[], walltime='2-00:00:00', mail_type='END,FAIL', mail_user=None, + addl_slurm_kwargs={}, ignore_files=[], wait_for_job_status=False, trial_run=False): @@ -301,6 +302,10 @@ def submit_script(self, script, files=[], * `mail_user` (string, optional, default=None): email to send notifications. If not provided or None, will default to the value in `server.mail_user`. Prepended to `script` as "#SBATCH --mail_user=mail_user" + * `addl_slurm_kwargs` (dict, optional, default={}): additional kwargs + to pass to slurm. Entries will be prepended to `script` as + "#SBATCH -<k> <v>" or "#SBATCH --<k>=<v>" depending on whether the + key (`k`) is a single character or multiple characters, respectively. 
* `ignore_files` (list, optional, default=[]): list of filenames on the remote server to ignore when calling <.check_output> * `wait_for_job_status` (bool or string or list, optional, default=False): @@ -341,7 +346,8 @@ def submit_script(self, script, files=[], nprocs=nprocs, walltime=walltime, mail_type=mail_type, - mail_user=mail_user if mail_user is not None else self.server.mail_user) + mail_user=mail_user if mail_user is not None else self.server.mail_user, + **addl_slurm_kwargs) if trial_run: return cmds @@ -351,7 +357,13 @@ def submit_script(self, script, files=[], # TODO: get around need to add IP to known hosts (either by # expecting and answering yes, or by looking into subnet options) - out = self.server._run_server_cmd(cmd) + try: + out = self.server._run_server_cmd(cmd) + except _subprocess.CalledProcessError as e: + if addl_slurm_kwargs: + raise ValueError(f"failed to submit to scheduler, addl_slurm_kwargs may be invalid. Original error: {e.output}") + raise ValueError(f"failed to submit to scheduler. 
Original error: {e.output}") + if "sbatch" in cmd: self._slurm_id = out.split(' ')[-1] @@ -507,6 +519,7 @@ def submit_job(self, script, files=[], walltime='2-00:00:00', mail_type='END,FAIL', mail_user=None, + addl_slurm_kwargs={}, ignore_files=[], wait_for_job_status=False, trial_run=False): @@ -525,6 +538,7 @@ def submit_job(self, script, files=[], * `walltime`: passed to * `mail_type`: passed to * `mail_user`: passed to + * `addl_slurm_kwargs`: passed to * `ignore_files`: passed to * `wait_for_job_status`: passed to * `trial_run`: passed to @@ -543,6 +557,7 @@ def submit_job(self, script, files=[], walltime=walltime, mail_type=mail_type, mail_user=mail_user, + addl_slurm_kwargs=addl_slurm_kwargs, ignore_files=ignore_files, wait_for_job_status=wait_for_job_status, trial_run=trial_run) diff --git a/setup.py b/setup.py index 6fab803..c463c37 100644 --- a/setup.py +++ b/setup.py @@ -6,14 +6,14 @@ long_description = fh.read() setup(name='crimpl', - version='0.1.0', + version='0.2.0', description='Compute Resources Made Simple', long_description=long_description, long_description_content_type="text/markdown", author='Kyle Conroy', author_email='kyleconroy@gmail.com', url='https://www.github.com/kecnry/crimpl', - download_url = 'https://github.com/kecnry/crimpl/tarball/0.1.0', + download_url = 'https://github.com/kecnry/crimpl/tarball/0.2.0', packages=['crimpl'], install_requires=['boto3'], classifiers=[