From 799cc56ef86046292f2e40ac52ff892e17185ab2 Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Mon, 10 Jul 2023 21:29:10 -0500 Subject: [PATCH 01/26] config: Remove symlinks and load config dynamically Use the `MCSCRIPT_CONFIG` environment variable to specify a file or module which will provide the cluster configuration. --- .../config-torque-oak/mpiexec_man.txt | 0 .../config-torque-oak/qsub_man.txt | 0 mcscript/config.py | 1 - mcscript/config/__init__.py | 49 +++++++++++++++++++ .../config-ompi.py => mcscript/config/ompi.py | 2 +- .../config/slurm_nersc.py | 10 ++-- .../config/torque_oak.py | 2 +- .../config/uge_ndcrc.py | 2 +- 8 files changed, 58 insertions(+), 8 deletions(-) rename {config/doc => doc}/config-torque-oak/mpiexec_man.txt (100%) rename {config/doc => doc}/config-torque-oak/qsub_man.txt (100%) delete mode 120000 mcscript/config.py create mode 100644 mcscript/config/__init__.py rename config/config-ompi.py => mcscript/config/ompi.py (99%) rename config/config-slurm-nersc.py => mcscript/config/slurm_nersc.py (99%) rename config/config-torque-oak.py => mcscript/config/torque_oak.py (99%) rename config/config-uge-ndcrc.py => mcscript/config/uge_ndcrc.py (99%) diff --git a/config/doc/config-torque-oak/mpiexec_man.txt b/doc/config-torque-oak/mpiexec_man.txt similarity index 100% rename from config/doc/config-torque-oak/mpiexec_man.txt rename to doc/config-torque-oak/mpiexec_man.txt diff --git a/config/doc/config-torque-oak/qsub_man.txt b/doc/config-torque-oak/qsub_man.txt similarity index 100% rename from config/doc/config-torque-oak/qsub_man.txt rename to doc/config-torque-oak/qsub_man.txt diff --git a/mcscript/config.py b/mcscript/config.py deleted file mode 120000 index 3721332..0000000 --- a/mcscript/config.py +++ /dev/null @@ -1 +0,0 @@ -../config.py \ No newline at end of file diff --git a/mcscript/config/__init__.py b/mcscript/config/__init__.py new file mode 100644 index 0000000..f8a9653 --- /dev/null +++ b/mcscript/config/__init__.py @@ -0,0 
+1,49 @@ +"""config -- load cluster configuration + + Language: Python 3 + + Patrick J. Fasano + Physics Division, Argonne National Laboratory + + + 07/10/23 (pjf): Created. +""" + +import importlib, importlib.util +import os +import sys + + +################################################################ +# cluster configuration module +################################################################ + +# get cluster config parameter from environment +__cluster_config_name = os.environ.get("MCSCRIPT_CONFIG") +if not __cluster_config_name: + raise RuntimeError("MCSCRIPT_CONFIG not defined") + +# get ModuleSpec (either from explicit filename or from module name) +if os.path.exists(__cluster_config_name): + __spec = importlib.util.spec_from_file_location( + "mcscript.config.cluster_config", __cluster_config_name + ) +elif (__spec := importlib.util.find_spec(__cluster_config_name)) is not None: + pass +else: + raise ModuleNotFoundError( + f"MCSCRIPT_CONFIG not found: {__cluster_config_name}", + name=__cluster_config_name + ) + +# import cluster config module +cluster_config = importlib.util.module_from_spec(__spec) +sys.modules["mcscript.config.cluster_config"] = cluster_config +__spec.loader.exec_module(cluster_config) + + +# delegate to cluster_config module +def __getattr__(name:str): + try: + return globals()[name] + except KeyError: + return getattr(cluster_config, name) diff --git a/config/config-ompi.py b/mcscript/config/ompi.py similarity index 99% rename from config/config-ompi.py rename to mcscript/config/ompi.py index 758f3e6..e4ea4d5 100644 --- a/config/config-ompi.py +++ b/mcscript/config/ompi.py @@ -18,7 +18,7 @@ import signal import sys -from . import ( +from .. 
import ( parameters, utils, ) diff --git a/config/config-slurm-nersc.py b/mcscript/config/slurm_nersc.py similarity index 99% rename from config/config-slurm-nersc.py rename to mcscript/config/slurm_nersc.py index 11ad369..f2d9495 100644 --- a/config/config-slurm-nersc.py +++ b/mcscript/config/slurm_nersc.py @@ -77,10 +77,12 @@ import re from tabnanny import verbose -from . import control -from . import exception -from . import parameters -from . import utils +from .. import ( + control, + exception, + parameters, + utils, +) cluster_specs = { diff --git a/config/config-torque-oak.py b/mcscript/config/torque_oak.py similarity index 99% rename from config/config-torque-oak.py rename to mcscript/config/torque_oak.py index c208110..ea2b573 100644 --- a/config/config-torque-oak.py +++ b/mcscript/config/torque_oak.py @@ -19,7 +19,7 @@ import math import os -from . import parameters +from .. import parameters ################################################################ ################################################################ diff --git a/config/config-uge-ndcrc.py b/mcscript/config/uge_ndcrc.py similarity index 99% rename from config/config-uge-ndcrc.py rename to mcscript/config/uge_ndcrc.py index a7abda6..83365bf 100644 --- a/config/config-uge-ndcrc.py +++ b/mcscript/config/uge_ndcrc.py @@ -70,7 +70,7 @@ import math import os -from . import parameters +from .. import parameters queues = { From d7b18392350fb8f42d5a957979ea79e8bfa2efd1 Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Thu, 13 Jul 2023 17:42:25 -0500 Subject: [PATCH 02/26] qsubm: Modify to work in non-editable mode * Use `entry_points` to install `qsubm` command. * Move `qsubm.py` into `mcscript` module. * Remove references to `MCSCRIPT_DIR`. * Move wrapper scripts into `mcscript` module, and install using `package_data`. * Get path to wrappers via `pkg_resources.resource_filename()`. 
--- INSTALL.md | 82 ++- mcscript/config/ompi.py | 4 +- mcscript/config/slurm_nersc.py | 24 +- mcscript/config/torque_oak.py | 27 +- mcscript/config/uge_ndcrc.py | 24 +- .../job_wrappers/bash_job_wrapper.sh | 0 .../job_wrappers/csh_job_wrapper.csh | 0 mcscript/qsubm.py | 452 ++++++++++++++++ mcscript_init.sh | 1 - qsubm.py | 484 ------------------ setup.py | 12 +- tools/qsubm | 3 - 12 files changed, 546 insertions(+), 567 deletions(-) rename bash_job_wrapper.sh => mcscript/job_wrappers/bash_job_wrapper.sh (100%) rename csh_job_wrapper.csh => mcscript/job_wrappers/csh_job_wrapper.csh (100%) create mode 100644 mcscript/qsubm.py delete mode 120000 mcscript_init.sh delete mode 100644 qsubm.py delete mode 100755 tools/qsubm diff --git a/INSTALL.md b/INSTALL.md index e70fa09..6f18c0e 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -13,6 +13,7 @@ Department of Physics, University of Notre Dame + 01/01/18 (pjf): Update for installation with `pip`. + 02/06/18 (pjf): Update MCSCRIPT_SOURCE file path. + 02/09/18 (mac): Overhaul configuration instructions. ++ 07/10/23 (pjf): Update for `MCSCRIPT_CONFIG` variable. ---------------------------------------------------------------- @@ -43,77 +44,72 @@ Department of Physics, University of Notre Dame Set up the package in your `PYTHONPATH` by running `pip`: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - % python3 -m pip install --user --editable . + % python3 -m pip install --user . ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Note that the `.` here means to install the Python package defined by the code - in the current directory. + in the current directory. If you are actively developing `mcscript` itself, + you may want to pass the `--editable` flag to `pip`. a. Subsequently updating source: ~~~~~~~~~~~~~~~~ % git pull - % python3 -m pip install --user --editable . + % python3 -m pip install --user . 
~~~~~~~~~~~~~~~~ - This subsequent `pip install`, when updating the source code, is a precaution - in case, e.g., the package dependencies have changed. - # 2. Local configuration - The local configuration file provides functions which construct the batch + The local configuration module provides functions which construct the batch submission (qsub, sbatch, etc.) command lines and and serial/parallel execution launch (mpiexec, srun, etc.) invocations appropriate to your - cluster and running needs. You need to create a symbolic link `config.py` to - point to the correct configuration file for the system or cluster you are - running on. + cluster and running needs. You must define the `MCSCRIPT_CONFIG` environment + variable to specify the correct configuration module for the system or + cluster you are running on. If you are only doing *local* runs (i.e., no batch job submission) on your - laptop/workstation, and if you are using with OpenMPI as your MPI - implementation, you can use the generic configuration file config-ompi.py in - mcscript/config: + laptop/workstation, and if you are using OpenMPI as your MPI + implementation, you can use the generic configuration module + `mcscript.config.ompi`: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - % ln -s config/config-ompi.py config.py + % export MCSCRIPT_CONFIG="mcscript.config.ompi" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Example local configuration files for Univa Grid Engine at the Notre - Dame Center for Research Computing and SLURM at NERSC are included - in the mcscript/config directory. 
+ Local configuration modules for several clusters are provided as part of the + `mcscript` distribution: - >#### @NDCRC: #### - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - % ln -s config/config-uge-ndcrc.py config.py - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - >#### @NERSC: #### - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - % ln -s config/config-slurm-nersc.py config.py - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + - `mcscript.config.uge_ndcrc` - Univa Grid Engine at CRC (Notre Dame) + - `mcscript.config.slurm_nersc` - Slurm at NERSC (LBNL) + - `mcscript.config.torque_oak` - Torque at UBC ARC (TRIUMF) Otherwise, whenever you move to a new cluster, you will have to - write such a file, to take into account the pecularities of the + write such a module, to take into account the peculiarities of the batch submission software and queue structure of that cluster. - You can use the above example files as models to define your own configuration - file appropriate to your own cluster and your own running needs. + You can use the above example modules (distributed in `mcscript/config`) as + models to define your own configuration module(s) appropriate to your own + cluster(s) and your own running needs. In such a case, you can set + `MCSCRIPT_CONFIG` to the full path of the configuration module file: + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + % export MCSCRIPT_CONFIG="/home/alice/code/gadget_acme.py" + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # 3. Environment variables - You will also need to add the `mcscript\tools` directory to your command path. 
- Furthermore, the mcscript job submission utility "qsubm" expects certain - environment variables to be defined at submission time: + The `mcscript` package expects certain environment variables to be defined + at both submission and run time: - > MCSCRIPT_DIR specifies the directory in which the mcscript package is - > installed, i.e., the directory where the file qsubm.py is found. (Note - > that qsubm uses this information to locate certain auxiliary script files - > used as part of the job submission process.) + > `MCSCRIPT_CONFIG` (described above) specifies the cluster configuration + > module. - > MCSCRIPT_INSTALL_HOME specifies the directory in which executables are + > `MCSCRIPT_INSTALL_HOME` specifies the directory in which executables are > found. - > MCSCRIPT_RUN_HOME specifies the directory in which job files are found. + > `MCSCRIPT_RUN_HOME` specifies the directory in which job files are found. - > MCSCRIPT_WORK_HOME specifies the parent directory in which run scratch + > `MCSCRIPT_WORK_HOME` specifies the parent directory in which run scratch > directories should be made. This will normally be on a fast scratch > filesystem. 
@@ -126,11 +122,10 @@ Department of Physics, University of Notre Dame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # mcscript - setenv MCSCRIPT_DIR ${HOME}/code/mcscript + setenv MCSCRIPT_CONFIG mcscript.config.ompi setenv MCSCRIPT_INSTALL_HOME ${HOME}/code/install setenv MCSCRIPT_RUN_HOME ${HOME}/runs setenv MCSCRIPT_WORK_HOME ${SCRATCH}/runs - setenv PATH ${MCSCRIPT_DIR}/tools:${PATH} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Alternatively, if you are a bash user, you would add something like the @@ -138,11 +133,10 @@ Department of Physics, University of Notre Dame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # mcscript - export MCSCRIPT_DIR=${HOME}/code/mcscript + export MCSCRIPT_CONFIG=mcscript.config.ompi export MCSCRIPT_INSTALL_HOME=${HOME}/code/install export MCSCRIPT_RUN_HOME=${HOME}/runs export MCSCRIPT_WORK_HOME=${SCRATCH}/runs - export PATH=${MCSCRIPT_DIR}/tools:${PATH} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You may also need to set environment variables expected by the scripting for @@ -152,7 +146,7 @@ Department of Physics, University of Notre Dame To tell mcscript about this file, make sure you set MCSCRIPT_SOURCE at the time you submit the job, i.e., before calling qsubm: - > MCSCRIPT_SOURCE (optional) should give the full qualified + > `MCSCRIPT_SOURCE` (optional) should give the full qualified > filename (i.e., including path) to any shell code which should > be "sourced" at the beginning of the batch job. This should be > sh/bash-compatible code. diff --git a/mcscript/config/ompi.py b/mcscript/config/ompi.py index e4ea4d5..bbd81b1 100644 --- a/mcscript/config/ompi.py +++ b/mcscript/config/ompi.py @@ -30,7 +30,7 @@ ################################################################ -def submission(job_name,job_file,qsubm_path,environment_definitions,args): +def submission(job_name,job_file,environment_definitions,args): """Prepare submission command invocation. 
Arguments: @@ -39,8 +39,6 @@ def submission(job_name,job_file,qsubm_path,environment_definitions,args): job_file (str): job script file - qsubm_path (str): path to qsubm files (for locating wrapper script) - environment_definitions (list of str): list of environment variable definitions to include in queue submission arguments diff --git a/mcscript/config/slurm_nersc.py b/mcscript/config/slurm_nersc.py index f2d9495..9d3e992 100644 --- a/mcscript/config/slurm_nersc.py +++ b/mcscript/config/slurm_nersc.py @@ -75,7 +75,7 @@ import subprocess import shutil import re -from tabnanny import verbose +import pkg_resources from .. import ( control, @@ -256,8 +256,8 @@ def qsubm_arguments(parser): help="deadline for job execution (e.g., \"2022-01-19T00:06:59\"); default " "set by MCSCRIPT_DEADLINE" ) - -def submission(job_name,job_file,qsubm_path,environment_definitions,args): + +def submission(job_name,job_file,environment_definitions,args): """Prepare submission command invocation. Arguments: @@ -266,8 +266,6 @@ def submission(job_name,job_file,qsubm_path,environment_definitions,args): job_file (str): job script file - qsubm_path (str): path to qsubm files (for locating wrapper script) - environment_definitions (list of str): list of environment variable definitions to include in queue submission arguments @@ -444,14 +442,18 @@ def submission(job_name,job_file,qsubm_path,environment_definitions,args): # calls interpreter explicitly, so do not have to rely upon default python # version or shebang line in script if "csh" in os.environ.get("SHELL", ""): - job_wrapper = os.path.join(qsubm_path, "csh_job_wrapper.csh") + job_wrapper = pkg_resources.resource_filename( + "mcscript", "job_wrappers/csh_job_wrapper.csh" + ) elif "bash" in os.environ.get("SHELL", ""): - job_wrapper = os.path.join(qsubm_path, "bash_job_wrapper.sh") + job_wrapper = pkg_resources.resource_filename( + "mcscript", "job_wrappers/bash_job_wrapper.sh" + ) else: - job_wrapper = "" - submission_invocation += [ 
- job_wrapper, - ] + job_wrapper = None + + if job_wrapper: + submission_invocation += [job_wrapper] # use GNU parallel to launch multiple workers per job if args.workers > 1: diff --git a/mcscript/config/torque_oak.py b/mcscript/config/torque_oak.py index ea2b573..faa04ab 100644 --- a/mcscript/config/torque_oak.py +++ b/mcscript/config/torque_oak.py @@ -18,6 +18,7 @@ import math import os +import pkg_resources from .. import parameters @@ -33,7 +34,7 @@ "oak": ("oak", 32, 16, 16) } -def submission(job_name, job_file, qsubm_path, environment_definitions, args): +def submission(job_name, job_file, environment_definitions, args): """Prepare submission command invocation. Arguments: @@ -42,8 +43,6 @@ def submission(job_name, job_file, qsubm_path, environment_definitions, args): job_file (str): job script file - qsubm_path (str): path to qsubm files (for locating wrapper script) - environment_definitions (list of str): list of environment variable definitions to include in queue submission arguments @@ -149,15 +148,19 @@ def submission(job_name, job_file, qsubm_path, environment_definitions, args): # # calls interpreter explicitly, so do not have to rely upon default python # version or shebang line in script - if "csh" in os.environ.get("SHELL"): - job_wrapper = os.path.join(qsubm_path, "csh_job_wrapper.csh") - elif "bash" in os.environ.get("SHELL"): - job_wrapper = os.path.join(qsubm_path, "bash_job_wrapper.sh") - submission_invocation += [ - "-F", # specifies command line arguments to (wrapper) script - "{} {}".format(os.environ["MCSCRIPT_PYTHON"],job_file), # all arguments to (wrapper) script as single string (with spaces between arguments) - job_wrapper # the (wrapper) script itself - ] + if "csh" in os.environ.get("SHELL", ""): + job_wrapper = pkg_resources.resource_filename( + "mcscript", "job_wrappers/csh_job_wrapper.csh" + ) + elif "bash" in os.environ.get("SHELL", ""): + job_wrapper = pkg_resources.resource_filename( + "mcscript", 
"job_wrappers/bash_job_wrapper.sh" + ) + else: + job_wrapper = None + + if job_wrapper: + submission_invocation += [job_wrapper] # standard input for submission submission_string = "" diff --git a/mcscript/config/uge_ndcrc.py b/mcscript/config/uge_ndcrc.py index 83365bf..92d40ac 100644 --- a/mcscript/config/uge_ndcrc.py +++ b/mcscript/config/uge_ndcrc.py @@ -69,6 +69,7 @@ import math import os +import pkg_resources from .. import parameters @@ -91,7 +92,7 @@ ################################################################ ################################################################ -def submission(job_name, job_file, qsubm_path, environment_definitions, args): +def submission(job_name, job_file, environment_definitions, args): """Prepare submission command invocation. Arguments: @@ -100,8 +101,6 @@ def submission(job_name, job_file, qsubm_path, environment_definitions, args): job_file (str): job script file - qsubm_path (str): path to qsubm files (for locating wrapper script) - environment_definitions (list of str): list of environment variable definitions to include in queue submission arguments @@ -190,12 +189,21 @@ def submission(job_name, job_file, qsubm_path, environment_definitions, args): # # calls interpreter explicitly, so do not have to rely upon default python # version or shebang line in script - if "csh" in os.environ.get("SHELL"): - job_wrapper = os.path.join(qsubm_path, "csh_job_wrapper.csh") - elif "bash" in os.environ.get("SHELL"): - job_wrapper = os.path.join(qsubm_path, "bash_job_wrapper.sh") + if "csh" in os.environ.get("SHELL", ""): + job_wrapper = pkg_resources.resource_filename( + "mcscript", "job_wrappers/csh_job_wrapper.csh" + ) + elif "bash" in os.environ.get("SHELL", ""): + job_wrapper = pkg_resources.resource_filename( + "mcscript", "job_wrappers/bash_job_wrapper.sh" + ) + else: + job_wrapper = None + + if job_wrapper: + submission_invocation += [job_wrapper] + submission_invocation += [ - job_wrapper, os.environ["MCSCRIPT_PYTHON"], 
job_file ] diff --git a/bash_job_wrapper.sh b/mcscript/job_wrappers/bash_job_wrapper.sh similarity index 100% rename from bash_job_wrapper.sh rename to mcscript/job_wrappers/bash_job_wrapper.sh diff --git a/csh_job_wrapper.csh b/mcscript/job_wrappers/csh_job_wrapper.csh similarity index 100% rename from csh_job_wrapper.csh rename to mcscript/job_wrappers/csh_job_wrapper.csh diff --git a/mcscript/qsubm.py b/mcscript/qsubm.py new file mode 100644 index 0000000..67f0594 --- /dev/null +++ b/mcscript/qsubm.py @@ -0,0 +1,452 @@ +"""qsubm -- generic queue submission for task-oriented batch scripts + + See INSTALL.md for configuration information: + + - The MCSCRIPT_CONFIG environment variable must specify the cluster configuration module. + + - Several environment variables must be defined. In addition to the + mandatory environment variables defined there, the following (deprecated?) + variables are recognized (but not frequently useful): + + > MCSCRIPT_LAUNCH_HOME (optional) specifies the parent directory in which + > run subdirectories for qsub invocation and output logging should be made. + > Otherwise, this will default to MCSCRIPT_WORK_HOME. + + > MCSCRIPT_PYTHON (optional) specifies the command name to use to invoke + > Python 3 to execute run script files. The default is simply "python3", + > assuming the Python 3 executable is in the shell's command search + > PATH. However, you can instead specify, e.g., a full, qualified filename + > (i.e., including path). See note on "Availability of Python" in INSTALL.md. + + Language: Python 3 + + M. A. Caprio + University of Notre Dame + + + 3/6/13 (mac): Based on earlier qsubm csh script. + + 7/4/13 (mac): Support for multiple cluster flavors via qsubm_local. + + 1/22/14 (mac): Python 3 update. + + 10/27/14 (mac): Updates to --archive handling. + + 5/14/15 (mac): + - Insert "future" statements for Python 2 legacy support. + - Add --noredirect switch. + - Mandatory environment variable QSUBM_PYTHON. 
+ + 8/4/15 (mac): Make user environment variable definitions into option. + + 6/13/16 (mac): Rename environment variables to MCSCRIPT_*. + + 6/22/16 (mac): Update to use config.py for local configuration. + + 12/14/16 (mac): Add --here option. + + 12/29/16 (mac): + - Add --spread option. + - Remove --pernode option. + - Make --opt option repeatable. + + 1/16/17 (mac): Add --serialthreads option. + + 2/23/17 (mac): Switch from os.mkdir to mcscript.utils.mkdir. + + 3/16/17 (mac): + - Add --setup option. + - Change environment interface to pass MCSCRIPT_TASK_MODE. + + 3/18/17 (mac): + - Revise to support updated hybrid run parameters. + - Rename option --setup to --prerun. + + 5/22/17 (mac): Fix processing of boolean option --redirect. + + 10/11/17 (pjf): Add --switchwaittime option. + + 01/05/18 (pjf): Sort arguments into groups. + + 02/11/18 (pjf): + - Pass through MCSCRIPT_INSTALL_HOME. + - Use job_environ for submission. + + 07/06/18 (pjf): + - Pass queue via MCSCRIPT_RUN_QUEUE. + - Remove MCSCRIPT_HYBRID_NODESIZE. + + 06/04/19 (pjf): + - Add hook for individual configurations to add command-line arguments. + - Move --switchwaittime option into config-slurm-nersc.py. + + 09/11/19 (pjf): Add expert mode argument. + + 11/18/19 (pjf): Fix job file existence check. + + 06/26/20 (mac): Make MCSCRIPT_PYTHON and MCSCRIPT_RUN_PREFIX optional. + + 10/11/20 (pjf): + - Rename `--num` to `--jobs`. + - Add `--workers` to allow multiple workers per job. + + 02/01/22 (pjf): Allow MCSCRIPT_RUN_HOME to be a colon-delimited list. + + 02/08/22 (pjf): + - Fix script extension selection. + - Switch from subprocess.Popen to subprocess.run. + + 07/02/22 (pjf): + - Force run_prefix="run". + - Warn if MCSCRIPT_RUN_PREFIX still defined. + + 07/14/22 (pjf): + - Add `--edit` mode. + - Update xterm title when running directly. + + 09/20/22 (pjf): Use os.exec instead of subprocess for local run_mode. 
+""" + +import argparse +import os +import shutil +import subprocess +import sys + +import mcscript.config +import mcscript.utils +import types + +def get_user_config(): + """Get user configuration from environment.""" + user_config = types.SimpleNamespace() + + user_config.install_home = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_INSTALL_HOME", "")) + user_config.run_home_list = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_RUN_HOME", "").split(":")) + user_config.work_home = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_WORK_HOME", "")) + + # optional fields + user_config.launch_dir = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_LAUNCH_DIR", "")) + user_config.python_executable = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_PYTHON", "")) + user_config.env_script = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_SOURCE", "")) + user_config.run_prefix = "run" + + return user_config + + +def parse_args(): + """Parse arguments. + + Returns: + (argparse.Namespace) parsed arguments + """ + parser = argparse.ArgumentParser( + description="Queue submission for numbered run.", + usage="%(prog)s [option] run queue|RUN wall [var1=val1, ...]\n", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + epilog= + """Simply omit the queue name and leave off the wall time for a + local interactive run. + + Environment variables for qsubm are described in INSTALL.md. + + Note that qsubm relies upon code in the local `config.py` + configuration file for the system or cluster you are running on, in + order to interpret the following arguments and translate them into + arguments for your local batch system. Your local configuration + file might not make use of or support all the parallel environment + options listed below. 
+ """ + ) + + # general arguments + parser.add_argument("run", help="Run number (e.g., 0000 for run0000)") + + # latter arguments are made optional to simplify bare-bones syntax for --toc, etc., calls + parser.add_argument("queue", nargs='?', help="Submission queue, or RUN for direct interactive run", default="RUN") + parser.add_argument("wall", type=int, nargs='?', help="Wall time (minutes)", default=60) + ##parser.add_argument("vars", nargs="?", help="Environment variables to pass to script, with optional values, comma delimited (e.g., METHOD2, PARAM=1.0)") + parser.add_argument("--here", action="store_true", help="Force run in current working directory") + parser.add_argument("--vars", help="Environment variables to pass to script, with optional values, comma delimited (e.g., --vars=METHOD2, PARAM=1.0)") + ## parser.add_argument("--stat", action="store_true", help="Display queue status information") + parser.add_argument("--jobs", type=int, default=1, help="Number of (identical) jobs to submit") + parser.add_argument("--workers", type=int, default=1, help="Number of workers to launch per job (not supported by all queues)") + parser.add_argument("--opt", action="append", help="Additional option arguments to be passed to job submission command (e.g., --opt=\"-m ae\" or --opt=\"--mail-type=END,FAIL\"), may be repeated (e.g., --opt=\"-A acct\" --opt=\"-a 1200\"); beware the spaces may be important to the job submission command") + parser.add_argument("--expert", action="store_true", help="Run mcscript in expert mode") + + # serial run parallelization parameters + serial_group = parser.add_argument_group("serial run options (single-node, non-MPI)") + serial_group.add_argument("--serialthreads", type=int, default=1, help="OMP threads") + + # hybrid run parallelization parameters + # + # Not all local configuration files need necessarily require or + # respect all of the following parameters. 
+ hybrid_group = parser.add_argument_group("hybrid run options") + hybrid_group.add_argument("--nodes", type=int, default=1, help="number of nodes") + hybrid_group.add_argument("--ranks", type=int, default=1, help="number of MPI ranks") + hybrid_group.add_argument("--threads", type=int, default=1, help="OMP threads per rank)") + hybrid_group.add_argument("--nodesize", type=int, default=0, help="logical threads available per node" + " (might instead be interpreted as physical CPUs depending on local config file)") + ##hybrid_group.add_argument("--undersubscription", type=int, default=1, help="undersubscription factor (e.g., spread=2 requests twice the cores needed)") + + # multi-task interface: invocation modes + task_mode_group = parser.add_mutually_exclusive_group() + task_mode_group.add_argument("--edit", action="store_true", help="Edit run script using EDITOR") + task_mode_group.add_argument("--toc", action="store_true", help="Invoke run script to generate task table of contents") + task_mode_group.add_argument("--unlock", action="store_true", help="Delete any .lock or .fail flags for tasks") + task_mode_group.add_argument("--archive", action="store_true", help="Invoke archive-generation run") + task_mode_group.add_argument("--prerun", action="store_true", help="Invoke prerun mode, for argument validation and file staging only") + task_mode_group.add_argument("--offline", action="store_true", help="Invoke offline mode, to create batch scripts for later submission instead of running compute codes") + + # multi-task interface: task selection + task_selection_group = parser.add_argument_group("multi-task run options") + task_selection_group.add_argument("--pool", help="Set task pool (may be Unix-style filename pattern, or comma-delimited list, or ALL) for task selection") + task_selection_group.add_argument("--phase", type=int, default=0, help="Set task phase for task selection") + task_selection_group.add_argument("--start", type=int, default=0, help="Set starting 
task number for task selection") + task_selection_group.add_argument("--limit", type=int, help="Set task count limit for task selection") + task_selection_group.add_argument("--redirect", default="True", choices=["True", "False"], help="Allow redirection of standard" + " output/error to file (may want to disable for interactive debugging)") + + # site-local options + try: + mcscript.config.qsubm_arguments(parser) + except AttributeError: + # local config doesn't provide arguments, ignore gracefully + pass + + return parser.parse_args() + +def main(): + """Run qsubm command-line utility.""" + + args = parse_args() + user_config = get_user_config() + + ################################################################ + # environment processing + ################################################################ + + if args.here: + user_config.run_home_list = [os.environ["PWD"]] + user_config.work_home = os.environ["PWD"] + user_config.launch_home = os.environ["PWD"] + + if not user_config.run_home_list: + print("MCSCRIPT_RUN_HOME not found in environment") + exit(1) + + if not user_config.work_home: + print("MCSCRIPT_WORK_HOME not found in environment") + exit(1) + + if not user_config.launch_home: + user_config.launch_home = user_config.work_home + + if not user_config.python_executable: + python_executable = "python3" + + if not user_config.install_home: + print("MCSCRIPT_INSTALL_HOME not found in environment") + exit(1) + + ################################################################ + # argument processing + ################################################################ + + # set run name + run = user_config.run_prefix + args.run + print("Run:", run) + + # ...and process run file + script_extensions = [".py", ".csh"] + job_file = None + for extension in script_extensions: + for run_home in user_config.run_home_list: + filename = os.path.join(run_home, run+extension) + if os.path.exists(filename): + job_file = filename + job_extension = extension + break + 
print(" Run homes:", user_config.run_home_list) # useful to report now, in case job file missing + if (job_file is None): + print(f"No job file {run}.* found with an extension in the set {script_extensions}.") + exit(1) + print(" Job file:", job_file) + + # set queue and flag batch or local mode + # force local run for task.py toc mode + if ((args.queue == "RUN") or args.toc or args.unlock): + run_mode = "local" + run_queue = "local" + print(" Mode:", run_mode) + else: + run_mode = "batch" + run_queue = args.queue + print(" Mode:", run_mode, "(%s)" % args.queue) + + # set wall time + wall_time_min = args.wall + print(" Wall time (min): {:d}".format(wall_time_min)) + wall_time_sec = wall_time_min*60 + + # environment definitions: general run parameters + environment_definitions = [ + "MCSCRIPT_RUN={:s}".format(run), + "MCSCRIPT_JOB_FILE={:s}".format(job_file), + "MCSCRIPT_RUN_MODE={:s}".format(run_mode), + "MCSCRIPT_RUN_QUEUE={:s}".format(run_queue), + "MCSCRIPT_WALL_SEC={:d}".format(wall_time_sec), + "MCSCRIPT_WORKERS={:d}".format(args.workers), + ] + + # environment definitions: serial run parameters + environment_definitions += [ + "MCSCRIPT_SERIAL_THREADS={:d}".format(args.serialthreads) + ] + + # environment definitions: hybrid run parameters + environment_definitions += [ + "MCSCRIPT_HYBRID_NODES={:d}".format(args.nodes), + "MCSCRIPT_HYBRID_RANKS={:d}".format(args.ranks), + "MCSCRIPT_HYBRID_THREADS={:d}".format(args.threads), + ] + + + # set multi-task run parameters + if (args.edit): + editor = os.environ.get("EDITOR", "vi") + os.execlp(editor, editor, job_file) + elif (args.toc): + task_mode = mcscript.task.TaskMode.kTOC + elif (args.unlock): + task_mode = mcscript.task.TaskMode.kUnlock + elif (args.archive): + task_mode = mcscript.task.TaskMode.kArchive + elif (args.prerun): + task_mode = mcscript.task.TaskMode.kPrerun + elif (args.offline): + task_mode = mcscript.task.TaskMode.kOffline + else: + task_mode = mcscript.task.TaskMode.kRun + + # environment 
definitions: multi-task run parameters + environment_definitions += [ + f"MCSCRIPT_TASK_MODE={task_mode.value:d}", + f"MCSCRIPT_TASK_PHASE={args.phase:d}", + f"MCSCRIPT_TASK_START_INDEX={args.start:d}", + f"MCSCRIPT_TASK_REDIRECT={args.redirect:s}", + ] + # TODO (mac): neaten up so that these arguments are always provided + # (and simplify this code to a simple list += as above) + if (args.pool is not None): + environment_definitions.append(f"MCSCRIPT_TASK_POOL={args.pool:s}") + if (args.limit is not None): + environment_definitions.append(f"MCSCRIPT_TASK_COUNT_LIMIT={args.limit:d}") + + # environment definitions: pass through install directory + environment_definitions += [ + f"MCSCRIPT_INSTALL_HOME={user_config.install_home:s}" + ] + + # include additional environment setup if defined + if user_config.env_script: + environment_definitions += [ + f"MCSCRIPT_SOURCE={user_config.env_script:s}" + ] + + # set user-specified variable definitions + # Note conditional is required since "".split(", ") is [""] rather than []. 
+ if (args.vars is None): + user_environment_definitions = [] + else: + user_environment_definitions = args.vars.split(",") + print(" User environment definitions:", user_environment_definitions) + + environment_definitions += user_environment_definitions + + + ################################################################ + # directory setup + ################################################################ + + # set up scratch directory (for batch job work) + # name is defined here, but creation is left up to job script, + # in case scratch is local to the compute note + work_dir = os.path.join(user_config.work_home, run) + ## if ( not os.path.exists(work_dir)): + ## mcscript.utils.mkdir(work_dir) + environment_definitions.append(f"MCSCRIPT_WORK_DIR={work_dir}") + + # set up run launch directory (for batch job output logging) + launch_dir_parent = os.path.join(user_config.launch_home, run) + if not os.path.exists(user_config.launch_home): + mcscript.utils.mkdir(user_config.launch_home) + if not os.path.exists(launch_dir_parent): + mcscript.utils.mkdir(launch_dir_parent) + if args.archive: + # archive mode + # launch in archive directory rather than usual batch job output directory + # (important since if batch job server directs output to the + # regular output directory while tar is archiving that directory, + # tar will return with an error code, torpedoing the archive task) + launch_dir = os.path.join(user_config.launch_home, run, "archive") + else: + # standard run mode + launch_dir = os.path.join(user_config.launch_home, run, "batch") + if not os.path.exists(launch_dir): + mcscript.utils.mkdir(launch_dir) + environment_definitions.append(f"MCSCRIPT_LAUNCH_DIR={launch_dir}") + + + ################################################################ + # job environment setup + ################################################################ + + # construct job name + job_name = f"{run:s}" + ##job_name += "-w%d" % args.width + if args.pool is not None: + job_name 
+= "-{args.pool:s}" + job_name += "-{args.phase:s}" + print(" Job name:", job_name) + + # process environment definitions + # regularize environment definitions + # Convert all plain variable name definitions "VAR" into definition + # as null string "VAR=". Note that "VAR" would be an environment + # variable pass-through request to qsub, but it causes trouble with + # defining an environment for local execution. So doing this + # regularization simplifies further processing and ensures + # uniformity of the environment between batch and local runs. + for i in range(len(environment_definitions)): + if (not "=" in environment_definitions[i]): + environment_definitions[i] += "=" + print() + print("Vars:", ",".join(environment_definitions)) + # for local run + job_environ=os.environ + environment_keyvalues = [ + entry.split("=") + for entry in environment_definitions + ] + job_environ.update(dict(environment_keyvalues)) + + + ################################################################ + # run invocation + ################################################################ + + # flush script output before invoking job + print() + sys.stdout.flush() + + # handle batch run + if (run_mode == "batch"): + + # set local qsub arguments + (submission_args, submission_input_string, repetitions) = mcscript.config.submission(job_name, job_file, environment_definitions, args) + + # notes: options must come before command on some platforms (e.g., Univa) + print(" ".join(submission_args)) + print(submission_input_string) + print() + print("-"*64) + for i in range(repetitions): + subprocess.run( + submission_args, + input=submission_input_string, + stdout=sys.stdout, + stderr=subprocess.STDOUT, # to redirect via stdout + env=job_environ, + cwd=launch_dir + ) + + # handle interactive run + # Note: We call interpreter rather than trying to directly execute + # job file since this saves us from bothering with execute permissions. 
+ # But, beware the interpreter enforced by the script's shebang line might + # be different from the version of the interpreter found in the below invocation, + # especially in a "module" environment. + elif (run_mode == "local"): + if (job_extension == ".py"): + popen_args = [user_config.python_executable, job_file] + elif (job_extension == ".csh"): + popen_args = ["csh", job_file] + print() + print("-"*64) + if task_mode is mcscript.task.TaskMode.kRun: + print(f"\033]2;qsubm {run}\007") + os.chdir(launch_dir) + os.execvpe(popen_args[0], popen_args, env=job_environ) diff --git a/mcscript_init.sh b/mcscript_init.sh deleted file mode 120000 index 69523b5..0000000 --- a/mcscript_init.sh +++ /dev/null @@ -1 +0,0 @@ -deprecated/mcscript_init.sh \ No newline at end of file diff --git a/qsubm.py b/qsubm.py deleted file mode 100644 index bed228b..0000000 --- a/qsubm.py +++ /dev/null @@ -1,484 +0,0 @@ -"""qsubm -- generic queue submission for task-oriented batch scripts - - See INSTALL.md for configuration information: - - - A local definitions file config.py must be defined. - - - Several environment variables must be defined. In addition to the - mandatory environment variables defined there, the following (deprecated?) - variables are recognized (but not frequently useful): - - > MCSCRIPT_LAUNCH_HOME (optional) specifies the parent directory in which - > run subdirectories for qsub invocation and output logging should be made. - > Otherwise, this will default to MCSCRIPT_WORK_HOME. - - > MCSCRIPT_PYTHON (optional) specifies the command name to use to invoke - > Python 3 to execut run script files. The default is simply "python3", - > assuming the Python 3 executable is in the shell's command search - > PATH. However, you can instead specify, e.g., a full, qualified filename - > (i.e., including path). See note on "Availability of Python" in INSTALL.md. - - Language: Python 3 - - M. A. 
Caprio - University of Notre Dame - - + 3/6/13 (mac): Based on earlier qsubm csh script. - + 7/4/13 (mac): Support for multiple cluster flavors via qsubm_local. - + 1/22/14 (mac): Python 3 update. - + 10/27/14 (mac): Updates to --archive handling. - + 5/14/15 (mac): - - Insert "future" statements for Python 2 legacy support. - - Add --noredirect switch. - - Mandatory environment variable QSUBM_PYTHON. - + 8/4/15 (mac): Make user environment variable definitions into option. - + 6/13/16 (mac): Rename environment variables to MCSCRIPT_*. - + 6/22/16 (mac): Update to use config.py for local configuration. - + 12/14/16 (mac): Add --here option. - + 12/29/16 (mac): - - Add --spread option. - - Remove --pernode option. - - Make --opt option repeatable. - + 1/16/17 (mac): Add --serialthreads option. - + 2/23/17 (mac): Switch from os.mkdir to mcscript.utils.mkdir. - + 3/16/17 (mac): - - Add --setup option. - - Change environment interface to pass MCSCRIPT_TASK_MODE. - + 3/18/17 (mac): - - Revise to support updated hybrid run parameters. - - Rename option --setup to --prerun. - + 5/22/17 (mac): Fix processing of boolean option --redirect. - + 10/11/17 (pjf): Add --switchwaittime option. - + 01/05/18 (pjf): Sort arguments into groups. - + 02/11/18 (pjf): - - Pass through MCSCRIPT_INSTALL_HOME. - - Use job_environ for submission. - + 07/06/18 (pjf): - - Pass queue via MCSCRIPT_RUN_QUEUE. - - Remove MCSCRIPT_HYBRID_NODESIZE. - + 06/04/19 (pjf): - - Add hook for individual configurations to add command-line arguments. - - Move --switchwaittime option into config-slurm-nersc.py. - + 09/11/19 (pjf): Add expert mode argument. - + 11/18/19 (pjf): Fix job file existence check. - + 06/26/20 (mac): Make MCSCRIPT_PYTHON and MCSCRIPT_RUN_PREFIX optional. - + 10/11/20 (pjf): - - Rename `--num` to `--jobs`. - - Add `--workers` to allow multiple workers per job. - + 02/01/22 (pjf): Allow MCSCRIPT_RUN_HOME to be a colon-delimited list. - + 02/08/22 (pjf): - - Fix script extension selection. 
- - Switch from subprocess.Popen to subprocess.run. - + 07/02/22 (pjf): - - Force run_prefix="run". - - Warn if MCSCRIPT_RUN_PREFIX still defined. - + 07/14/22 (pjf): - - Add `--edit` mode. - - Update xterm title when running directly. - + 09/20/22 (pjf): Use os.exec instead of subprocess for local run_mode. -""" - -import argparse -import os -import shutil -import subprocess -import sys - -import mcscript.config # local configuration (usually symlink) -import mcscript.utils - -################################################################ -# argument parsing -################################################################ - -parser = argparse.ArgumentParser( - description="Queue submission for numbered run.", - usage= - "%(prog)s [option] run queue|RUN wall [var1=val1, ...]\n", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - epilog= - """Simply omit the queue name and leave off the wall time for a - local interactive run. - - Environment variables for qsubm are described in INSTALL.md. - - Note that qsubm relies upon code in the local `config.py` - configuration file for the system or cluster you are running on, in - order to interpret the following arguments and translate them into - arguments for your local batch system. Your local configuration - file might not make use of or support all the parallel environment - options listed below. 
- """ - ) - -# general arguments -parser.add_argument("run", help="Run number (e.g., 0000 for run0000)") -# latter arguments are made optional to simplify bare-bones syntax for --toc, etc., calls -parser.add_argument("queue", nargs='?', help="Submission queue, or RUN for direct interactive run", default="RUN") -parser.add_argument("wall", type=int, nargs='?', help="Wall time (minutes)", default=60) -##parser.add_argument("vars", nargs="?", help="Environment variables to pass to script, with optional values, comma delimited (e.g., METHOD2, PARAM=1.0)") -parser.add_argument("--here", action="store_true", help="Force run in current working directory") -parser.add_argument("--vars", help="Environment variables to pass to script, with optional values, comma delimited (e.g., --vars=METHOD2, PARAM=1.0)") -## parser.add_argument("--stat", action="store_true", help="Display queue status information") -parser.add_argument("--jobs", type=int, default=1, help="Number of (identical) jobs to submit") -parser.add_argument("--workers", type=int, default=1, help="Number of workers to launch per job (not supported by all queues)") -parser.add_argument("--opt", action="append", help="Additional option arguments to be passed to job submission command (e.g., --opt=\"-m ae\" or --opt=\"--mail-type=END,FAIL\"), may be repeated (e.g., --opt=\"-A acct\" --opt=\"-a 1200\"); beware the spaces may be important to the job submission command") -parser.add_argument("--expert", action="store_true", help="Run mcscript in expert mode") - -# serial run parallelization parameters -serial_group = parser.add_argument_group("serial run options (single-node, non-MPI)") -serial_group.add_argument("--serialthreads", type=int, default=1, help="OMP threads") - -# hybrid run parallelization parameters -# -# Not all local configuration files need necessarily require or -# respect all of the following parameters. 
-hybrid_group = parser.add_argument_group("hybrid run options") -hybrid_group.add_argument("--nodes", type=int, default=1, help="number of nodes") -hybrid_group.add_argument("--ranks", type=int, default=1, help="number of MPI ranks") -hybrid_group.add_argument("--threads", type=int, default=1, help="OMP threads per rank)") -hybrid_group.add_argument("--nodesize", type=int, default=0, help="logical threads available per node" - " (might instead be interpreted as physical CPUs depending on local config file)") -##hybrid_group.add_argument("--undersubscription", type=int, default=1, help="undersubscription factor (e.g., spread=2 requests twice the cores needed)") - -# multi-task interface: invocation modes -task_mode_group = parser.add_mutually_exclusive_group() -task_mode_group.add_argument("--edit", action="store_true", help="Edit run script using EDITOR") -task_mode_group.add_argument("--toc", action="store_true", help="Invoke run script to generate task table of contents") -task_mode_group.add_argument("--unlock", action="store_true", help="Delete any .lock or .fail flags for tasks") -task_mode_group.add_argument("--archive", action="store_true", help="Invoke archive-generation run") -task_mode_group.add_argument("--prerun", action="store_true", help="Invoke prerun mode, for argument validation and file staging only") -task_mode_group.add_argument("--offline", action="store_true", help="Invoke offline mode, to create batch scripts for later submission instead of running compute codes") - -# multi-task interface: task selection -task_selection_group = parser.add_argument_group("multi-task run options") -task_selection_group.add_argument("--pool", help="Set task pool (may be Unix-style filename pattern, or comma-delimited list, or ALL) for task selection") -task_selection_group.add_argument("--phase", type=int, default=0, help="Set task phase for task selection") -task_selection_group.add_argument("--start", type=int, help="Set starting task number for task 
selection") -task_selection_group.add_argument("--limit", type=int, help="Set task count limit for task selection") -task_selection_group.add_argument("--redirect", default="True", choices=["True", "False"], help="Allow redirection of standard" - " output/error to file (may want to disable for interactive debugging)") - -# some special options (deprecated?) -##parser.add_argument("--epar", type=int, default=None, help="Width for embarassingly parallel job") -##parser.add_argument("--nopar", action="store_true", help="Disable parallel resource requests (for use on special serial queues)") - -# site-local options -try: - mcscript.config.qsubm_arguments(parser) -except AttributeError: - # local config doesn't provide arguments, ignore gracefully - pass - -##parser.print_help() -##print -args = parser.parse_args() -##printargs - -################################################################ -# special mode: status display -################################################################ - -# TODO -# will have to modify argument processing to allow no arguments, local -# customization for qstat - -# @ i = 0 -# while (($i == 0) || ($loop)) -# @ i++ -# clear -# echo "****************************************************************" -# qstat -u $user -# if ($loop) sleep 5 -# end - -## if (args.stat): -## pass - -################################################################ -# environment processing -################################################################ - -if (args.here): - run_home_list = [os.environ["PWD"]] -elif ("MCSCRIPT_RUN_HOME" in os.environ): - run_home_list = os.environ["MCSCRIPT_RUN_HOME"].split(":") -else: - print("MCSCRIPT_RUN_HOME not found in environment") - exit(1) - -if (args.here): - work_home = os.environ["PWD"] -elif ("MCSCRIPT_WORK_HOME" in os.environ): - work_home = os.environ["MCSCRIPT_WORK_HOME"] -else: - print("MCSCRIPT_WORK_HOME not found in environment") - exit(1) - -if (args.here): - launch_home = os.environ["PWD"] -elif 
("MCSCRIPT_LAUNCH_HOME" in os.environ): - launch_home = os.environ["MCSCRIPT_LAUNCH_HOME"] -else: - launch_home = work_home - -if ("MCSCRIPT_RUN_PREFIX" in os.environ): - # run_prefix = os.environ["MCSCRIPT_RUN_PREFIX"] - print("****************************************************************") - print("MCSCRIPT_RUN_PREFIX is now ignored.") - print("Runs MUST use the prefix 'run`.") - print("****************************************************************") -run_prefix = "run" - -if ("MCSCRIPT_PYTHON" in os.environ): - python_executable = os.environ["MCSCRIPT_PYTHON"] -else: - python_executable = "python3" - -if ("MCSCRIPT_DIR" in os.environ): - qsubm_path = os.environ["MCSCRIPT_DIR"] -else: - print("MCSCRIPT_DIR not found in environment") - exit(1) - -################################################################ -# argument processing -################################################################ - -# set run name -run = run_prefix + args.run -print("Run:", run) - -# ...and process run file -script_extensions = [".py", ".csh"] -job_file = None -for extension in script_extensions: - for run_home in run_home_list: - filename = os.path.join(run_home, run+extension) - if os.path.exists(filename): - job_file = filename - job_extension = extension - break -print(" Run homes:", run_home_list) # useful to report now, in case job file missing -if (job_file is None): - print("No job file %s.* found with an extension in the set %s." 
% (run, script_extensions)) - exit(1) -print(" Job file:", job_file) - -# set queue and flag batch or local mode -# force local run for task.py toc mode -if ((args.queue == "RUN") or args.toc or args.unlock): - run_mode = "local" - run_queue = "local" - print(" Mode:", run_mode) -else: - run_mode = "batch" - run_queue = args.queue - print(" Mode:", run_mode, "(%s)" % args.queue) - -# set wall time -wall_time_min = args.wall -print(" Wall time (min): {:d}".format(wall_time_min)) -wall_time_sec = wall_time_min*60 - -# environment definitions: general run parameters -environment_definitions = [ - "MCSCRIPT_RUN={:s}".format(run), - "MCSCRIPT_JOB_FILE={:s}".format(job_file), - "MCSCRIPT_RUN_MODE={:s}".format(run_mode), - "MCSCRIPT_RUN_QUEUE={:s}".format(run_queue), - "MCSCRIPT_WALL_SEC={:d}".format(wall_time_sec), - "MCSCRIPT_WORKERS={:d}".format(args.workers), -] - -# environment definitions: serial run parameters -environment_definitions += [ - "MCSCRIPT_SERIAL_THREADS={:d}".format(args.serialthreads) -] - -# environment definitions: hybrid run parameters -environment_definitions += [ - "MCSCRIPT_HYBRID_NODES={:d}".format(args.nodes), - "MCSCRIPT_HYBRID_RANKS={:d}".format(args.ranks), - "MCSCRIPT_HYBRID_THREADS={:d}".format(args.threads), -] - - -# set multi-task run parameters -if (args.edit): - editor = os.environ.get("EDITOR", "vi") - os.execlp(editor, editor, job_file) -elif (args.toc): - task_mode = mcscript.task.TaskMode.kTOC -elif (args.unlock): - task_mode = mcscript.task.TaskMode.kUnlock -elif (args.archive): - task_mode = mcscript.task.TaskMode.kArchive -elif (args.prerun): - task_mode = mcscript.task.TaskMode.kPrerun -elif (args.offline): - task_mode = mcscript.task.TaskMode.kOffline -else: - task_mode = mcscript.task.TaskMode.kRun - -# TODO (mac): neaten up so that these arguments are always provided -# (and simplify this code to a simple list += as above) -environment_definitions.append("MCSCRIPT_TASK_MODE={:d}".format(task_mode.value)) -if (args.pool is 
not None): - environment_definitions.append("MCSCRIPT_TASK_POOL={:s}".format(args.pool)) -if (args.phase is not None): - environment_definitions.append("MCSCRIPT_TASK_PHASE={:d}".format(args.phase)) -if (args.start is not None): - environment_definitions.append("MCSCRIPT_TASK_START_INDEX={:d}".format(args.start)) -if (args.limit is not None): - environment_definitions.append("MCSCRIPT_TASK_COUNT_LIMIT={:d}".format(args.limit)) -environment_definitions.append("MCSCRIPT_TASK_REDIRECT={:s}".format(args.redirect)) - -# pass through install directory -if os.environ.get("MCSCRIPT_INSTALL_HOME"): - environment_definitions += [ - "MCSCRIPT_INSTALL_HOME={:s}".format(os.environ["MCSCRIPT_INSTALL_HOME"]) - ] -elif os.environ.get("MCSCRIPT_INSTALL_DIR"): - # TODO remove deprecated environment variable - print("****************************************************************") - print("MCSCRIPT_INSTALL_DIR is now MCSCRIPT_INSTALL_HOME.") - print("Please update your environment variables.") - print("****************************************************************") - environment_definitions += [ - "MCSCRIPT_INSTALL_HOME={:s}".format(os.environ["MCSCRIPT_INSTALL_DIR"]) - ] -else: - print("MCSCRIPT_INSTALL_HOME not found in environment") - exit(1) - -# include additional environment setup if defined -if os.environ.get("MCSCRIPT_SOURCE"): - environment_definitions += [ - "MCSCRIPT_SOURCE={:s}".format(os.environ["MCSCRIPT_SOURCE"]) - ] - -# set user-specified variable definitions -# Note conditional is required since "".split(", ") is [""] rather than []. 
-if (args.vars is None): - user_environment_definitions = [] -else: - user_environment_definitions = args.vars.split(",") - print(" User environment definitions:", user_environment_definitions) - -environment_definitions += user_environment_definitions - - -################################################################ -# directory setup -################################################################ - -# set up scratch directory (for batch job work) -# name is defined here, but creation is left up to job script, -# in case scratch is local to the compute note -work_dir = os.path.join(work_home, run) -## if ( not os.path.exists(work_dir)): -## mcscript.utils.mkdir(work_dir) -environment_definitions.append("MCSCRIPT_WORK_DIR=%s" % work_dir) - -# set up run launch directory (for batch job output logging) -launch_dir_parent = os.path.join(launch_home, run) -if ( not os.path.exists(launch_home)): - mcscript.utils.mkdir(launch_home) -if ( not os.path.exists(launch_dir_parent)): - mcscript.utils.mkdir(launch_dir_parent) -if (args.archive): - # archive mode - # launch in archive directory rather than usual batch job output directory - # (important since if batch job server directs output to the - # regular output directory while tar is archiving that directory, - # tar will return with an error code, torpedoing the archive task) - launch_dir = os.path.join(launch_home, run, "archive") -else: - # standard run mode - launch_dir = os.path.join(launch_home, run, "batch") -if ( not os.path.exists(launch_dir)): - mcscript.utils.mkdir(launch_dir) -environment_definitions.append("MCSCRIPT_LAUNCH_DIR=%s" % launch_dir) - - -################################################################ -# job environment setup -################################################################ - -# construct job name -job_name = "%s" % run -##job_name += "-w%d" % args.width -if (args.pool is not None): - job_name += "-%s" % args.pool -job_name += "-%s" % args.phase -print(" Job name:", 
job_name) - -# process environment definitions -# regularize environment definitions -# Convert all plain variable name definitions "VAR" into definition -# as null string "VAR=". Note that "VAR" would be an environment -# variable pass-through request to qsub, but it causes trouble with -# defining an environment for local execution. So doing this -# regularization simplifies further processing and ensures -# uniformity of the environment between batch and local runs. -for i in range(len(environment_definitions)): - if (not "=" in environment_definitions[i]): - environment_definitions[i] += "=" -print() -print("Vars:", ",".join(environment_definitions)) -# for local run -job_environ=os.environ -environment_keyvalues = [ - entry.split("=") - for entry in environment_definitions - ] -job_environ.update(dict(environment_keyvalues)) - - -################################################################ -# run invocation -################################################################ - -# flush script output before invoking job -print() -sys.stdout.flush() - -# handle batch run -if (run_mode == "batch"): - - # set local qsub arguments - (submission_args, submission_input_string, repetitions) = mcscript.config.submission(job_name, job_file, qsubm_path, environment_definitions, args) - - # notes: options must come before command on some platforms (e.g., Univa) - print(" ".join(submission_args)) - print(submission_input_string) - print() - print("-"*64) - for i in range(repetitions): - subprocess.run( - submission_args, - input=submission_input_string, - stdout=sys.stdout, - stderr=subprocess.STDOUT, # to redirect via stdout - env=job_environ, - cwd=launch_dir - ) - -# handle interactive run -# Note: We call interpreter rather than trying to directly execute -# job file since this saves us from bothering with execute permissions. 
-# But, beware the interpreter enforced by the script's shebang line might -# be different from the version of the interpreter found in the below invocation, -# especially in a "module" environment. -elif (run_mode == "local"): - if (job_extension == ".py"): - popen_args = [python_executable, job_file] - elif (job_extension == ".csh"): - popen_args = ["csh", job_file] - print() - print("-"*64) - if task_mode is mcscript.task.TaskMode.kRun: - print(f"\033]2;qsubm {run}\007") - os.chdir(launch_dir) - os.execvpe(popen_args[0], popen_args, env=job_environ) diff --git a/setup.py b/setup.py index 3c3a961..15c58f9 100644 --- a/setup.py +++ b/setup.py @@ -6,6 +6,16 @@ author="Mark A. Caprio, Patrick J. Fasano, University of Notre Dame", description=("Scripting setup, utilities, and task control for cluster runs"), license="MIT", - packages=find_packages(include='mcscript*'), + packages=find_packages(include=['mcscript*']), + entry_points={ + "console_scripts": [ + "qsubm = mcscript.qsubm:main", + ], + }, + package_data={ + "mcscript": [ + "job_wrappers/*", + ] + }, classifiers=[], ) diff --git a/tools/qsubm b/tools/qsubm deleted file mode 100755 index ee93616..0000000 --- a/tools/qsubm +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -if [ -z "$MCSCRIPT_PYTHON" ]; then export MCSCRIPT_PYTHON=python3; fi -${MCSCRIPT_PYTHON} ${MCSCRIPT_DIR}/qsubm.py ${*} From 3d79aa0541443ca3d6b8db67176723d41ce4bcc1 Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Thu, 13 Jul 2023 17:46:13 -0500 Subject: [PATCH 03/26] pep420: Convert mcscript to namespace package --- mcscript/__init__.py | 42 ------------------------------------------ setup.py | 4 ++-- 2 files changed, 2 insertions(+), 44 deletions(-) delete mode 100644 mcscript/__init__.py diff --git a/mcscript/__init__.py b/mcscript/__init__.py deleted file mode 100644 index 765b6b5..0000000 --- a/mcscript/__init__.py +++ /dev/null @@ -1,42 +0,0 @@ -""" mcscript package -- scripting setup, utilities, and task control for cluster runs - - 
Language: Python 3 - - M. A. Caprio - Department of Physics, University of Notre Dame - - + 06/05/13 (mac): Derived from earlier job.py, script.py, task.py complex (2/13). - + 01/22/14 (mac): Python 3 update. - + 05/13/15 (mac): Insert "future" statements for attempted Python 2 legacy support. - + 06/13/16 (mac): Restructure submodules and generate __init__.py loader file. - + 11/22/16 (mac): Continue restructuring submodules. - -""" - -# To import submodule functions as package.submodule.foo: -# import package.submodule -# from . import submodule # is this actually necessary? must review... -# -# To import submodule functions package.submodule.foo at top module -# level, as package.foo: -# -# import package.submodule -# from package.submodule import * - -# load parameters -from . import parameters - -# load local hooks -from . import config - -# load control functions -# imported into global namespace -from . import control -from .control import * - -# load utilities submodule -from . import utils -## from mcscript.utils import * - -# load task machinery submodule -from . import task diff --git a/setup.py b/setup.py index 15c58f9..6eb4e22 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup, find_packages +from setuptools import setup, find_namespace_packages setup( name="mcscript", @@ -6,7 +6,7 @@ author="Mark A. Caprio, Patrick J. Fasano, University of Notre Dame", description=("Scripting setup, utilities, and task control for cluster runs"), license="MIT", - packages=find_packages(include=['mcscript*']), + packages=find_namespace_packages(include=['mcscript*']), entry_points={ "console_scripts": [ "qsubm = mcscript.qsubm:main", From 9b4b1ea3823636410a9d1cc623981fd88cfb1fc3 Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Thu, 13 Jul 2023 19:45:15 -0500 Subject: [PATCH 04/26] mcscript: Fix imports for namespace module * Replace `mcscript` -> `mcscript.control` where applicable. 
--- example/runex01.py | 11 +++++++---- example/runex02.py | 13 +++++++++---- example/runex03.py | 11 ++++++----- example/runex04.py | 11 ++++++----- example/runex05.py | 8 +++++--- mcscript/config/ompi.py | 3 ++- mcscript/control.py | 10 +++++----- tools/mcscript_test.py | 6 ++++-- 8 files changed, 44 insertions(+), 29 deletions(-) diff --git a/example/runex01.py b/example/runex01.py index 9062951..71e6c43 100644 --- a/example/runex01.py +++ b/example/runex01.py @@ -14,8 +14,11 @@ """ import mcscript +import mcscript.control +import mcscript.parameters +import mcscript.utils -mcscript.init() +mcscript.control.init() ################################################################## # main body @@ -55,7 +58,7 @@ # example of running an executable # -# Note that mcscript.call is a wrapper to the subprocess +# Note that mcscript.control.call is a wrapper to the subprocess # package, but does a lot more... It generates logging output, it # checks the return code and generates an exception on failure # (i.e., a nonzero return), it can provide input lines to the code @@ -64,10 +67,10 @@ # # See the docstring for mcscript.utils.call for further information. 
-mcscript.call(["/bin/cat","hello.txt"]) +mcscript.control.call(["/bin/cat","hello.txt"]) ################################################################ # termination ################################################################ -mcscript.termination() +mcscript.control.termination() diff --git a/example/runex02.py b/example/runex02.py index 1dfe3e0..aadb19e 100644 --- a/example/runex02.py +++ b/example/runex02.py @@ -122,7 +122,12 @@ # -- make available some functions for use in script # (including mcscript utility functions and mcscript.task task management) import mcscript -mcscript.init() +import mcscript.control +import mcscript.task +import mcscript.parameters +import mcscript.utils +import mcscript.exception +mcscript.control.init() ################################################################## # build task list @@ -214,7 +219,7 @@ def say_hello(task): # save results file to common results directory print("Saving renamed output file...") results_filename = "{}-hello-{:s}.txt".format(mcscript.parameters.run.name,task["metadata"]["descriptor"]) - mcscript.call( + mcscript.control.call( [ "cp", "--verbose", @@ -255,7 +260,7 @@ def say_goodbye(task): # save results file to common results directory print("Saving renamed output file...") results_filename = "{}-goodbye-{:s}.txt".format(mcscript.parameters.run.name,task["metadata"]["descriptor"]) - mcscript.call( + mcscript.control.call( [ "cp", "--verbose", @@ -283,4 +288,4 @@ def say_goodbye(task): # termination ################################################################ -mcscript.termination() +mcscript.control.termination() diff --git a/example/runex03.py b/example/runex03.py index e0f06bc..e4b7d38 100644 --- a/example/runex03.py +++ b/example/runex03.py @@ -37,8 +37,9 @@ import os import mcscript +import mcscript.control -mcscript.init() +mcscript.control.init() ################################################################## # main body @@ -54,17 +55,17 @@ # via standard input (optional 
parameter input_lines), and various # other possibilities depending on the optional parameters. # -# See the docstring for mcscript.call for further information. +# See the docstring for mcscript.control.call for further information. # running an executable "unwrapped" -- no OpenMP/MPI setup -mcscript.call(["lscpu"]) +mcscript.control.call(["lscpu"]) # running a "hybrid" exectuable -- use both OpenMP and MPI executable_filename = os.path.join(os.environ["MCSCRIPT_DIR"],"example","hello-hybrid") -mcscript.call([executable_filename],mode=mcscript.CallMode.kHybrid) +mcscript.control.call([executable_filename],mode=mcscript.control.CallMode.kHybrid) ################################################################ # termination ################################################################ -mcscript.termination() +mcscript.control.termination() diff --git a/example/runex04.py b/example/runex04.py index 38aa77c..d704462 100644 --- a/example/runex04.py +++ b/example/runex04.py @@ -17,8 +17,9 @@ import os import mcscript +import mcscript.control -mcscript.init() +mcscript.control.init() print(64*"-") print("Python's environment (os.environ):") @@ -28,21 +29,21 @@ print(64*"-") print("Local invocation of env:") -mcscript.call(["env"],mode=mcscript.CallMode.kLocal) +mcscript.control.call(["env"],mode=mcscript.control.CallMode.kLocal) print() print(64*"-") print("Invocation of env as serial compute code:") -mcscript.call(["env"],mode=mcscript.CallMode.kSerial) +mcscript.control.call(["env"],mode=mcscript.control.CallMode.kSerial) print() print(64*"-") print("Invocation of env as hybrid compute code:") -mcscript.call(["env"],mode=mcscript.CallMode.kHybrid) +mcscript.control.call(["env"],mode=mcscript.control.CallMode.kHybrid) print() ################################################################ # termination ################################################################ -mcscript.termination() +mcscript.control.termination() diff --git a/example/runex05.py 
b/example/runex05.py index 0d35970..2305d32 100644 --- a/example/runex05.py +++ b/example/runex05.py @@ -13,14 +13,16 @@ """ import mcscript +import mcscript.control +import mcscript.parameters -mcscript.init() +mcscript.control.init() ################################################################## # main body ################################################################## -mcscript.call( +mcscript.control.call( ["cat"], input_lines=[ "", @@ -39,4 +41,4 @@ # termination ################################################################ -mcscript.termination() +mcscript.control.termination() diff --git a/mcscript/config/ompi.py b/mcscript/config/ompi.py index bbd81b1..c2738d1 100644 --- a/mcscript/config/ompi.py +++ b/mcscript/config/ompi.py @@ -19,6 +19,7 @@ import sys from .. import ( + exception, parameters, utils, ) @@ -57,7 +58,7 @@ def submission(job_name,job_file,environment_definitions,args): """ - raise(ScriptError("no batch submission")) + raise exception.ScriptError("no batch submission") ################################################################ diff --git a/mcscript/control.py b/mcscript/control.py index a29abeb..43ede30 100644 --- a/mcscript/control.py +++ b/mcscript/control.py @@ -12,7 +12,7 @@ + Reformat output from call, including adding wall time. + Deprecate aliases to call mode. - 6/28/17 (mac): - + Remove storage of stdout/sterr by POpen.communicate in mcscript.call. + + Remove storage of stdout/sterr by POpen.communicate in mcscript.control.call. + Remove deprecated aliases to call mode. - 06/07/19 (pjf): + Use new (Python 3.5+) subprocess interface subprocess.run. 
@@ -282,10 +282,10 @@ def call( Examples: - >>> mcscript.call(["cat"],input_lines=["a","b"]) # basic - >>> mcscript.call(["catx"],input_lines=["a","b"]) # for execution failure - >>> mcscript.call(["cat","badfile"],input_lines=["a","b"]) # for nonzero return - >>> mcscript.call(["cat","badfile"],input_lines=["a","b"],mode=mcscript.CallMode.kSerial) # specifying run mode + >>> mcscript.control.call(["cat"],input_lines=["a","b"]) # basic + >>> mcscript.control.call(["catx"],input_lines=["a","b"]) # for execution failure + >>> mcscript.control.call(["cat","badfile"],input_lines=["a","b"]) # for nonzero return + >>> mcscript.control.call(["cat","badfile"],input_lines=["a","b"],mode=mcscript.control.CallMode.kSerial) # specifying run mode """ diff --git a/tools/mcscript_test.py b/tools/mcscript_test.py index 427aa5a..f93ef06 100644 --- a/tools/mcscript_test.py +++ b/tools/mcscript_test.py @@ -23,11 +23,13 @@ os.environ["MCSCRIPT_EPAR"] = "-1" import mcscript +import mcscript.control +import mcscript.utils -mcscript.init() +mcscript.control.init() # test utils submodule print("Time stamp:",mcscript.utils.time_stamp()) # long form -print("Time stamp:",mcscript.time_stamp()) +print("Time stamp:",mcscript.control.time_stamp()) From 31a158677687d652d08e9968af23b18f1e178e9b Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Thu, 13 Jul 2023 19:45:35 -0500 Subject: [PATCH 05/26] qsubm: Use relative imports --- mcscript/qsubm.py | 51 +++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/mcscript/qsubm.py b/mcscript/qsubm.py index 67f0594..97b8bb7 100644 --- a/mcscript/qsubm.py +++ b/mcscript/qsubm.py @@ -83,23 +83,26 @@ import shutil import subprocess import sys - -import mcscript.config -import mcscript.utils import types +from . 
import ( + config, + task, + utils, +) + def get_user_config(): """Get user configuration from environment.""" user_config = types.SimpleNamespace() - user_config.install_home = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_INSTALL_HOME", "")) - user_config.run_home_list = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_RUN_HOME", "").split(":")) - user_config.work_home = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_WORK_HOME", "")) + user_config.install_home = utils.expand_path(os.environ.get("MCSCRIPT_INSTALL_HOME", "")) + user_config.run_home_list = utils.expand_path(os.environ.get("MCSCRIPT_RUN_HOME", "").split(":")) + user_config.work_home = utils.expand_path(os.environ.get("MCSCRIPT_WORK_HOME", "")) # optional fields - user_config.launch_dir = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_LAUNCH_DIR", "")) - user_config.python_executable = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_PYTHON", "")) - user_config.env_script = mcscript.utils.expand_path(os.environ.get("MCSCRIPT_SOURCE", "")) + user_config.launch_dir = utils.expand_path(os.environ.get("MCSCRIPT_LAUNCH_DIR", "")) + user_config.python_executable = utils.expand_path(os.environ.get("MCSCRIPT_PYTHON", "")) + user_config.env_script = utils.expand_path(os.environ.get("MCSCRIPT_SOURCE", "")) user_config.run_prefix = "run" return user_config @@ -181,7 +184,7 @@ def parse_args(): # site-local options try: - mcscript.config.qsubm_arguments(parser) + config.qsubm_arguments(parser) except AttributeError: # local config doesn't provide arguments, ignore gracefully pass @@ -289,17 +292,17 @@ def main(): editor = os.environ.get("EDITOR", "vi") os.execlp(editor, editor, job_file) elif (args.toc): - task_mode = mcscript.task.TaskMode.kTOC + task_mode = task.TaskMode.kTOC elif (args.unlock): - task_mode = mcscript.task.TaskMode.kUnlock + task_mode = task.TaskMode.kUnlock elif (args.archive): - task_mode = mcscript.task.TaskMode.kArchive + task_mode = task.TaskMode.kArchive elif 
(args.prerun): - task_mode = mcscript.task.TaskMode.kPrerun + task_mode = task.TaskMode.kPrerun elif (args.offline): - task_mode = mcscript.task.TaskMode.kOffline + task_mode = task.TaskMode.kOffline else: - task_mode = mcscript.task.TaskMode.kRun + task_mode = task.TaskMode.kRun # environment definitions: multi-task run parameters environment_definitions += [ @@ -346,15 +349,15 @@ def main(): # in case scratch is local to the compute note work_dir = os.path.join(user_config.work_home, run) ## if ( not os.path.exists(work_dir)): - ## mcscript.utils.mkdir(work_dir) + ## utils.mkdir(work_dir) environment_definitions.append(f"MCSCRIPT_WORK_DIR={work_dir}") # set up run launch directory (for batch job output logging) launch_dir_parent = os.path.join(user_config.launch_home, run) if not os.path.exists(user_config.launch_home): - mcscript.utils.mkdir(user_config.launch_home) + utils.mkdir(user_config.launch_home) if not os.path.exists(launch_dir_parent): - mcscript.utils.mkdir(launch_dir_parent) + utils.mkdir(launch_dir_parent) if args.archive: # archive mode # launch in archive directory rather than usual batch job output directory @@ -366,7 +369,7 @@ def main(): # standard run mode launch_dir = os.path.join(user_config.launch_home, run, "batch") if not os.path.exists(launch_dir): - mcscript.utils.mkdir(launch_dir) + utils.mkdir(launch_dir) environment_definitions.append(f"MCSCRIPT_LAUNCH_DIR={launch_dir}") @@ -378,8 +381,8 @@ def main(): job_name = f"{run:s}" ##job_name += "-w%d" % args.width if args.pool is not None: - job_name += "-{args.pool:s}" - job_name += "-{args.phase:s}" + job_name += f"-{args.pool:s}" + job_name += f"-{args.phase:d}" print(" Job name:", job_name) # process environment definitions @@ -416,7 +419,7 @@ def main(): if (run_mode == "batch"): # set local qsub arguments - (submission_args, submission_input_string, repetitions) = mcscript.config.submission(job_name, job_file, environment_definitions, args) + (submission_args, submission_input_string, 
repetitions) = config.submission(job_name, job_file, environment_definitions, args) # notes: options must come before command on some platforms (e.g., Univa) print(" ".join(submission_args)) @@ -446,7 +449,7 @@ def main(): popen_args = ["csh", job_file] print() print("-"*64) - if task_mode is mcscript.task.TaskMode.kRun: + if task_mode is task.TaskMode.kRun: print(f"\033]2;qsubm {run}\007") os.chdir(launch_dir) os.execvpe(popen_args[0], popen_args, env=job_environ) From ab12f006bcd6d9138a4d91d0df61e766850ad49d Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Fri, 21 Jul 2023 03:27:56 -0500 Subject: [PATCH 06/26] task: Fix archive handler if some paths missing * Restore globbing of paths passed to tar. * Reverts to previous behavior (before 7381850). --- mcscript/task.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/mcscript/task.py b/mcscript/task.py index 274ca56..cc59bd2 100644 --- a/mcscript/task.py +++ b/mcscript/task.py @@ -551,14 +551,22 @@ def archive_handler_subarchives(archive_parameters_list): archive_filename = subarchive_filename(archive_parameters) if archive_filename is None: continue - archive_filename_list += [archive_filename] print("Archive: {}".format(archive_filename)) + # check that at least some contents exist (else skip) + available_paths = [] + for path in paths: + if (os.path.isdir(path)): + available_paths.append(path) + if ((len(available_paths)==0) and (not include_metadata)): + print("None of paths {} available and no request to save metadata. 
Skipping archive...".format(paths)) + continue + # construct archive filename_list = [toc_filename] if (include_metadata): filename_list += ["flags","output","batch"] - filename_list += paths + filename_list += available_paths tar_flags = "zcvf" if compress else "cvf" control.call( [ @@ -572,6 +580,7 @@ def archive_handler_subarchives(archive_parameters_list): ] + filename_list, cwd=parameters.run.work_dir, check_return=True ) + archive_filename_list += [archive_filename] return archive_filename_list @@ -669,10 +678,18 @@ def archive_handler_subarchives_hsi(archive_parameters_list, max_size=2**41, seg archive_filename = subarchive_filename(archive_parameters) if archive_filename is None: continue - archive_filename_list += [archive_filename] print("----------------------------------------------------------------") print("Archive: {}".format(archive_filename)) + # check that at least some contents exist (else skip) + available_paths = [] + for path in paths: + if (os.path.isdir(path)): + available_paths.append(path) + if ((len(available_paths)==0) and (not include_metadata)): + print("None of paths {} available and no request to save metadata. Skipping archive...".format(paths)) + continue + # create named pipes for archives os.mkfifo(archive_filename) @@ -727,7 +744,7 @@ def archive_handler_subarchives_hsi(archive_parameters_list, max_size=2**41, seg filename_list = [toc_filename] if (include_metadata): filename_list += ["flags","output","batch"] - filename_list += paths + filename_list += available_paths tar_flags = "zcvf" if compress else "cvf" try: control.call( @@ -761,6 +778,8 @@ def archive_handler_subarchives_hsi(archive_parameters_list, max_size=2**41, seg if returncode != 0: raise exception.ScriptError("nonzero return") + archive_filename_list += [archive_filename] + return archive_filename_list ################################################################ From 3e91e4905d234e710084b419e2c3c6f2e45364a7 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Caprio" Date: Fri, 28 Jul 2023 06:51:11 -0700 Subject: [PATCH 07/26] config-slurm-nersc: Remove support for Cori --- config/config-slurm-nersc.py | 123 ++++++++++++++--------------------- 1 file changed, 50 insertions(+), 73 deletions(-) diff --git a/config/config-slurm-nersc.py b/config/config-slurm-nersc.py index 11ad369..e000d4b 100644 --- a/config/config-slurm-nersc.py +++ b/config/config-slurm-nersc.py @@ -55,18 +55,9 @@ + 08/05/22 (pjf): Fix job_id() for array jobs. + 09/20/22 (pjf): Prevent use of `--jobs` with `--time-min`. + 12/15/22 (mac): Revert default license to uppercase SCRATCH on Cori. + + 07/28/23 (mac): Remove support for Cori. """ -# Notes: -# -# Cori Haswell -# -# http://www.nersc.gov/users/computational-systems/cori/running-jobs/general-running-jobs-recommendations/ -# -# Common options: -# -# --opt="--mail-type=ALL" - import datetime import os import sys @@ -84,41 +75,41 @@ cluster_specs = { - "cori": { - "default": os.environ.get("CRAY_CPU_TARGET"), - "node_types": { - "haswell": { - "constraint": "haswell", - "core_specialization": True, - "queues": ["regular", "shared", "interactive", "debug", "premium", "flex", "overrun"], - "cores_per_node": 32, - "threads_per_core": 2, - "domains_per_node": 2, - "cores_per_domain": 16, - "nodes_per_switch": 384, - }, - "mic-knl": { - "core_specialization": True, - "constraint": "knl,quad,cache", - "queues": ["regular", "interactive", "debug", "premium", "low", "flex", "overrun"], - "cores_per_node": 68, - "threads_per_core": 4, - "domains_per_node": 4, - "cores_per_domain": 16, - "nodes_per_switch": 384, - }, - "cmem": { - "constraint": "amd", - "core_specialization": True, - "queues": ["bigmem", "interactive", "shared"], - "cores_per_node": 32, - "threads_per_core": 2, - "domains_per_node": 2, - "cores_per_domain": 16, - "nodes_per_switch": 1, - } - }, - }, + ## "cori": { + ## "default": os.environ.get("CRAY_CPU_TARGET"), + ## "node_types": { + ## "haswell": { + ## "constraint": "haswell", + ## 
"core_specialization": True, + ## "queues": ["regular", "shared", "interactive", "debug", "premium", "flex", "overrun"], + ## "cores_per_node": 32, + ## "threads_per_core": 2, + ## "domains_per_node": 2, + ## "cores_per_domain": 16, + ## "nodes_per_switch": 384, + ## }, + ## "mic-knl": { + ## "core_specialization": True, + ## "constraint": "knl,quad,cache", + ## "queues": ["regular", "interactive", "debug", "premium", "low", "flex", "overrun"], + ## "cores_per_node": 68, + ## "threads_per_core": 4, + ## "domains_per_node": 4, + ## "cores_per_domain": 16, + ## "nodes_per_switch": 384, + ## }, + ## "cmem": { + ## "constraint": "amd", + ## "core_specialization": True, + ## "queues": ["bigmem", "interactive", "shared"], + ## "cores_per_node": 32, + ## "threads_per_core": 2, + ## "domains_per_node": 2, + ## "cores_per_domain": 16, + ## "nodes_per_switch": 1, + ## } + ## }, + ## }, "perlmutter": { "default": "cpu", "node_types": { @@ -240,10 +231,7 @@ def qsubm_arguments(parser): "--switchwaittime", type=str, default="12:00:00", help="maximum time to wait for switch count; 0 disables constraint" ) - if nersc_host == "cori": - default_licenses = "SCRATCH,cfs" - else: - default_licenses = "scratch,cfs" + default_licenses = "scratch,cfs" group.add_argument( "--licenses", type=str, default=default_licenses, help="licenses to request for job" @@ -400,9 +388,7 @@ def submission(job_name,job_file,qsubm_path,environment_definitions,args): if args.jobs > 1: submission_invocation += ["--array={:g}-{:g}".format(0, args.jobs-1)] - if (nersc_host == "cori") and (args.queue in ["xfer", "compile"]): - control.module(["load", "esslurm"]) - elif args.queue in node_spec["queues"]: + if args.queue in node_spec["queues"]: # target cpu submission_invocation += ["--constraint={}".format(node_constraint)] @@ -546,26 +532,17 @@ def serial_invocation(base): # run on front end invocation = base else: - if (os.getenv("NERSC_HOST") == "cori") and (parameters.run.num_workers == 1): - # run unwrapped 
on Cori - invocation = base - else: - # run on compute node - invocation = [ - "srun", - "--ntasks={}".format(1), - "--nodes={}".format(1), - "--cpus-per-task={}".format(parameters.run.serial_threads), - "--export=ALL" - ] - - # 7/29/17 (mac): cpu-bind=cores is now recommended for edison as well - # cpu-bind=cores is recommended for cori but degrades performance on edison (mac, 4/3/17) - invocation += [ - "--cpu-bind=cores" - ] - - invocation += base + # run on compute node + invocation = [ + "srun", + "--ntasks={}".format(1), + "--nodes={}".format(1), + "--cpus-per-task={}".format(parameters.run.serial_threads), + "--export=ALL", + "--cpu-bind=cores", + ] + + invocation += base return invocation @@ -705,7 +682,7 @@ def init(): """ Do any local setup tasks. Invoked after mcscript sets the various configuration variables - and changed the cwd to the scratch directory. + and changes the cwd to the scratch directory. """ # attach signal handler for USR1 From ea09339ebfba8b4c425f376d18363db58606dcfd Mon Sep 17 00:00:00 2001 From: "Mark A. Caprio" Date: Fri, 28 Jul 2023 12:49:05 -0700 Subject: [PATCH 08/26] qsubm: Simplify argument handling for local runs --- qsubm.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/qsubm.py b/qsubm.py index bed228b..c2a7796 100644 --- a/qsubm.py +++ b/qsubm.py @@ -76,6 +76,7 @@ - Add `--edit` mode. - Update xterm title when running directly. + 09/20/22 (pjf): Use os.exec instead of subprocess for local run_mode. + + 07/28/23 (mac/slv): Simplify argument handling for local runs (replace "RUN" with None as default queue). 
""" import argparse @@ -92,13 +93,12 @@ ################################################################ parser = argparse.ArgumentParser( - description="Queue submission for numbered run.", + description="Set up execution environment for run script and submit to batch system (or launch local interactive execution).", usage= - "%(prog)s [option] run queue|RUN wall [var1=val1, ...]\n", + "%(prog)s [option] run [queue] [wall] [var1=val1, ...]\n", formatter_class=argparse.ArgumentDefaultsHelpFormatter, epilog= - """Simply omit the queue name and leave off the wall time for a - local interactive run. + """For a local (interactive) run, simply omit the queue name (and wall time) arguments. Environment variables for qsubm are described in INSTALL.md. @@ -114,7 +114,7 @@ # general arguments parser.add_argument("run", help="Run number (e.g., 0000 for run0000)") # latter arguments are made optional to simplify bare-bones syntax for --toc, etc., calls -parser.add_argument("queue", nargs='?', help="Submission queue, or RUN for direct interactive run", default="RUN") +parser.add_argument("queue", nargs='?', help="Submission queue (or omit for local interactive run)", default=None) parser.add_argument("wall", type=int, nargs='?', help="Wall time (minutes)", default=60) ##parser.add_argument("vars", nargs="?", help="Environment variables to pass to script, with optional values, comma delimited (e.g., METHOD2, PARAM=1.0)") parser.add_argument("--here", action="store_true", help="Force run in current working directory") @@ -267,14 +267,11 @@ # set queue and flag batch or local mode # force local run for task.py toc mode -if ((args.queue == "RUN") or args.toc or args.unlock): +if ((args.queue is None) or args.toc or args.unlock): run_mode = "local" - run_queue = "local" - print(" Mode:", run_mode) else: run_mode = "batch" - run_queue = args.queue - print(" Mode:", run_mode, "(%s)" % args.queue) +print(" Mode: {:s} (Queue: {:s})".format(run_mode,str(args.queue))) # set wall time 
wall_time_min = args.wall @@ -286,7 +283,7 @@ "MCSCRIPT_RUN={:s}".format(run), "MCSCRIPT_JOB_FILE={:s}".format(job_file), "MCSCRIPT_RUN_MODE={:s}".format(run_mode), - "MCSCRIPT_RUN_QUEUE={:s}".format(run_queue), + "MCSCRIPT_RUN_QUEUE={:s}".format(str(args.queue)), "MCSCRIPT_WALL_SEC={:d}".format(wall_time_sec), "MCSCRIPT_WORKERS={:d}".format(args.workers), ] From f7d941f4a35be25c159067c67b6d5f03ab5ba691 Mon Sep 17 00:00:00 2001 From: "Mark A. Caprio" Date: Thu, 31 Aug 2023 23:45:42 -0400 Subject: [PATCH 09/26] config/config-slurm-nersc: Provide GPU support on Perlmutter --- config/config-slurm-nersc.py | 45 +++++++++++++++++++++++-------- config/nersc_select_gpu_device.sh | 5 ++++ 2 files changed, 39 insertions(+), 11 deletions(-) create mode 100755 config/nersc_select_gpu_device.sh diff --git a/config/config-slurm-nersc.py b/config/config-slurm-nersc.py index e000d4b..a549629 100644 --- a/config/config-slurm-nersc.py +++ b/config/config-slurm-nersc.py @@ -335,12 +335,12 @@ def submission(job_name,job_file,qsubm_path,environment_definitions,args): raise exception.ScriptError( "ensure 'cmem' module is loaded when using --node-type=cmem" ) - elif (node_type in ["haswell", "mic-knl"]) and (node_type != os.environ.get("CRAY_CPU_TARGET", "")): - raise exception.ScriptError( - "--node-type={:s} does not match CRAY_CPU_TARGET={:s}".format( - node_type, os.environ.get("CRAY_CPU_TARGET", "") - ) - ) + ## elif (node_type in ["haswell", "mic-knl"]) and (node_type != os.environ.get("CRAY_CPU_TARGET", "")): + ## raise exception.ScriptError( + ## "--node-type={:s} does not match CRAY_CPU_TARGET={:s}".format( + ## node_type, os.environ.get("CRAY_CPU_TARGET", "") + ## ) + ## ) # check for multiple workers with requeueable jobs if args.time_min and (args.workers > 1): @@ -384,12 +384,19 @@ def submission(job_name,job_file,qsubm_path,environment_definitions,args): if (node_spec["core_specialization"]) and (args.nodes > 1): submission_invocation += 
["--core-spec={}".format(node_cores-(domain_cores*node_domains))] + # gpu options + if node_type == "gpu": + # assumes typical configuration of single GPU per MPI rank + # https://docs.nersc.gov/jobs/affinity/#perlmutter + submission_invocation += ["--gpus-per-task=1"] + submission_invocation += ["--gpu-bind=none"] + # job array for repetitions if args.jobs > 1: submission_invocation += ["--array={:g}-{:g}".format(0, args.jobs-1)] if args.queue in node_spec["queues"]: - # target cpu + # target cpu/gpu submission_invocation += ["--constraint={}".format(node_constraint)] if slurm_time_to_seconds(args.switchwaittime) > 0: @@ -598,14 +605,30 @@ def hybrid_invocation(base): "--cpus-per-task={}".format(parameters.run.hybrid_threads), "--export=ALL" ] - # 4/3/17 (mac): cpu-bind=cores is recommended for cori but degrades performance on edison - # 7/29/17 (mac): cpu-bind=cores is now recommended for edison as well + + # buffering + # recommended by pm 02/02/23 + invocation += [ + "--unbuffered" + ] + + # cpu binding invocation += [ "--cpu-bind=cores" ] - # use local path instead + # executable wrapper for GPU affinity + gpu_enabled = os.environ.get("MPICH_GPU_SUPPORT_ENABLED")=="1" + if gpu_enabled: + executable_wrapper_path = os.path.join(os.environ["MCSCRIPT_DIR"], "config", "nersc_select_gpu_device.sh") + if (parameters.run.hybrid_nodes >= 128): + executable_wrapper_path = broadcast_executable(executable_wrapper_path) + invocation += [executable_wrapper_path] + + # executable invocation += [executable_path] + + # arguments invocation += base[1:] return invocation @@ -696,7 +719,7 @@ def init(): parameters.run.install_dir, cpu_target ) - # get extract metadata from Slurm + # extract metadata from Slurm if job_id() != "0": # get hostname parameters.run.host_name = subprocess.run( diff --git a/config/nersc_select_gpu_device.sh b/config/nersc_select_gpu_device.sh new file mode 100755 index 0000000..03d95d3 --- /dev/null +++ b/config/nersc_select_gpu_device.sh @@ -0,0 +1,5 @@ 
+#!/bin/bash +# select_cpu_device wrapper script +# based on https://docs.nersc.gov/jobs/affinity/#perlmutter +export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID +exec $* From c2de5474705800fb7763ead19b1773acc857a6f2 Mon Sep 17 00:00:00 2001 From: "Mark A. Caprio" Date: Fri, 8 Sep 2023 20:23:51 -0400 Subject: [PATCH 10/26] config/config-slurm-nersc: Add gpu-hbm80g node type --- config/config-slurm-nersc.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/config/config-slurm-nersc.py b/config/config-slurm-nersc.py index a549629..81c76c6 100644 --- a/config/config-slurm-nersc.py +++ b/config/config-slurm-nersc.py @@ -134,6 +134,17 @@ "gpus_per_node": 4, "nodes_per_switch": 128, }, + "gpu-hbm80g": { + "queues": ["regular", "interactive", "debug", "preempt", "overrun"], + "core_specialization": False, + "constraint": "gpu&hbm80g", + "cores_per_node": 64, + "threads_per_core": 2, + "domains_per_node": 4, + "cores_per_domain": 16, + "gpus_per_node": 4, + "nodes_per_switch": 128, + }, }, }, } From 23f22b328cfe467795e14658ea6c0fa62a69e788 Mon Sep 17 00:00:00 2001 From: "Mark A. Caprio" Date: Sun, 10 Sep 2023 13:21:31 -0500 Subject: [PATCH 11/26] qsubm: Provide diagnostic environment variables MCSCRIPT_QSUBM_INVOCATION and MCSCRIPT_SUBMISSION_INVOCATION --- mcscript/parameters.py | 26 +++++++++++++++++--------- qsubm.py | 12 +++++++++--- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/mcscript/parameters.py b/mcscript/parameters.py index 5a67b6d..e51b4dc 100644 --- a/mcscript/parameters.py +++ b/mcscript/parameters.py @@ -19,6 +19,8 @@ - Remove MCSCRIPT_HYBRID_NODESIZE. + 06/02/20 (pjf): Add methods get_elapsed_time() and get_remaining_time(). + 10/11/20 (pjf): Add num_workers to parameters. + + 09/10/23 (mac): Support diagnostic environment variables MCSCRIPT_QSUBM_INVOCATION + and MCSCRIPT_SUBMISSION_INVOCATION. 
""" import os @@ -93,6 +95,10 @@ def populate(self): self.hybrid_ranks = int(os.environ["MCSCRIPT_HYBRID_RANKS"]) self.hybrid_threads = int(os.environ["MCSCRIPT_HYBRID_THREADS"]) + # environment definitions: diagnostic + self.qsubm_invocation = os.getenv("MCSCRIPT_QSUBM_INVOCATION") + self.submission_invocation = os.getenv("MCSCRIPT_SUBMISSION_INVOCATION") + # generate local definitions # # To be provided by local configuration init. @@ -121,15 +127,17 @@ def run_data_string(self): message = "\n".join( [ - "Run: {}".format(self.name), - "Job file: {}".format(self.job_file), - "Job ID: {}".format(self.job_id), - "Host name: {}".format(self.host_name), - "Batch mode: {}".format(self.batch_mode), - "Batch launch directory: {}".format(self.launch_dir), - "Work directory: {}".format(self.work_dir), - "Install directory: {}".format(self.install_dir), - "Wall time: {} sec (={:.2f} min)".format(self.wall_time_sec,self.wall_time_sec/60) + "Run: {}".format(self.name), + "Job file: {}".format(self.job_file), + "Job ID: {}".format(self.job_id), + "Host name: {}".format(self.host_name), + "Batch mode: {}".format(self.batch_mode), + "Batch launch directory: {}".format(self.launch_dir), + "Work directory: {}".format(self.work_dir), + "Install directory: {}".format(self.install_dir), + "Wall time: {} sec (={:.2f} min)".format(self.wall_time_sec,self.wall_time_sec/60), + "Invocation (qsubm): {}".format(self.qsubm_invocation), + "Invocation (submission): {}".format(self.submission_invocation), ] ) diff --git a/qsubm.py b/qsubm.py index c2a7796..7f79ebe 100644 --- a/qsubm.py +++ b/qsubm.py @@ -77,6 +77,8 @@ - Update xterm title when running directly. + 09/20/22 (pjf): Use os.exec instead of subprocess for local run_mode. + 07/28/23 (mac/slv): Simplify argument handling for local runs (replace "RUN" with None as default queue). + + 09/10/23 (mac): Provide diagnostic environment variables MCSCRIPT_QSUBM_INVOCATION + and MCSCRIPT_SUBMISSION_INVOCATION. 
""" import argparse @@ -300,7 +302,6 @@ "MCSCRIPT_HYBRID_THREADS={:d}".format(args.threads), ] - # set multi-task run parameters if (args.edit): editor = os.environ.get("EDITOR", "vi") @@ -427,7 +428,7 @@ # for local run job_environ=os.environ environment_keyvalues = [ - entry.split("=") + tuple(entry.split("=", maxsplit=1)) # maxsplit is to support values which themselves contain an equals sign for entry in environment_definitions ] job_environ.update(dict(environment_keyvalues)) @@ -441,12 +442,16 @@ print() sys.stdout.flush() +# quiet environment definitions: diagnostic +os.environ["MCSCRIPT_QSUBM_INVOCATION"] = "{}".format(sys.argv) + # handle batch run if (run_mode == "batch"): # set local qsub arguments (submission_args, submission_input_string, repetitions) = mcscript.config.submission(job_name, job_file, qsubm_path, environment_definitions, args) - + os.environ["MCSCRIPT_SUBMISSION_INVOCATION"] = submission_args + # notes: options must come before command on some platforms (e.g., Univa) print(" ".join(submission_args)) print(submission_input_string) @@ -478,4 +483,5 @@ if task_mode is mcscript.task.TaskMode.kRun: print(f"\033]2;qsubm {run}\007") os.chdir(launch_dir) + os.environ["MCSCRIPT_SUBMISSION_INVOCATION"] = "{}".format(popen_args) os.execvpe(popen_args[0], popen_args, env=job_environ) From 4c100f8e082bc800f3b88c4897596bc8ead517ab Mon Sep 17 00:00:00 2001 From: "Mark A. 
Caprio" Date: Sun, 10 Sep 2023 13:24:32 -0500 Subject: [PATCH 12/26] qsubm: Fix MCSCRIPT_SUBMISSION_INVOCATION --- qsubm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qsubm.py b/qsubm.py index 7f79ebe..c1439c2 100644 --- a/qsubm.py +++ b/qsubm.py @@ -450,7 +450,7 @@ # set local qsub arguments (submission_args, submission_input_string, repetitions) = mcscript.config.submission(job_name, job_file, qsubm_path, environment_definitions, args) - os.environ["MCSCRIPT_SUBMISSION_INVOCATION"] = submission_args + os.environ["MCSCRIPT_SUBMISSION_INVOCATION"] = "{}".format(submission_args) # notes: options must come before command on some platforms (e.g., Univa) print(" ".join(submission_args)) From 9f22aad24d0ac2e05702c53ecd5c20fb814aeedc Mon Sep 17 00:00:00 2001 From: "Mark A. Caprio" Date: Wed, 27 Sep 2023 21:52:57 -0400 Subject: [PATCH 13/26] config/select_gpu_device: Update comment --- config/nersc_select_gpu_device.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/nersc_select_gpu_device.sh b/config/nersc_select_gpu_device.sh index 03d95d3..8e9d807 100755 --- a/config/nersc_select_gpu_device.sh +++ b/config/nersc_select_gpu_device.sh @@ -1,5 +1,5 @@ #!/bin/bash -# select_cpu_device wrapper script +# select_gpu_device wrapper script # based on https://docs.nersc.gov/jobs/affinity/#perlmutter export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID exec $* From 3d995876b017ab98b7bfdeeb685e518de3b69784 Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Fri, 20 Oct 2023 14:14:19 -0500 Subject: [PATCH 14/26] config: Avoid use of srun for serial invocation at NERSC --- mcscript/config/slurm_nersc.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mcscript/config/slurm_nersc.py b/mcscript/config/slurm_nersc.py index e55646b..41e008b 100644 --- a/mcscript/config/slurm_nersc.py +++ b/mcscript/config/slurm_nersc.py @@ -56,6 +56,7 @@ + 09/20/22 (pjf): Prevent use of `--jobs` with `--time-min`. 
+ 12/15/22 (mac): Revert default license to uppercase SCRATCH on Cori. + 07/28/23 (mac): Remove support for Cori. + + 10/20/23 (pjf): Avoid use of srun for serial invocation. """ import datetime @@ -550,7 +551,11 @@ def serial_invocation(base): # # srun --export=ALL ... - if (not os.environ.get("SLURM_JOB_ID")): + # NERSC machines no longer use MOM nodes; OpenMP-only executions should + # generally not use srun to avoid srun delays. However, if using multiple + # workers, srun is (unfortunately) required in order to distribute serial + # tasks across nodes. + if (not os.environ.get("SLURM_JOB_ID")) or (parameters.run.num_workers == 1): # run on front end invocation = base else: @@ -566,6 +571,8 @@ def serial_invocation(base): invocation += base + invocation = base + return invocation def broadcast_executable(executable_path): From 6d072632f9d95ab8cc253826d4bd9172a73e8645 Mon Sep 17 00:00:00 2001 From: "Mark A. Caprio" Date: Sun, 22 Oct 2023 08:24:48 -0700 Subject: [PATCH 15/26] task: Update traceback.print_exception() usage for Python 3.10 --- mcscript/task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mcscript/task.py b/mcscript/task.py index 17e9c3d..42b2565 100644 --- a/mcscript/task.py +++ b/mcscript/task.py @@ -1313,7 +1313,7 @@ def do_task(task_parameters,task,phase_handlers): raise except BaseException as err: # on failure, flag failure and propagate exception so script terminates - traceback.print_exception(etype=type(err), value=err, tb=err.__traceback__, file=sys.stdout) + traceback.print_exception(err, value=err, tb=err.__traceback__, file=sys.stdout) if task_mode is TaskMode.kRun: # process timing task_end_time = time.time() @@ -1507,7 +1507,7 @@ def task_master(task_parameters,task_list,phase_handlers,archive_phase_handlers) # consider an early termination to be successful control.termination(success=True, complete=False) except BaseException as err: - traceback.print_exception(etype=type(err), value=err, 
tb=err.__traceback__) + traceback.print_exception(err, value=err, tb=err.__traceback__) control.termination(success=False) else: raise(exception.ScriptError("Unsupported run mode: {:s}".format(task_mode))) From dced88ee9c6ee86bd10e85394a7cc26626b18dcd Mon Sep 17 00:00:00 2001 From: "Mark A. Caprio" Date: Sun, 22 Oct 2023 08:53:53 -0700 Subject: [PATCH 16/26] qsubm: Fix naming discrepancy involving user_config.launch_dir --- mcscript/qsubm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcscript/qsubm.py b/mcscript/qsubm.py index 0e7b2c6..e145d65 100644 --- a/mcscript/qsubm.py +++ b/mcscript/qsubm.py @@ -103,7 +103,7 @@ def get_user_config(): user_config.work_home = utils.expand_path(os.environ.get("MCSCRIPT_WORK_HOME", "")) # optional fields - user_config.launch_dir = utils.expand_path(os.environ.get("MCSCRIPT_LAUNCH_DIR", "")) + user_config.launch_home = utils.expand_path(os.environ.get("MCSCRIPT_LAUNCH_HOME", "")) user_config.python_executable = utils.expand_path(os.environ.get("MCSCRIPT_PYTHON", "")) user_config.env_script = utils.expand_path(os.environ.get("MCSCRIPT_SOURCE", "")) user_config.run_prefix = "run" return user_config From fdfb799d8587d5e9c96e50fd2b58f2d61c067c5f Mon Sep 17 00:00:00 2001 From: "Mark A. Caprio" Date: Sat, 28 Oct 2023 12:48:40 -0700 Subject: [PATCH 17/26] utils: Overhaul docstring for search_in_subdirectories() --- mcscript/utils.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/mcscript/utils.py b/mcscript/utils.py index f6bf913..88b41e2 100644 --- a/mcscript/utils.py +++ b/mcscript/utils.py @@ -326,19 +326,31 @@ def search_in_subdirectories( """Search for file in a list of subdirectories, beneath a given base path (or list of base paths).
+ + Example: + + >>> base_path_list = ["/base_path_1", "/base_path_2"] + >>> subdirectory_list = ["subdirectory_1", "subdirectory_2"] + >>> filename = "the_file_we_are_trying_to_find.txt" + >>> search_in_subdirectories(base_path_list, subdirectory_list, filename) + + "/base_path_2/subdirectory_1/the_file_we_are_trying_to_find.txt" + Arguments: - base_path_or_list (str or list of str): base path in which to search - subdirectories (may alternatively be list of base paths) - subdirectory_path_or_list (list of str, optional, repeatable): - subdirectories to search - filenames (str or list of str): file name(s) (or base file name(s)) to match + + path_or_list_1, ... (str or list[str]): path segment (or list of + possible values to iteratively search) + base (bool, optional): whether to accept given search string as filename root rather than exact match (then just return this base in the result) + fail_on_not_found (bool, optional): whether to raise exception on failure to match (else returns None) + error_message (str, optional): custom error message to display on file not found + verbose (bool, optional): whether to print log messages From 155990c0f4ce99876172ef8e2e65552d2e0741b5 Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Fri, 10 Nov 2023 08:12:52 -0800 Subject: [PATCH 18/26] wrapper: Migrate from pkg_resources to importlib_resources --- mcscript/config/slurm_nersc.py | 45 ++++++++++++++++++++++------------ mcscript/config/torque_oak.py | 31 +++++++++++++++-------- mcscript/config/uge_ndcrc.py | 31 +++++++++++++++-------- mcscript/qsubm.py | 10 +++++++- 4 files changed, 80 insertions(+), 37 deletions(-) diff --git a/mcscript/config/slurm_nersc.py b/mcscript/config/slurm_nersc.py index 41e008b..8105e2d 100644 --- a/mcscript/config/slurm_nersc.py +++ b/mcscript/config/slurm_nersc.py @@ -57,17 +57,22 @@ + 12/15/22 (mac): Revert default license to uppercase SCRATCH on Cori. + 07/28/23 (mac): Remove support for Cori.
+ 10/20/23 (pjf): Avoid use of srun for serial invocation. + + 11/10/23 (pjf): + - Migrate from pkg_resources to importlib_resources. + - Copy wrapper script to launch_dir to ensure existence. """ import datetime import os import sys import math +import pathlib import signal +import stat import subprocess import shutil import re -import pkg_resources +import importlib_resources from .. import ( control, @@ -447,18 +452,23 @@ def submission(job_name,job_file,environment_definitions,args): # calls interpreter explicitly, so do not have to rely upon default python # version or shebang line in script if "csh" in os.environ.get("SHELL", ""): - job_wrapper = pkg_resources.resource_filename( - "mcscript", "job_wrappers/csh_job_wrapper.csh" - ) + job_wrapper_name = "csh_job_wrapper.csh" elif "bash" in os.environ.get("SHELL", ""): - job_wrapper = pkg_resources.resource_filename( - "mcscript", "job_wrappers/bash_job_wrapper.sh" - ) + job_wrapper_name = "bash_job_wrapper.sh" else: - job_wrapper = None + job_wrapper_name = None + + if job_wrapper_name: + # copy job wrapper to launch directory + job_wrapper_source = ( + importlib_resources.files('mcscript') / "job_wrappers" / job_wrapper_name + ) + job_wrapper = pathlib.Path(parameters.run.launch_dir) / job_wrapper_name + with importlib_resources.as_file(job_wrapper_source) as path: + shutil.copyfile(path, job_wrapper) + job_wrapper.chmod(job_wrapper.stat().st_mode | stat.S_IEXEC) - if job_wrapper: - submission_invocation += [job_wrapper] + submission_invocation += [str(job_wrapper)] # use GNU parallel to launch multiple workers per job if args.workers > 1: @@ -642,12 +652,15 @@ def hybrid_invocation(base): # executable wrapper for GPU affinity gpu_enabled = os.environ.get("MPICH_GPU_SUPPORT_ENABLED")=="1" if gpu_enabled: - executable_wrapper_path = pkg_resources.resource_filename( - "mcscript", "job_wrappers/nersc_select_gpu_device.sh" - ) - if (parameters.run.hybrid_nodes >= 128): - executable_wrapper_path = 
broadcast_executable(executable_wrapper_path) - invocation += [executable_wrapper_path] + ##executable_wrapper_path = pkg_resources.resource_filename( + ## "mcscript", "job_wrappers/nersc_select_gpu_device.sh" + ##) + ##if (parameters.run.hybrid_nodes >= 128): + ## executable_wrapper_path = broadcast_executable(executable_wrapper_path) + ##invocation += [executable_wrapper_path] + invocation += [ + "--gpus-per-task=1" + ] # executable invocation += [executable_path] diff --git a/mcscript/config/torque_oak.py b/mcscript/config/torque_oak.py index faa04ab..f97d0d6 100644 --- a/mcscript/config/torque_oak.py +++ b/mcscript/config/torque_oak.py @@ -13,12 +13,18 @@ from torque OpenPBS v2.3 and mpiexec from Intel MPI Library for Linux Version 2017: + 10/11/20 (pjf): Rename `--num` to `--jobs`. + + 11/10/23 (pjf): + - Migrate from pkg_resources to importlib_resources. + - Copy wrapper script to launch_dir to ensure existence. """ import math import os -import pkg_resources +import pathlib +import shutil +import stat +import importlib_resources from .. 
import parameters @@ -149,18 +155,23 @@ def submission(job_name, job_file, environment_definitions, args): # calls interpreter explicitly, so do not have to rely upon default python # version or shebang line in script if "csh" in os.environ.get("SHELL", ""): - job_wrapper = pkg_resources.resource_filename( - "mcscript", "job_wrappers/csh_job_wrapper.csh" - ) + job_wrapper_name = "csh_job_wrapper.csh" elif "bash" in os.environ.get("SHELL", ""): - job_wrapper = pkg_resources.resource_filename( - "mcscript", "job_wrappers/bash_job_wrapper.sh" - ) + job_wrapper_name = "bash_job_wrapper.sh" else: - job_wrapper = None + job_wrapper_name = None + + if job_wrapper_name: + # copy job wrapper to launch directory + job_wrapper_source = ( + importlib_resources.files('mcscript') / "job_wrappers" / job_wrapper_name + ) + job_wrapper = pathlib.Path(parameters.run.launch_dir) / job_wrapper_name + with importlib_resources.as_file(job_wrapper_source) as path: + shutil.copyfile(path, job_wrapper) + job_wrapper.chmod(job_wrapper.stat().st_mode | stat.S_IEXEC) - if job_wrapper: - submission_invocation += [job_wrapper] + submission_invocation += [str(job_wrapper)] # standard input for submission submission_string = "" diff --git a/mcscript/config/uge_ndcrc.py b/mcscript/config/uge_ndcrc.py index 92d40ac..47c9afe 100644 --- a/mcscript/config/uge_ndcrc.py +++ b/mcscript/config/uge_ndcrc.py @@ -18,6 +18,9 @@ - Pass entire environment. - Completely rewrite mapping and binding logic. + 10/11/20 (pjf): Rename `--num` to `--jobs`. + + 11/10/23 (pjf): + - Migrate from pkg_resources to importlib_resources. + - Copy wrapper script to launch_dir to ensure existence. """ @@ -69,7 +72,10 @@ import math import os -import pkg_resources +import pathlib +import shutil +import stat +import importlib_resources from .. 
import parameters @@ -190,18 +196,23 @@ def submission(job_name, job_file, environment_definitions, args): # calls interpreter explicitly, so do not have to rely upon default python # version or shebang line in script if "csh" in os.environ.get("SHELL", ""): - job_wrapper = pkg_resources.resource_filename( - "mcscript", "job_wrappers/csh_job_wrapper.csh" - ) + job_wrapper_name = "csh_job_wrapper.csh" elif "bash" in os.environ.get("SHELL", ""): - job_wrapper = pkg_resources.resource_filename( - "mcscript", "job_wrappers/bash_job_wrapper.sh" - ) + job_wrapper_name = "bash_job_wrapper.sh" else: - job_wrapper = None + job_wrapper_name = None + + if job_wrapper_name: + # copy job wrapper to launch directory + job_wrapper_source = ( + importlib_resources.files('mcscript') / "job_wrappers" / job_wrapper_name + ) + job_wrapper = pathlib.Path(parameters.run.launch_dir) / job_wrapper_name + with importlib_resources.as_file(job_wrapper_source) as path: + shutil.copyfile(path, job_wrapper) + job_wrapper.chmod(job_wrapper.stat().st_mode | stat.S_IEXEC) - if job_wrapper: - submission_invocation += [job_wrapper] + submission_invocation += [str(job_wrapper)] submission_invocation += [ os.environ["MCSCRIPT_PYTHON"], diff --git a/mcscript/qsubm.py b/mcscript/qsubm.py index e145d65..8e8e7f9 100644 --- a/mcscript/qsubm.py +++ b/mcscript/qsubm.py @@ -79,6 +79,8 @@ + 07/28/23 (mac/slv): Simplify argument handling for local runs (replace "RUN" with None as default queue). + 09/10/23 (mac): Provide diagnostic environment variables MCSCRIPT_QSUBM_INVOCATION and MCSCRIPT_SUBMISSION_INVOCATION. + + 11/10/23 (pjf): Populate selected values in parameters.run for use by + config.submission(). """ import argparse @@ -90,6 +92,7 @@ from . 
import ( config, + parameters, task, utils, ) @@ -221,7 +224,7 @@ def main(): user_config.launch_home = user_config.work_home if not user_config.python_executable: - python_executable = "python3" + user_config.python_executable = "python3" if not user_config.install_home: print("MCSCRIPT_INSTALL_HOME not found in environment") @@ -273,6 +276,9 @@ def main(): "MCSCRIPT_WALL_SEC={:d}".format(wall_time_sec), "MCSCRIPT_WORKERS={:d}".format(args.workers), ] + parameters.run.name = run + parameters.run.job_file = job_file + parameters.run.run_queue = str(args.queue) # environment definitions: serial run parameters environment_definitions += [ @@ -351,6 +357,7 @@ def main(): ## if ( not os.path.exists(work_dir)): ## utils.mkdir(work_dir) environment_definitions.append(f"MCSCRIPT_WORK_DIR={work_dir}") + parameters.run.work_dir = work_dir # set up run launch directory (for batch job output logging) launch_dir_parent = os.path.join(user_config.launch_home, run) @@ -371,6 +378,7 @@ def main(): if not os.path.exists(launch_dir): utils.mkdir(launch_dir) environment_definitions.append(f"MCSCRIPT_LAUNCH_DIR={launch_dir}") + parameters.run.launch_dir = launch_dir ################################################################ From 9a508070e7d5b3f327fc1872ee397a9206eaf2df Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Fri, 19 Jan 2024 21:06:42 -0600 Subject: [PATCH 19/26] qsubm: Ensure that MCSCRIPT_PYTHON is always set --- mcscript/qsubm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mcscript/qsubm.py b/mcscript/qsubm.py index 8e8e7f9..c60db03 100644 --- a/mcscript/qsubm.py +++ b/mcscript/qsubm.py @@ -81,6 +81,7 @@ and MCSCRIPT_SUBMISSION_INVOCATION. + 11/10/23 (pjf): Populate selected values in parameters.run for use by config.submission(). + + 01/19/24 (pjf): Ensure `MCSCRIPT_PYTHON` is always set. 
""" import argparse @@ -329,6 +330,11 @@ def main(): f"MCSCRIPT_INSTALL_HOME={user_config.install_home:s}" ] + # environment definitions: set MCSCRIPT_PYTHON + environment_definitions += [ + f"MCSCRIPT_PYTHON={user_config.python_executable:s}" + ] + # include additional environment setup if defined if user_config.env_script: environment_definitions += [ From 4c806d7d72ecea33a1f05fa0666dcbb66ae71e35 Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Fri, 19 Jan 2024 21:11:47 -0600 Subject: [PATCH 20/26] qsubm: Gracefully handle empty paths in environment variables --- mcscript/qsubm.py | 12 ++++++------ mcscript/utils.py | 9 +++++++-- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/mcscript/qsubm.py b/mcscript/qsubm.py index c60db03..b9d9f89 100644 --- a/mcscript/qsubm.py +++ b/mcscript/qsubm.py @@ -102,14 +102,14 @@ def get_user_config(): """Get user configuration from environment.""" user_config = types.SimpleNamespace() - user_config.install_home = utils.expand_path(os.environ.get("MCSCRIPT_INSTALL_HOME", "")) - user_config.run_home_list = utils.expand_path(os.environ.get("MCSCRIPT_RUN_HOME", "").split(":")) - user_config.work_home = utils.expand_path(os.environ.get("MCSCRIPT_WORK_HOME", "")) + user_config.install_home = utils.expand_path(os.environ.get("MCSCRIPT_INSTALL_HOME")) + user_config.run_home_list = utils.expand_path(os.environ.get("MCSCRIPT_RUN_HOME", ".").split(":")) + user_config.work_home = utils.expand_path(os.environ.get("MCSCRIPT_WORK_HOME")) # optional fields - user_config.launch_home = utils.expand_path(os.environ.get("MCSCRIPT_LAUNCH_HOME", "")) - user_config.python_executable = utils.expand_path(os.environ.get("MCSCRIPT_PYTHON", "")) - user_config.env_script = utils.expand_path(os.environ.get("MCSCRIPT_SOURCE", "")) + user_config.launch_home = utils.expand_path(os.environ.get("MCSCRIPT_LAUNCH_HOME")) + user_config.python_executable = utils.expand_path(os.environ.get("MCSCRIPT_PYTHON")) + user_config.env_script = 
utils.expand_path(os.environ.get("MCSCRIPT_SOURCE")) user_config.run_prefix = "run" return user_config diff --git a/mcscript/utils.py b/mcscript/utils.py index 88b41e2..f6db6b5 100644 --- a/mcscript/utils.py +++ b/mcscript/utils.py @@ -51,6 +51,7 @@ + Add diagnostic output to topological_sort(). 06/29/22 (pjf): Generalize search_in_subdirectories for multiple filenames. 12/15/22 (pjf): Add get_directory_size(). + 01/19/24 (pjf): Make expand_path() handle None gracefully. """ import collections @@ -333,7 +334,7 @@ def search_in_subdirectories( >>> subdirectory_list = ["subdirectory_1", "subdirectory_2"], >>> filename = "the_file_we_are_trying_to_find.txt" >>> search_in_subdirectories(base_path_list, subdirectory_list, filename) - + "/base_path_2/subdirectory_1/the_file_we_are_trying_to_find.txt" Arguments: @@ -418,12 +419,16 @@ def expand_path(path_or_list): This is a wrapper to various os.path functions, which expand inline variables and ~, and normalize nestings of separators. + Arguments which are `None` will return `None`. + Arguments: path_or_list: (str or list of str) path (or list of paths) as string(s) Returns: (str or list of str): expanded and normalized path(s) """ - if isinstance(path_or_list, (str, bytes, os.PathLike)): + if path_or_list is None: + return None + elif isinstance(path_or_list, (str, bytes, os.PathLike)): expanded_path = os.path.expanduser(os.path.expandvars(path_or_list)) norm_path = os.path.normpath(expanded_path) return norm_path From e7715950cb3be0a4f628720c8c8b825392affa75 Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Fri, 19 Jan 2024 21:13:29 -0600 Subject: [PATCH 21/26] qsubm: Add quiet mode --- mcscript/qsubm.py | 57 +++++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/mcscript/qsubm.py b/mcscript/qsubm.py index b9d9f89..2d2828c 100644 --- a/mcscript/qsubm.py +++ b/mcscript/qsubm.py @@ -81,7 +81,9 @@ and MCSCRIPT_SUBMISSION_INVOCATION. 
+ 11/10/23 (pjf): Populate selected values in parameters.run for use by config.submission(). - + 01/19/24 (pjf): Ensure `MCSCRIPT_PYTHON` is always set. + + 01/19/24 (pjf): + - Ensure `MCSCRIPT_PYTHON` is always set. + - Add quiet mode. """ import argparse @@ -154,6 +156,7 @@ def parse_args(): parser.add_argument("--workers", type=int, default=1, help="Number of workers to launch per job (not supported by all queues)") parser.add_argument("--opt", action="append", help="Additional option arguments to be passed to job submission command (e.g., --opt=\"-m ae\" or --opt=\"--mail-type=END,FAIL\"), may be repeated (e.g., --opt=\"-A acct\" --opt=\"-a 1200\"); beware the spaces may be important to the job submission command") parser.add_argument("--expert", action="store_true", help="Run mcscript in expert mode") + parser.add_argument("-q", "--quiet", action="store_true", help="Suppress qsubm output text") # serial run parallelization parameters serial_group = parser.add_argument_group("serial run options (single-node, non-MPI)") @@ -235,9 +238,14 @@ def main(): # argument processing ################################################################ + # quiet modes + if args.edit: + args.quiet = True + # set run name run = user_config.run_prefix + args.run - print("Run:", run) + if not args.quiet: + print("Run:", run) # ...and process run file script_extensions = [".py", ".csh"] @@ -249,11 +257,13 @@ def main(): job_file = filename job_extension = extension break - print(" Run homes:", user_config.run_home_list) # useful to report now, in case job file missing + if not args.quiet: + print(" Run homes:", user_config.run_home_list) # useful to report now, in case job file missing if (job_file is None): print(f"No job file {run}.* found with an extension in the set {script_extensions}.") exit(1) - print(" Job file:", job_file) + if not args.quiet: + print(" Job file:", job_file) # set queue and flag batch or local mode # force local run for task.py toc mode @@ -261,11 +271,13 
@@ def main(): run_mode = "local" else: run_mode = "batch" - print(" Mode: {:s} (Queue: {:s})".format(run_mode,str(args.queue))) + if not args.quiet: + print(" Mode: {:s} (Queue: {:s})".format(run_mode,str(args.queue))) # set wall time wall_time_min = args.wall - print(" Wall time (min): {:d}".format(wall_time_min)) + if not args.quiet: + print(" Wall time (min): {:d}".format(wall_time_min)) wall_time_sec = wall_time_min*60 # environment definitions: general run parameters @@ -347,7 +359,8 @@ def main(): user_environment_definitions = [] else: user_environment_definitions = args.vars.split(",") - print(" User environment definitions:", user_environment_definitions) + if not args.quiet: + print(" User environment definitions:", user_environment_definitions) environment_definitions += user_environment_definitions @@ -397,7 +410,8 @@ def main(): if args.pool is not None: job_name += f"-{args.pool:s}" job_name += f"-{args.phase:d}" - print(" Job name:", job_name) + if not args.quiet: + print(" Job name:", job_name) # process environment definitions # regularize environment definitions @@ -410,8 +424,9 @@ def main(): for i in range(len(environment_definitions)): if (not "=" in environment_definitions[i]): environment_definitions[i] += "=" - print() - print("Vars:", ",".join(environment_definitions)) + if not args.quiet: + print() + print("Vars:", ",".join(environment_definitions)) # for local run job_environ=os.environ environment_keyvalues = [ @@ -426,8 +441,9 @@ def main(): ################################################################ # flush script output before invoking job - print() - sys.stdout.flush() + if not args.quiet: + print() + sys.stdout.flush() # quiet environment definitions: diagnostic os.environ["MCSCRIPT_QSUBM_INVOCATION"] = "{}".format(sys.argv) @@ -441,10 +457,11 @@ def main(): os.environ["MCSCRIPT_SUBMISSION_INVOCATION"] = "{}".format(submission_args) # notes: options must come before command on some platforms (e.g., Univa) - print(" 
".join(submission_args)) - print(submission_input_string) - print() - print("-"*64) + if not args.quiet: + print(" ".join(submission_args)) + print(submission_input_string) + print() + print("-"*64) for i in range(repetitions): subprocess.run( submission_args, @@ -466,10 +483,12 @@ def main(): popen_args = [user_config.python_executable, job_file] elif (job_extension == ".csh"): popen_args = ["csh", job_file] - print() - print("-"*64) + if not args.quiet: + print() + print("-"*64) if task_mode is task.TaskMode.kRun: - print(f"\033]2;qsubm {run}\007") + if not args.quiet: + print(f"\033]2;qsubm {run}\007") os.chdir(launch_dir) os.environ["MCSCRIPT_SUBMISSION_INVOCATION"] = "{}".format(popen_args) os.execvpe(popen_args[0], popen_args, env=job_environ) From 65e57ea2ff4fe17c8321cbe78c5a6dab34dee4c7 Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Fri, 19 Jan 2024 21:14:36 -0600 Subject: [PATCH 22/26] qsubm: Cosmetic improvements to argument handling --- mcscript/qsubm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mcscript/qsubm.py b/mcscript/qsubm.py index 2d2828c..e6c706d 100644 --- a/mcscript/qsubm.py +++ b/mcscript/qsubm.py @@ -84,6 +84,7 @@ + 01/19/24 (pjf): - Ensure `MCSCRIPT_PYTHON` is always set. - Add quiet mode. + - Cosmetic improvements to argument handling. 
""" import argparse @@ -175,7 +176,7 @@ def parse_args(): ##hybrid_group.add_argument("--undersubscription", type=int, default=1, help="undersubscription factor (e.g., spread=2 requests twice the cores needed)") # multi-task interface: invocation modes - task_mode_group = parser.add_mutually_exclusive_group() + task_mode_group = parser.add_argument_group("run modes").add_mutually_exclusive_group() task_mode_group.add_argument("--edit", action="store_true", help="Edit run script using EDITOR") task_mode_group.add_argument("--toc", action="store_true", help="Invoke run script to generate task table of contents") task_mode_group.add_argument("--unlock", action="store_true", help="Delete any .lock or .fail flags for tasks") @@ -189,7 +190,7 @@ def parse_args(): task_selection_group.add_argument("--phase", type=int, default=0, help="Set task phase for task selection") task_selection_group.add_argument("--start", type=int, default=0, help="Set starting task number for task selection") task_selection_group.add_argument("--limit", type=int, help="Set task count limit for task selection") - task_selection_group.add_argument("--redirect", default="True", choices=["True", "False"], help="Allow redirection of standard" + task_selection_group.add_argument("--redirect", default=True, action=argparse.BooleanOptionalAction, help="Redirect standard" " output/error to file (may want to disable for interactive debugging)") # site-local options From de42568ce7af4390c7d4f8a5b691fc8ff5d766ae Mon Sep 17 00:00:00 2001 From: Mark Caprio Date: Fri, 19 Jan 2024 19:47:11 -0800 Subject: [PATCH 23/26] INSTALL: Add warning about use of pip install --editable flag --- INSTALL.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 6f18c0e..367cad1 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -48,8 +48,10 @@ Department of Physics, University of Notre Dame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Note that the `.` here means to 
install the Python package defined by the code - in the current directory. If you are actively developing `mcscript` itself, - you may want to pass the `--editable` flag to `pip`. + in the current directory. If you are actively developing `mcscript` itself, + you may want to pass the `--editable` flag to `pip`. However, beware that + this may result in a fragile installation, e.g., the wrong version of `qsubm` + may be executed if you upgrade. a. Subsequently updating source: From 4e33a0a501b7ef0d44f16f9a1dd22a6b196c53eb Mon Sep 17 00:00:00 2001 From: Patrick Fasano Date: Fri, 19 Jan 2024 22:10:55 -0600 Subject: [PATCH 24/26] qsubm: Fix redirect flag formatting --- mcscript/qsubm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcscript/qsubm.py b/mcscript/qsubm.py index e6c706d..05c7cff 100644 --- a/mcscript/qsubm.py +++ b/mcscript/qsubm.py @@ -329,7 +329,7 @@ def main(): f"MCSCRIPT_TASK_MODE={task_mode.value:d}", f"MCSCRIPT_TASK_PHASE={args.phase:d}", f"MCSCRIPT_TASK_START_INDEX={args.start:d}", - f"MCSCRIPT_TASK_REDIRECT={args.redirect:s}", + f"MCSCRIPT_TASK_REDIRECT={args.redirect}", ] # TODO (mac): neaten up so that these arguments are always provided # (and simplify this code to a simple list += as above) From 12af9882ef3773088289c5986a5808a31ffb6ea6 Mon Sep 17 00:00:00 2001 From: "Mark A. Caprio" Date: Wed, 6 Mar 2024 17:37:24 -0500 Subject: [PATCH 25/26] config/slurm_nersc: Add pass-through environment variable MCSCRIPT_NODE_TYPE from submission to runtime --- mcscript/config/slurm_nersc.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mcscript/config/slurm_nersc.py b/mcscript/config/slurm_nersc.py index 8105e2d..90efb40 100644 --- a/mcscript/config/slurm_nersc.py +++ b/mcscript/config/slurm_nersc.py @@ -60,6 +60,7 @@ + 11/10/23 (pjf): - Migrate from pkg_resources to importlib_resources. - Copy wrapper script to launch_dir to ensure existence. 
+ + 03/06/24 (mac): Make node type available at runtime through MCSCRIPT_NODE_TYPE. """ import datetime @@ -372,6 +373,9 @@ def submission(job_name,job_file,environment_definitions,args): else: raise err + # cluster-specific environment variables (to pass through to runtime script) + os.environ["MCSCRIPT_NODE_TYPE"] = node_type + # start accumulating command line submission_invocation = [ "sbatch" ] @@ -402,7 +406,7 @@ def submission(job_name,job_file,environment_definitions,args): submission_invocation += ["--core-spec={}".format(node_cores-(domain_cores*node_domains))] # gpu options - if node_type == "gpu": + if node_type in {"gpu", "gpu-hbm80g"}: # assumes typical configuration of single GPU per MPI rank # https://docs.nersc.gov/jobs/affinity/#perlmutter submission_invocation += ["--gpus-per-task=1"] @@ -650,8 +654,8 @@ def hybrid_invocation(base): ] # executable wrapper for GPU affinity - gpu_enabled = os.environ.get("MPICH_GPU_SUPPORT_ENABLED")=="1" - if gpu_enabled: + node_type = os.environ["MCSCRIPT_NODE_TYPE"] + if node_type in {"gpu", "gpu-hbm80g"}: ##executable_wrapper_path = pkg_resources.resource_filename( ## "mcscript", "job_wrappers/nersc_select_gpu_device.sh" ##) From 056cec3155a0d23ccc82667548eb154dc512c688 Mon Sep 17 00:00:00 2001 From: "Mark A. Caprio" Date: Thu, 13 Jun 2024 17:19:50 -0400 Subject: [PATCH 26/26] Update version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6eb4e22..ca0e7b1 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="mcscript", - version="1.0.0", + version="2.0.0", author="Mark A. Caprio, Patrick J. Fasano, University of Notre Dame", description=("Scripting setup, utilities, and task control for cluster runs"), license="MIT",