From d91ede1be0ffbe0d3f8946a50b4fb97ea482a0ea Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 21 Jun 2024 13:57:28 +0200 Subject: [PATCH 01/49] Add initial virtual cluster tests --- Makefile | 7 + integration_test/cluster_test/component.py | 39 ++++ integration_test/cluster_test/dispatch.sh | 12 + integration_test/cluster_test/dispatch.ymmsl | 15 ++ .../cluster_test/implementations.ymmsl | 8 + integration_test/cluster_test/multiple.sh | 12 + integration_test/cluster_test/multiple.ymmsl | 32 +++ integration_test/cluster_test/settings.ymmsl | 5 + integration_test/cluster_test/single.sh | 12 + integration_test/cluster_test/single.ymmsl | 10 + integration_test/conftest.py | 4 + integration_test/test_cluster.Dockerfile | 9 + integration_test/test_cluster.py | 207 ++++++++++++++++++ tox.ini | 21 +- 14 files changed, 392 insertions(+), 1 deletion(-) create mode 100644 integration_test/cluster_test/component.py create mode 100755 integration_test/cluster_test/dispatch.sh create mode 100644 integration_test/cluster_test/dispatch.ymmsl create mode 100644 integration_test/cluster_test/implementations.ymmsl create mode 100755 integration_test/cluster_test/multiple.sh create mode 100644 integration_test/cluster_test/multiple.ymmsl create mode 100644 integration_test/cluster_test/settings.ymmsl create mode 100755 integration_test/cluster_test/single.sh create mode 100644 integration_test/cluster_test/single.ymmsl create mode 100644 integration_test/test_cluster.Dockerfile create mode 100644 integration_test/test_cluster.py diff --git a/Makefile b/Makefile index c54cc9b6..91045897 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,9 @@ endif .PHONY: test test: test_python test_scripts test_cpp test_fortran +.PHONY: test_all +test_all: test test_cluster + .PHONY: test_python_only test_python_only: MUSCLE_TEST_PYTHON_ONLY=1 tox @@ -37,6 +40,10 @@ test_cpp: cpp test_fortran: fortran_tests cd libmuscle/fortran && $(MAKE) test +.PHONY: test_cluster +test_cluster: + tox -e cluster + .PHONY: test_scripts test_scripts: cd scripts && $(MAKE) test diff --git a/integration_test/cluster_test/component.py b/integration_test/cluster_test/component.py new file mode 100644 index 00000000..e14d0523 --- /dev/null +++ b/integration_test/cluster_test/component.py @@ -0,0 +1,39 @@ +import logging + +from libmuscle import Instance, Message +from ymmsl import Operator + + +def component() -> None: + """A simple dummy component. + + This sends and receives on all operators, allowing different coupling patterns + with a single program. 
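+
+    For example, single.ymmsl runs one instance of it on its own,
+    dispatch.ymmsl chains two instances via final_out -> init_in, and
+    multiple.ymmsl couples six instances in a ring via inter_out ->
+    inter_in, all with this same code.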
+ """ + instance = Instance({ + Operator.F_INIT: ['init_in'], + Operator.O_I: ['inter_out'], + Operator.S: ['inter_in'], + Operator.O_F: ['final_out']}) + + while instance.reuse_instance(): + # F_INIT + steps = instance.get_setting('steps', 'int') + + instance.receive('init_in', default=Message(0.0)) + + for step in range(steps): + # O_I + instance.send('inter_out', Message(step)) + + # S + instance.receive('inter_in', default=Message(0.0)) + + # O_F + instance.send('final_out', Message(steps)) + + +if __name__ == '__main__': + logging.basicConfig() + logging.getLogger().setLevel(logging.INFO) + component() diff --git a/integration_test/cluster_test/dispatch.sh b/integration_test/cluster_test/dispatch.sh new file mode 100755 index 00000000..10fb1fb9 --- /dev/null +++ b/integration_test/cluster_test/dispatch.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +#SBATCH --time=0:1:00 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=2 + +set -e + +source /home/cerulean/venv/bin/activate + +muscle_manager --log-level=DEBUG --start-all /home/cerulean/cluster_test/dispatch.ymmsl /home/cerulean/cluster_test/settings.ymmsl /home/cerulean/cluster_test/implementations.ymmsl + diff --git a/integration_test/cluster_test/dispatch.ymmsl b/integration_test/cluster_test/dispatch.ymmsl new file mode 100644 index 00000000..a786e2a9 --- /dev/null +++ b/integration_test/cluster_test/dispatch.ymmsl @@ -0,0 +1,15 @@ +ymmsl_version: v0.1 + +model: + name: dispatch + components: + c1: component + c2: component + conduits: + c1.final_out: c2.init_in + +resources: + c1: + threads: 1 + c2: + threads: 1 diff --git a/integration_test/cluster_test/implementations.ymmsl b/integration_test/cluster_test/implementations.ymmsl new file mode 100644 index 00000000..04737a2f --- /dev/null +++ b/integration_test/cluster_test/implementations.ymmsl @@ -0,0 +1,8 @@ +ymmsl_version: v0.1 + +implementations: + component: + virtual_env: /home/cerulean/venv + executable: python + args: + - /home/cerulean/cluster_test/component.py diff --git a/integration_test/cluster_test/multiple.sh b/integration_test/cluster_test/multiple.sh new file mode 100755 index 00000000..a5122dd2 --- /dev/null +++ b/integration_test/cluster_test/multiple.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +#SBATCH --time=0:1:00 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=1 + +set -e + +source /home/cerulean/venv/bin/activate + +muscle_manager --log-level=DEBUG --start-all /home/cerulean/cluster_test/multiple.ymmsl /home/cerulean/cluster_test/settings.ymmsl /home/cerulean/cluster_test/implementations.ymmsl + diff --git a/integration_test/cluster_test/multiple.ymmsl b/integration_test/cluster_test/multiple.ymmsl new file mode 100644 index 00000000..60260aad --- /dev/null +++ b/integration_test/cluster_test/multiple.ymmsl @@ -0,0 +1,32 @@ +ymmsl_version: v0.1 + +model: + name: multiple + components: + c1: component + c2: component + c3: component + c4: component + c5: component + c6: component + conduits: + c1.inter_out: c2.inter_in + c2.inter_out: c3.inter_in + c3.inter_out: c4.inter_in + c4.inter_out: c5.inter_in + c5.inter_out: c6.inter_in + c6.inter_out: c1.inter_in + +resources: + c1: + threads: 1 + c2: + threads: 1 + c3: + threads: 1 + c4: + threads: 1 + c5: + threads: 1 + c6: + threads: 1 diff --git a/integration_test/cluster_test/settings.ymmsl b/integration_test/cluster_test/settings.ymmsl new file mode 100644 index 00000000..be4fb16f --- /dev/null +++ b/integration_test/cluster_test/settings.ymmsl @@ -0,0 +1,5 @@ +ymmsl_version: v0.1 + +settings: + muscle_remote_log_level: DEBUG + steps: 10 
diff --git a/integration_test/cluster_test/single.sh b/integration_test/cluster_test/single.sh new file mode 100755 index 00000000..8197854e --- /dev/null +++ b/integration_test/cluster_test/single.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +#SBATCH --time=0:1:00 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=1 + +set -e + +source /home/cerulean/venv/bin/activate + +muscle_manager --log-level=DEBUG --start-all /home/cerulean/cluster_test/single.ymmsl /home/cerulean/cluster_test/settings.ymmsl /home/cerulean/cluster_test/implementations.ymmsl + diff --git a/integration_test/cluster_test/single.ymmsl b/integration_test/cluster_test/single.ymmsl new file mode 100644 index 00000000..304579fc --- /dev/null +++ b/integration_test/cluster_test/single.ymmsl @@ -0,0 +1,10 @@ +ymmsl_version: v0.1 + +model: + name: single + components: + c1: component + +resources: + c1: + threads: 1 diff --git a/integration_test/conftest.py b/integration_test/conftest.py index 18ab5ce4..78ac48e5 100644 --- a/integration_test/conftest.py +++ b/integration_test/conftest.py @@ -28,6 +28,10 @@ 'MUSCLE_ENABLE_CPP_MPI' not in os.environ, reason='MPI support was not detected') +skip_unless_cluster = pytest.mark.skipif( + 'MUSCLE_TEST_CLUSTER' not in os.environ, + reason='Cluster tests were not explicitly enabled') + @pytest.fixture def yatiml_log_warning(): diff --git a/integration_test/test_cluster.Dockerfile b/integration_test/test_cluster.Dockerfile new file mode 100644 index 00000000..5e2cf213 --- /dev/null +++ b/integration_test/test_cluster.Dockerfile @@ -0,0 +1,9 @@ +FROM ghcr.io/naturalhpc/cerulean-test-docker-images/cerulean-fake-slurm-23-11:latest + +RUN apt-get update && \ + apt-get install -y python3-venv libopenmpi-dev + +RUN apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /home/cerulean + diff --git a/integration_test/test_cluster.py b/integration_test/test_cluster.py new file mode 100644 index 00000000..7cf06112 --- /dev/null +++ b/integration_test/test_cluster.py @@ -0,0 +1,207 @@ +# This ensures that pytest can import this module in the non-cluster test env +# in which these dependencies don't exist, because these tests won' be run. 
+try: + import cerulean +except ImportError: + pass + +import logging +from pathlib import Path +import pytest +import time + +from .conftest import skip_unless_cluster + + +logger = logging.getLogger(__name__) + + +def _run(term, timeout, command): + exit_code, out, err = term.run(timeout, command, []) + if exit_code != 0: + logger.error(err) + assert exit_code == 0 + return out + + +@pytest.fixture(scope='session') +def local_term(): + return cerulean.LocalTerminal() + + +@pytest.fixture(scope='session') +def local_fs(): + return cerulean.LocalFileSystem() + + +@pytest.fixture(scope='session') +def virtual_cluster_image(local_term): + IMAGE_NAME = 'muscle3_test_cluster' + _run(local_term, 180, ( + f'docker buildx build -t {IMAGE_NAME}' + ' -f integration_test/test_cluster.Dockerfile .')) + return IMAGE_NAME + + +def _ssh_term(timeout_msg): + cred = cerulean.PasswordCredential('cerulean', 'kingfisher') + ready = False + start = time.monotonic() + while not ready: + if (time.monotonic() - start) > 60.0: + raise Exception(timeout_msg) + + try: + term = cerulean.SshTerminal('localhost', 10022, cred) + ready = True + except Exception: + time.sleep(3.0) + + return term + + +@pytest.fixture(scope='session') +def virtual_cluster_container(local_term, virtual_cluster_image): + # clean up stray container from previous run, if any + _run(local_term, 60, 'docker rm -f muscle3_test_slurm') + + _run(local_term, 60, ( + 'docker run -d --name muscle3_test_slurm -p 10022:22' + f' {virtual_cluster_image}')) + + _ssh_term('Virtual cluster container start timed out') + yield None + + # _run(local_term, 60, 'docker rm -f muscle3_test_slurm') + + +@pytest.fixture(scope='session') +def setup_connection(virtual_cluster_container): + # Session-wide connection used for container setup actions only + # Tests each have their own connection, see virtual_cluster() below + term = _ssh_term('Connection to virtual cluster container timed out') + with cerulean.SftpFileSystem(term, True) as fs: + yield term, fs + + +@pytest.fixture(scope='session') +def repo_root(local_fs): + root_dir = Path(__file__).parents[1] + return local_fs / str(root_dir) + + +@pytest.fixture(scope='session') +def muscle3_venv(repo_root, setup_connection): + remote_term, remote_fs = setup_connection + + _run(remote_term, 10, 'python3 -m venv /home/cerulean/venv') + in_venv = 'source /home/cerulean/venv/bin/activate && ' + _run(remote_term, 30, ( + f'/bin/bash -c "{in_venv} python3 -m pip install pip wheel setuptools"')) + + muscle3_tgt = remote_fs / 'home/cerulean/muscle3' + muscle3_tgt.mkdir() + (muscle3_tgt / 'libmuscle').mkdir() + + for f in ( + 'muscle3', 'libmuscle/python', 'setup.py', 'MANIFEST.in', 'LICENSE', + 'NOTICE', 'VERSION', 'README.rst'): + cerulean.copy(repo_root / f, muscle3_tgt / f) + + _run(remote_term, 60, f'/bin/bash -c "{in_venv} pip install ./muscle3"') + return in_venv + + +@pytest.fixture(scope='session') +def create_remote_test_files(repo_root, setup_connection): + remote_term, remote_fs = setup_connection + + remote_home = remote_fs / 'home' / 'cerulean' + + cerulean.copy( + repo_root / 'integration_test' / 'cluster_test', remote_home, + copy_permissions=True) + + +@pytest.fixture +def virtual_cluster(virtual_cluster_container, muscle3_venv, create_remote_test_files): + term = _ssh_term('Connection to vitrual cluster container timed out') + with cerulean.SftpFileSystem(term, True) as fs: + sched = cerulean.SlurmScheduler(term) + yield term, fs, sched + + +@pytest.fixture +def remote_home(virtual_cluster): + _, remote_fs, 
_ = virtual_cluster + return remote_fs / 'home' / 'cerulean' + + +@pytest.fixture +def remote_test_files(remote_home): + return remote_home / 'cluster_test' + + +@pytest.fixture +def remote_out_dir(remote_home): + return remote_home / 'test_results' + + +def _make_job(name, remote_test_files, remote_out_dir): + job_dir = remote_out_dir / f'test_{name}' + + job = cerulean.JobDescription() + job.name = name + job.working_directory = job_dir + job.command = remote_test_files / f'{name}.sh' + job.stdout_file = job_dir / 'stdout.txt' + job.stderr_file = job_dir / 'stderr.txt' + job.queue_name = 'debug' + job.time_reserved = 60 + job.system_out_file = job_dir / 'sysout.txt' + job.system_err_file = job_dir / 'syserr.txt' + + return job + + +_SCHED_OVERHEAD = 60 + + +@skip_unless_cluster +def test_single(virtual_cluster, remote_test_files, remote_out_dir): + remote_term, remote_fs, sched = virtual_cluster + + job = _make_job('single', remote_test_files, remote_out_dir) + job.num_nodes = 1 + job.mpi_processes_per_node = 1 + job.extra_scheduler_options = '--ntasks-per-core=1' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + +@skip_unless_cluster +def test_dispatch(virtual_cluster, remote_test_files, remote_out_dir): + remote_term, remote_fs, sched = virtual_cluster + + job = _make_job('dispatch', remote_test_files, remote_out_dir) + job.num_nodes = 2 + job.mpi_processes_per_node = 1 + job.extra_scheduler_options = '--ntasks-per-core=1' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + +@skip_unless_cluster +def test_multiple(virtual_cluster, remote_test_files, remote_out_dir): + remote_term, remote_fs, sched = virtual_cluster + + job = _make_job('multiple', remote_test_files, remote_out_dir) + job.num_nodes = 3 + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 diff --git a/tox.ini b/tox.ini index e9d89a3e..548c5a64 100644 --- a/tox.ini +++ b/tox.ini @@ -4,10 +4,11 @@ skip_missing_interpreters = true [testenv] deps = - mypy flake8 + mypy pytest pytest-cov + requests # missing dependency in cerulean... types-psutil ymmsl @@ -22,6 +23,24 @@ commands = pytest {posargs} flake8 libmuscle/python/libmuscle integration_test scripts/ +[testenv:cluster] +deps = + cerulean + docker + pytest + pytest-cov + requests # missing dependency in cerulean... 
+ types-psutil + ymmsl + +setenv = + MUSCLE_TEST_CLUSTER=1 + +commands = + pytest -k 'test_cluster' {posargs} + # pytest --log-cli-level=DEBUG -s -k 'test_cluster' {posargs} + + [gh-actions] python = 3.7: py37 From c7f0329a1c2677a342fb87a605f46d7e831497f3 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 17 Jul 2024 18:05:05 +0200 Subject: [PATCH 02/49] Refactor QCGPJInstantiator --- .../libmuscle/manager/instance_manager.py | 6 +-- .../python/libmuscle/manager/instantiator.py | 43 +++++++++++++++++- .../libmuscle/manager/qcgpj_instantiator.py | 44 +++---------------- 3 files changed, 51 insertions(+), 42 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py index 8d06c45e..4241b17e 100644 --- a/libmuscle/python/libmuscle/manager/instance_manager.py +++ b/libmuscle/python/libmuscle/manager/instance_manager.py @@ -11,9 +11,9 @@ from libmuscle.manager.instance_registry import InstanceRegistry from libmuscle.manager.instantiator import ( CancelAllRequest, CrashedResult, InstantiatorRequest, - InstantiationRequest, ProcessStatus, ShutdownRequest) + InstantiationRequest, Process, ProcessStatus, ShutdownRequest) from libmuscle.manager.logger import last_lines -from libmuscle.manager.qcgpj_instantiator import Process, QCGPJInstantiator +from libmuscle.manager.qcgpj_instantiator import QCGPJInstantiator from libmuscle.manager.run_dir import RunDir from libmuscle.planner.planner import Planner, Resources @@ -61,7 +61,7 @@ class InstanceManager: def __init__( self, configuration: Configuration, run_dir: RunDir, instance_registry: InstanceRegistry) -> None: - """Create a ProcessManager. + """Create an InstanceManager. Args: configuration: The global configuration diff --git a/libmuscle/python/libmuscle/manager/instantiator.py b/libmuscle/python/libmuscle/manager/instantiator.py index 9afca712..41fa5124 100644 --- a/libmuscle/python/libmuscle/manager/instantiator.py +++ b/libmuscle/python/libmuscle/manager/instantiator.py @@ -1,9 +1,10 @@ import enum import logging import multiprocessing as mp +import os from pathlib import Path import traceback -from typing import Optional +from typing import Dict, Optional from ymmsl import Implementation, Reference, ResourceRequirements @@ -133,3 +134,43 @@ def emit(self, record: logging.LogRecord) -> None: record.exc_info = None self._queue.put(record) + + +def reconfigure_logging(queue: mp.Queue) -> None: + """Reconfigure logging to send to queue. + + This reconfigures the logging subsystem to intercept all log + messages and send them to the given queue, rather than to the + previously configured handler. + """ + root_logger = logging.getLogger() + for h in list(root_logger.handlers): + root_logger.removeHandler(h) + + handler = QueueingLogHandler(queue) + root_logger.addHandler(handler) + + +def create_instance_env( + instance: Reference, overlay: Dict[str, str]) -> Dict[str, str]: + """Creates an environment for an instance. + + This takes the current (manager) environment variables and makes + a copy, then adds or extends it according to the overlay given. + + Keys from overlay that start with will have the corresponding + value appended to the matching (by key, without the +) value in + env, otherwise the value in env gets overwritten. 
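+
+    For example (illustrative), if the manager environment has
+    PATH=/usr/bin, then an overlay of {'+PATH': ':/opt/bin', 'FOO': 'bar'}
+    yields an environment with PATH=/usr/bin:/opt/bin, FOO=bar, and
+    MUSCLE_INSTANCE set to the instance name.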
+ """ + env = os.environ.copy() + env['MUSCLE_INSTANCE'] = str(instance) + + for key, value in overlay.items(): + if key.startswith('+'): + if key[1:] in env: + env[key[1:]] += value + else: + env[key[1:]] = value + else: + env[key] = value + return env diff --git a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py index 9b5836d4..ae58089b 100644 --- a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py +++ b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py @@ -26,8 +26,8 @@ from ymmsl import ExecutionModel, MPICoresResReq, Reference, ThreadedResReq from libmuscle.manager.instantiator import ( - CancelAllRequest, CrashedResult, InstantiationRequest, Process, - ProcessStatus, QueueingLogHandler, ShutdownRequest) + CancelAllRequest, CrashedResult, create_instance_env, InstantiationRequest, + Process, ProcessStatus, reconfigure_logging, ShutdownRequest) from libmuscle.planner.planner import Resources @@ -95,7 +95,7 @@ class QCGPJInstantiator(mp.Process): def __init__( self, resources: mp.Queue, requests: mp.Queue, results: mp.Queue, log_records: mp.Queue, run_dir: Path) -> None: - """Create a QCGPJProcessManager. + """Create a QCGPJInstantiator. Args: resources: Queue for returning the available resources @@ -103,7 +103,7 @@ def __init__( results: Queue to communicate finished processes over log_messages: Queue to push log messages to """ - super().__init__(name='QCGPJProcessManager') + super().__init__(name='QCGPJInstantiator') self._resources_out = resources self._requests_in = requests self._results_out = results @@ -120,7 +120,7 @@ def run(self) -> None: qcgpj_dir.mkdir(exist_ok=True) os.chdir(qcgpj_dir) - self._reconfigure_logging() + reconfigure_logging(self._log_records_out) # Executor needs to be instantiated before we go async qcg_config: Dict[str, str] = {qcg_Config.AUX_DIR: str(qcgpj_dir)} @@ -196,15 +196,6 @@ async def _main(self) -> None: _logger.debug('Stopping executor') await self._executor.stop() - def _reconfigure_logging(self) -> None: - """Reconfigure logging to send to log_records_out.""" - root_logger = logging.getLogger() - for h in list(root_logger.handlers): - root_logger.removeHandler(h) - - handler = QueueingLogHandler(self._log_records_out) - root_logger.addHandler(handler) - def _send_resources(self) -> None: """Converts and sends QCG available resources.""" resources = Resources() @@ -247,7 +238,7 @@ def _create_job( """Creates a QCG allocation and job for a request.""" total_cores = sum(map(len, request.resources.cores.values())) - env = self._create_env(request.instance, request.implementation.env) + env = create_instance_env(request.instance, request.implementation.env) if request.implementation.script: execution = self._qcg_job_execution_with_script(request, env) @@ -272,29 +263,6 @@ def _create_job( qcg_iteration = qcg_SchedulingIteration(sjob, None, None, resources, []) return qcg_allocation, qcg_iteration - def _create_env( - self, instance: Reference, overlay: Dict[str, str] - ) -> Dict[str, str]: - """Updates the environment with the implementation's env. - - This updates env in-place. Keys from overlay that start with - + will have the corresponding value appended to the matching - (by key, without the +) value in env, otherwise the value in - env gets overwritten. 
- """ - env = os.environ.copy() - env['MUSCLE_INSTANCE'] = str(instance) - - for key, value in overlay.items(): - if key.startswith('+'): - if key[1:] in env: - env[key[1:]] += value - else: - env[key[1:]] = value - else: - env[key] = value - return env - def _qcg_job_execution_with_script( self, request: InstantiationRequest, env: Dict[str, str] ) -> qcg_JobExecution: From 5cbe41799759ba999e649ae99761acd5b056d674 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 20 Sep 2024 19:07:42 +0200 Subject: [PATCH 03/49] Improve docstsrings in Instantiator --- libmuscle/python/libmuscle/manager/instantiator.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/instantiator.py b/libmuscle/python/libmuscle/manager/instantiator.py index 41fa5124..db83a52d 100644 --- a/libmuscle/python/libmuscle/manager/instantiator.py +++ b/libmuscle/python/libmuscle/manager/instantiator.py @@ -72,7 +72,12 @@ class InstantiationRequest(InstantiatorRequest): Attributes: instance: The name of the instance implementation: The implementation to start for it - resources: The resources to start it on + res_req: The resource requirements for this instance + resources: The specific resources to start it on + instance_dir: The main directory for this instance + work_dir: The directory in which to start it + stdout_path: Path of file to redirect stdout to + stderr_path: Path of file to redirect stderr to """ def __init__( self, instance: Reference, implementation: Implementation, @@ -85,7 +90,7 @@ def __init__( instance: The name of the instance implementation: The implementation to start for it res_req: The resource requirements for this instance - resources: The resources to instantiate on + resources: The specific resources to instantiate on instance_dir: The main directory for this instance work_dir: The directory in which to start it stdout_path: Path of file to redirect stdout to @@ -158,7 +163,7 @@ def create_instance_env( This takes the current (manager) environment variables and makes a copy, then adds or extends it according to the overlay given. - Keys from overlay that start with will have the corresponding + Keys from overlay that start with + will have the corresponding value appended to the matching (by key, without the +) value in env, otherwise the value in env gets overwritten. 
""" From 9d54910842e69997fb40a5159d13f85438204f9f Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 20 Sep 2024 20:05:00 +0200 Subject: [PATCH 04/49] Add ports and implementations to cluster test ymmsl files --- integration_test/cluster_test/dispatch.ymmsl | 13 +++++-- integration_test/cluster_test/multiple.ymmsl | 37 ++++++++++++++++---- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/integration_test/cluster_test/dispatch.ymmsl b/integration_test/cluster_test/dispatch.ymmsl index a786e2a9..d8b5a715 100644 --- a/integration_test/cluster_test/dispatch.ymmsl +++ b/integration_test/cluster_test/dispatch.ymmsl @@ -3,8 +3,17 @@ ymmsl_version: v0.1 model: name: dispatch components: - c1: component - c2: component + c1: + ports: + f_init: init_in + o_f: final_out + implementation: component_python + c2: + ports: + f_init: init_in + o_f: final_out + implementation: component_python + conduits: c1.final_out: c2.init_in diff --git a/integration_test/cluster_test/multiple.ymmsl b/integration_test/cluster_test/multiple.ymmsl index 60260aad..64cb8b42 100644 --- a/integration_test/cluster_test/multiple.ymmsl +++ b/integration_test/cluster_test/multiple.ymmsl @@ -3,12 +3,37 @@ ymmsl_version: v0.1 model: name: multiple components: - c1: component - c2: component - c3: component - c4: component - c5: component - c6: component + c1: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + c2: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + c3: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + c4: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + c5: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + c6: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + conduits: c1.inter_out: c2.inter_in c2.inter_out: c3.inter_in From 3b0ec5691c297fb4ddebc9eba30b9cefe22acafd Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 20 Sep 2024 20:18:43 +0200 Subject: [PATCH 05/49] Move fake cluster into subdirectory --- integration_test/fake_cluster/Dockerfile | 46 +++++ integration_test/fake_cluster/__init__.py | 0 integration_test/fake_cluster/slurm.conf | 163 ++++++++++++++++++ .../fake_cluster/start-services.sh | 70 ++++++++ integration_test/test_cluster.Dockerfile | 9 - 5 files changed, 279 insertions(+), 9 deletions(-) create mode 100644 integration_test/fake_cluster/Dockerfile create mode 100644 integration_test/fake_cluster/__init__.py create mode 100644 integration_test/fake_cluster/slurm.conf create mode 100644 integration_test/fake_cluster/start-services.sh delete mode 100644 integration_test/test_cluster.Dockerfile diff --git a/integration_test/fake_cluster/Dockerfile b/integration_test/fake_cluster/Dockerfile new file mode 100644 index 00000000..523b137e --- /dev/null +++ b/integration_test/fake_cluster/Dockerfile @@ -0,0 +1,46 @@ +FROM ghcr.io/naturalhpc/cerulean-test-docker-images/cerulean-fake-slurm-23-11:latest + +RUN apt-get update && \ + apt-get remove -y openmpi-bin && \ + apt-get install -y python3-venv gcc g++ gfortran git build-essential xz-utils \ + bzip2 cmake + +RUN cd /opt && \ + git clone --depth=100 --branch=releases/v0.22 https://github.com/spack/spack.git + +RUN . /opt/spack/share/spack/setup-env.sh && \ + spack config add "modules:default:enable:[tcl]" && \ + spack install lmod && \ + echo >>/etc/profile && \ + echo ". $(spack location -i lmod)/lmod/lmod/init/bash" >>/etc/profile && \ + echo ". 
/opt/spack/share/spack/setup-env.sh" >>/etc/profile + +# OpenMPI uses libmunge from munge, which needs to look for the munge unix socket +# in /run because that's where the apt-get installed munge we're actually running +# puts it. Munge doesn't have a configuration file, but it does have a compiled-in +# constant that can be set when building. So that's what we do here. +RUN bash -l -c 'spack install munge localstatedir=/' +RUN bash -l -c 'spack install openmpi+legacylaunchers+pmi schedulers=slurm' +RUN bash -l -c 'spack install mpich+slurm' +RUN bash -l -c 'spack install intel-oneapi-mpi' + +# Enable Spack when running ssh -c +RUN echo >>/etc/ssh/sshd_config && \ + echo 'SetEnv BASH_ENV=/etc/profile' >>/etc/ssh/sshd_config + +# Point workers to muscle3-headnode +COPY integration_test/fake_cluster/slurm.conf /usr/local/etc/slurm/slurm.conf + +# Replace start-up scripts so we can run nodes separately +COPY integration_test/fake_cluster/start-services.sh /etc/start-services.sh +RUN chmod +x /etc/start-services.sh + +# Disable ssh debug output +RUN sed -i -e 's/^LogLevel DEBUG3$//' /etc/ssh/sshd_config +RUN sed -i -e 's^Subsystem sftp /usr/lib/openssh/sftp-server -l DEBUG3^Subsystem sftp /usr/lib/openssh/sftp-server^' /etc/ssh/sshd_config + + +RUN apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /home/cerulean + diff --git a/integration_test/fake_cluster/__init__.py b/integration_test/fake_cluster/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/integration_test/fake_cluster/slurm.conf b/integration_test/fake_cluster/slurm.conf new file mode 100644 index 00000000..1959f614 --- /dev/null +++ b/integration_test/fake_cluster/slurm.conf @@ -0,0 +1,163 @@ +# slurm.conf file generated by configurator.html. +# Put this file on all nodes of your cluster. +# See the slurm.conf man page for more information. 
+# +ControlMachine=muscle3-headnode +#ControlAddr= +#BackupController= +#BackupAddr= +# +AuthType=auth/munge +#CheckpointType=checkpoint/none +CredType=cred/none +CryptoType=crypto/openssl +JobCredentialPrivateKey=/usr/local/etc/slurm/slurm.key +JobCredentialPublicCertificate=/usr/local/etc/slurm/slurm.cert +#DisableRootJobs=NO +#EnforcePartLimits=NO +#Epilog= +#EpilogSlurmctld= +#FirstJobId=1 +#MaxJobId=999999 +#GresTypes= +#GroupUpdateForce=0 +GroupUpdateTime=2 +#JobCheckpointDir=/var/slurm/checkpoint +#JobCredentialPrivateKey= +#JobCredentialPublicCertificate= +#JobFileAppend=0 +#JobRequeue=1 +#JobSubmitPlugins=1 +#KillOnBadExit=0 +#Licenses=foo*4,bar +# don't send any emails: +MailProg=/bin/true +#MaxJobCount=5000 +#MaxStepCount=40000 +#MaxTasksPerNode=128 +MpiDefault=none +#MpiParams=ports=#-# +#PluginDir= +#PlugStackConfig= +#PrivateData=jobs +ProctrackType=proctrack/linuxproc +#Prolog= +#PrologSlurmctld= +#PropagatePrioProcess=0 +#PropagateResourceLimits= +#PropagateResourceLimitsExcept= +ReturnToService=1 +#SallocDefaultCommand= +#SlurmctldPidFile=/var/run/slurmctld.pid +SlurmctldPort=6817 +SlurmdPidFile=/var/run/slurmd.%n.pid +SlurmdPort=6818 +SlurmdSpoolDir=/var/spool/slurmd.%n +SlurmUser=root +SlurmdUser=root +#SrunEpilog= +#SrunProlog= +StateSaveLocation=/var/spool/slurmctld/state +SwitchType=switch/none +#TaskEpilog= +TaskPlugin=task/none +#TaskPluginParam= +#TaskProlog= +#TopologyPlugin=topology/tree +#TmpFs=/tmp +#TrackWCKey=no +#TreeWidth= +#UnkillableStepProgram= +#UsePAM=0 +# +# +# TIMERS +BatchStartTimeout=2 +#CompleteWait=0 +EpilogMsgTime=1 +#GetEnvTimeout=2 +#HealthCheckInterval=0 +#HealthCheckProgram= +InactiveLimit=0 +KillWait=2 +MessageTimeout=2 +#ResvOverRun=0 +MinJobAge=2 +#OverTimeLimit=0 +SlurmctldTimeout=2 +SlurmdTimeout=2 +#UnkillableStepTimeout=60 +#VSizeFactor=0 +Waittime=0 +# +# +# SCHEDULING +#DefMemPerCPU=0 +#MaxMemPerCPU=0 +#SchedulerRootFilter=1 +SchedulerTimeSlice=5 +SchedulerType=sched/backfill +SchedulerParameters=bf_interval=1,bf_resolution=1,sched_interval=1 +SelectType=select/linear +#SelectTypeParameters= +# +# +# JOB PRIORITY +#PriorityType=priority/basic +#PriorityDecayHalfLife= +#PriorityCalcPeriod= +#PriorityFavorSmall= +#PriorityMaxAge= +#PriorityUsageResetPeriod= +#PriorityWeightAge= +#PriorityWeightFairshare= +#PriorityWeightJobSize= +#PriorityWeightPartition= +#PriorityWeightQOS= +# +# +# LOGGING AND ACCOUNTING +#AccountingStorageEnforce=0 +AccountingStorageType=accounting_storage/slurmdbd +AccountingStoragePort=6819 +AccountingStorageUser=root +AccountingStoreFlags=job_comment +ClusterName=mycluster +#DebugFlags= +#JobCompHost=localhost +#JobCompLoc=slurm_acct_db +JobCompLoc=/var/log/slurm/job_completions +JobCompType=jobcomp/filetxt +#JobCompPass=xenon-slurm-pw +#JobCompPort= +#JobCompUser=root +JobAcctGatherFrequency=2 +JobAcctGatherType=jobacct_gather/linux +SlurmctldDebug=3 +#SlurmctldLogFile= +SlurmdDebug=3 +SlurmdLogFile=/var/log/slurm/slurmd.%n.log +#SlurmSchedLogFile= +#SlurmSchedLogLevel= +# +# +# POWER SAVE SUPPORT FOR IDLE NODES (optional) +#SuspendProgram= +#ResumeProgram= +#SuspendTimeout= +#ResumeTimeout= +#ResumeRate= +#SuspendExcNodes= +#SuspendExcParts= +#SuspendRate= +#SuspendTime= +# +# +# COMPUTE NODES +NodeName=muscle3-node-0 Procs=2 NodeAddr=muscle3-node-0 Port=17001 State=UNKNOWN +NodeName=muscle3-node-1 Procs=2 NodeAddr=muscle3-node-1 Port=17002 State=UNKNOWN +NodeName=muscle3-node-2 Procs=2 NodeAddr=muscle3-node-2 Port=17003 State=UNKNOWN +NodeName=muscle3-node-3 Procs=2 NodeAddr=muscle3-node-3 Port=17004 
State=UNKNOWN +NodeName=muscle3-node-4 Procs=2 NodeAddr=muscle3-node-4 Port=17005 State=UNKNOWN +PartitionName=debug Nodes=muscle3-node-[0-4] Default=YES MaxTime=INFINITE State=UP +PartitionName=batch Nodes=muscle3-node-[0-2] Default=NO MaxTime=INFINITE State=UP diff --git a/integration_test/fake_cluster/start-services.sh b/integration_test/fake_cluster/start-services.sh new file mode 100644 index 00000000..4f131964 --- /dev/null +++ b/integration_test/fake_cluster/start-services.sh @@ -0,0 +1,70 @@ +#!/bin/bash +echo -e "\nstarting syslog-ng..." +syslog-ng + + +echo -e "\nstarting munged..." +setuser munge /usr/sbin/munged --foreground > /var/log/munged.out.log 2> /var/log/munged.err.log & + +echo -n -e "\nwaiting for munged to start..." +while [ ! -e /run/munge/munge.socket.2 ] ; do + sleep 1 + echo '.' +done +echo + + +NODENAME=$(hostname) + +if [ "a${NODENAME}" == "amuscle3-headnode" ] ; then + # Run as a headnode + echo -e "\nstarting mariadb..." + setuser mysql /usr/bin/mariadbd-safe >/var/log/mariadb.out.log 2>/var/log/mariadb.err.log & + + echo -n -e "\nwaiting for mariadb to start..." + while ! nc -z localhost 3306 ; do + sleep 1 + echo '.' + done + echo + + + echo -e "\nstarting slurmdbd..." + /usr/local/sbin/slurmdbd -D >/var/log/slurmdbd.out.log 2>/var/log/slurmdbd.err.log & + + echo -n -e "\nwaiting for slurmdbd to start..." + while ! nc -z localhost 6819 ; do + sleep 1 + echo '.' + done + echo + + + echo -e "\nstarting slurmctld..." + /usr/local/sbin/slurmctld -D -c -vvvv > /var/log/slurmctld.out.log 2> /var/log/slurmctld.err.log & + + echo -n -e "\nwaiting for slurmctld to start..." + while ! nc -z localhost 6817 ; do + sleep 1 + echo '.' + done + echo + + + echo -e "\nmaking accounting readable to users..." + /bin/chmod -R og+rX /var/log/slurm + +else + # Run as a compute node + + echo -e "\nstarting compute node..." + /usr/local/sbin/slurmd -D -N ${NODENAME} > /var/log/slurmd.out.log 2> /var/log/slurmd.err.log & +fi + +echo -e "\nstarting sshd..." 
+/usr/sbin/sshd -De > /var/log/sshd.out.log 2> /var/log/sshd.err.log & + +echo -e "\nStartup complete" + +sleep infinity + diff --git a/integration_test/test_cluster.Dockerfile b/integration_test/test_cluster.Dockerfile deleted file mode 100644 index 5e2cf213..00000000 --- a/integration_test/test_cluster.Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM ghcr.io/naturalhpc/cerulean-test-docker-images/cerulean-fake-slurm-23-11:latest - -RUN apt-get update && \ - apt-get install -y python3-venv libopenmpi-dev - -RUN apt-get clean && rm -rf /var/lib/apt/lists/* - -WORKDIR /home/cerulean - From 83406ccbc3abc5204240027230429086c36312e5 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 20 Sep 2024 20:36:38 +0200 Subject: [PATCH 06/49] Add shared filesystem to fake cluster --- integration_test/cluster_test/implementations.ymmsl | 4 ++-- integration_test/cluster_test/multiple.sh | 6 ++++-- integration_test/cluster_test/single.sh | 6 ++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/integration_test/cluster_test/implementations.ymmsl b/integration_test/cluster_test/implementations.ymmsl index 04737a2f..c90db7f9 100644 --- a/integration_test/cluster_test/implementations.ymmsl +++ b/integration_test/cluster_test/implementations.ymmsl @@ -2,7 +2,7 @@ ymmsl_version: v0.1 implementations: component: - virtual_env: /home/cerulean/venv + virtual_env: /home/cerulean/shared/venv executable: python args: - - /home/cerulean/cluster_test/component.py + - /home/cerulean/shared/cluster_test/component.py diff --git a/integration_test/cluster_test/multiple.sh b/integration_test/cluster_test/multiple.sh index a5122dd2..225caa43 100755 --- a/integration_test/cluster_test/multiple.sh +++ b/integration_test/cluster_test/multiple.sh @@ -6,7 +6,9 @@ set -e -source /home/cerulean/venv/bin/activate +source /home/cerulean/shared/venv/bin/activate -muscle_manager --log-level=DEBUG --start-all /home/cerulean/cluster_test/multiple.ymmsl /home/cerulean/cluster_test/settings.ymmsl /home/cerulean/cluster_test/implementations.ymmsl +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/multiple.ymmsl $CT/settings.ymmsl $CT/implementations.ymmsl diff --git a/integration_test/cluster_test/single.sh b/integration_test/cluster_test/single.sh index 8197854e..1a3d0984 100755 --- a/integration_test/cluster_test/single.sh +++ b/integration_test/cluster_test/single.sh @@ -6,7 +6,9 @@ set -e -source /home/cerulean/venv/bin/activate +source /home/cerulean/shared/venv/bin/activate -muscle_manager --log-level=DEBUG --start-all /home/cerulean/cluster_test/single.ymmsl /home/cerulean/cluster_test/settings.ymmsl /home/cerulean/cluster_test/implementations.ymmsl +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/single.ymmsl $CT/settings.ymmsl $CT/implementations.ymmsl From a4638723d32a59537d5b78599da6b441953b407f Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 20 Sep 2024 20:38:12 +0200 Subject: [PATCH 07/49] Clean up slurm script --- integration_test/cluster_test/multiple.sh | 4 ---- integration_test/cluster_test/single.sh | 4 ---- 2 files changed, 8 deletions(-) diff --git a/integration_test/cluster_test/multiple.sh b/integration_test/cluster_test/multiple.sh index 225caa43..49093155 100755 --- a/integration_test/cluster_test/multiple.sh +++ b/integration_test/cluster_test/multiple.sh @@ -1,9 +1,5 @@ #!/bin/bash -#SBATCH --time=0:1:00 -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=1 - set -e source /home/cerulean/shared/venv/bin/activate diff 
--git a/integration_test/cluster_test/single.sh b/integration_test/cluster_test/single.sh index 1a3d0984..00f7e0b9 100755 --- a/integration_test/cluster_test/single.sh +++ b/integration_test/cluster_test/single.sh @@ -1,9 +1,5 @@ #!/bin/bash -#SBATCH --time=0:1:00 -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=1 - set -e source /home/cerulean/shared/venv/bin/activate From 3ded0199ca8cbd64e9967f4815313278e497a062 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 20 Sep 2024 20:40:46 +0200 Subject: [PATCH 08/49] Add initial NativeInstantiator --- .../libmuscle/manager/instance_manager.py | 8 + .../libmuscle/native_instantiator/__init__.py | 0 .../native_instantiator.py | 230 ++++++++++++++ .../native_instantiator/process_manager.py | 68 +++++ .../native_instantiator/resource_detector.py | 45 +++ .../native_instantiator/run_script.py | 244 +++++++++++++++ .../libmuscle/native_instantiator/slurm.py | 280 ++++++++++++++++++ .../test/test_process_manager.py | 120 ++++++++ .../native_instantiator/test/test_slurm.py | 72 +++++ setup.py | 1 + tox.ini | 7 +- 11 files changed, 1073 insertions(+), 2 deletions(-) create mode 100644 libmuscle/python/libmuscle/native_instantiator/__init__.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/native_instantiator.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/process_manager.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/resource_detector.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/run_script.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/slurm.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/test/test_process_manager.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/test/test_slurm.py diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py index 4241b17e..bc6e8edd 100644 --- a/libmuscle/python/libmuscle/manager/instance_manager.py +++ b/libmuscle/python/libmuscle/manager/instance_manager.py @@ -15,6 +15,7 @@ from libmuscle.manager.logger import last_lines from libmuscle.manager.qcgpj_instantiator import QCGPJInstantiator from libmuscle.manager.run_dir import RunDir +from libmuscle.native_instantiator.native_instantiator import NativeInstantiator from libmuscle.planner.planner import Planner, Resources @@ -77,9 +78,16 @@ def __init__( self._results_in: Queue[_ResultType] = Queue() self._log_records_in: Queue[logging.LogRecord] = Queue() + # TODO: Instantiator factory function + # TODO: Add argument that specifies whether to use QCG or not + ''' self._instantiator = QCGPJInstantiator( self._resources_in, self._requests_out, self._results_in, self._log_records_in, self._run_dir.path) + ''' + self._instantiator = NativeInstantiator( + self._resources_in, self._requests_out, self._results_in, + self._log_records_in, self._run_dir.path) self._instantiator.start() self._log_handler = LogHandlingThread(self._log_records_in) diff --git a/libmuscle/python/libmuscle/native_instantiator/__init__.py b/libmuscle/python/libmuscle/native_instantiator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py new file mode 100644 index 00000000..d34d5482 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py @@ -0,0 +1,230 @@ +import logging +import multiprocessing as mp 
+from os import chdir +from pathlib import Path +import queue +import sys +from time import sleep +import traceback +from typing import Dict, List, Optional + +from libmuscle.manager.instantiator import ( + CancelAllRequest, CrashedResult, create_instance_env, InstantiationRequest, + Process, ProcessStatus, reconfigure_logging, ShutdownRequest) +from libmuscle.native_instantiator.process_manager import ProcessManager +from libmuscle.native_instantiator.resource_detector import ResourceDetector +from libmuscle.native_instantiator.run_script import make_script, prep_resources +from libmuscle.planner.planner import Resources +from ymmsl import MPICoresResReq, MPINodesResReq, ResourceRequirements, ThreadedResReq + + +_logger = logging.getLogger(__name__) + + +class NativeInstantiator(mp.Process): + """Instantiates instances on the local machine.""" + def __init__( + self, resources: mp.Queue, requests: mp.Queue, results: mp.Queue, + log_records: mp.Queue, run_dir: Path) -> None: + """Create a NativeInstantiator + + Args: + resources: Queue for returning the available resources + requests: Queue to take requests from + results: Queue to communicate finished processes over + log_messages: Queue to push log messages to + run_dir: Run directory for the current run + """ + super().__init__(name='NativeInstantiator') + self._resources_out = resources + self._requests_in = requests + self._results_out = results + self._log_records_out = log_records + self._run_dir = run_dir + + self._resource_detector = ResourceDetector() + self._process_manager = ProcessManager() + self._processes: Dict[str, Process] = dict() + + def run(self) -> None: + """Entry point for the process""" + try: + m3_dir = self._run_dir / 'muscle3' + m3_dir.mkdir(exist_ok=True) + chdir(m3_dir) + + reconfigure_logging(self._log_records_out) + self._send_resources() + self._main() + + except: # noqa + for line in traceback.format_exception(*sys.exc_info()): + _logger.error(line) + self._resources_out.put(CrashedResult()) + self._results_out.put(CrashedResult()) + + def _main(self) -> None: + """Main function for the background process. + + This accepts requests for instantiating jobs, stopping them, or shutting down. + Results of finished jobs are returned via the results queue. 
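+
+        Each iteration of the main loop handles any queued requests, then
+        reports processes that failed to start or that have finished. After
+        a shutdown request, the loop keeps going until all remaining
+        processes are done.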
+ """ + shutting_down = False + done = False + while not done: + while not shutting_down: + try: + request = self._requests_in.get_nowait() + if isinstance(request, ShutdownRequest): + _logger.debug('Got ShutdownRequest') + shutting_down = True + + elif isinstance(request, CancelAllRequest): + _logger.debug('Got CancelAllRequest') + self._process_manager.cancel_all() + _logger.debug('Done CancelAllRequest') + + elif isinstance(request, InstantiationRequest): + if not shutting_down: + self._instantiate(request) + + except queue.Empty: + break + + self._report_failed_processes() + self._report_finished_processes() + + if shutting_down: + _logger.debug(f'Done: {self._processes}') + done = not self._processes + + if not done: + sleep(0.1) + + def _send_resources(self) -> None: + """Detect resources and report them to the manager.""" + resources = Resources() + + res = zip(self._resource_detector.nodes, self._resource_detector.cores_per_node) + for node, num_cores in res: + resources.cores[node] = set(range(num_cores)) + + self._resources_out.put(resources) + + def _instantiate(self, request: InstantiationRequest) -> None: + """Instantiate an implementation according to the request.""" + name = str(request.instance) + + env = create_instance_env(request.instance, request.implementation.env) + self._add_resources(env, request.res_req) + + rankfile: Optional[Path] = None + if self._resource_detector.on_cluster(): + _logger.debug('On cluster...') + rankfile_contents, resource_env = prep_resources( + request.implementation.execution_model, request.resources) + + _logger.debug(f'Rankfile: {rankfile_contents}') + _logger.debug(f'Resource env: {resource_env}') + + if rankfile_contents: + rankfile = self._write_rankfile(request, rankfile_contents) + + if resource_env: + env.update(resource_env) + + # env['MUSCLE_THREADS_PER_MPI_PROCESS'] = str( + # request.res_req.threads_per_mpi_process) + # env['MUSCLE_OPENMPI_RANK_FILE'] = str(rank_file) + # env['MUSCLE_INTELMPI_RESOURCES'] = ' '.join(mpi_res_args) + + run_script_file = self._write_run_script(request, rankfile) + args = [str(run_script_file)] + + self._processes[name] = Process(request.instance, request.resources) + + try: + self._process_manager.start( + name, request.work_dir, args, env, + request.stdout_path, request.stderr_path) + self._processes[name].status = ProcessStatus.RUNNING + + except Exception as e: + self._processes[name].status = ProcessStatus.ERROR + self._processes[name].error_msg = f'Instance failed to start: {e}' + + def _write_rankfile(self, request: InstantiationRequest, rankfile: str) -> Path: + """Create and write out the rankfile and return its location. + + Also known as a machinefile or hostfile depending on the MPI implementation. 
+ """ + rankfile_file = request.instance_dir / 'rankfile' + + with rankfile_file.open('w') as f: + f.write(rankfile) + + return rankfile_file + + def _write_run_script( + self, request: InstantiationRequest, rankfile: Optional[Path]) -> Path: + """Create and write out the run script and return its location.""" + if request.implementation.script: + run_script = request.implementation.script + else: + run_script = make_script( + request.implementation, request.res_req, + not self._resource_detector.on_cluster(), rankfile) + + run_script_file = request.instance_dir / 'run_script.sh' + + with run_script_file.open('w') as f: + f.write(run_script) + + run_script_file.chmod(0o700) + return run_script_file + + def _add_resources( + self, env: Dict[str, str], res_req: ResourceRequirements) -> None: + """Add resource env vars to the given env.""" + if isinstance(res_req, ThreadedResReq): + num_threads = res_req.threads + elif isinstance(res_req, (MPICoresResReq, MPINodesResReq)): + num_threads = res_req.threads_per_mpi_process + + env['MUSCLE_THREADS'] = str(num_threads) + env['OMP_NUM_THREADS'] = str(num_threads) + + num_mpi_processes: Optional[int] = None + if isinstance(res_req, MPICoresResReq): + num_mpi_processes = res_req.mpi_processes + elif isinstance(res_req, MPINodesResReq): + num_mpi_processes = res_req.nodes * res_req.mpi_processes_per_node + + if num_mpi_processes is not None: + env['MUSCLE_MPI_PROCESSES'] = str(num_mpi_processes) + + def _report_failed_processes(self) -> None: + """Get processes that failed to start and report their status.""" + failed_processes: List[str] = list() + + for name, process in self._processes.items(): + if process.status == ProcessStatus.ERROR: + self._results_out.put(process) + failed_processes.append(name) + + for name in failed_processes: + del self._processes[name] + + def _report_finished_processes(self) -> None: + """Get finished processes and report back their status.""" + for name, exit_code in self._process_manager.get_finished(): + process = self._processes[name] + if process.status == ProcessStatus.RUNNING: + if exit_code == 0: + process.status = ProcessStatus.SUCCESS + else: + process.status = ProcessStatus.ERROR + process.error_msg = 'Instance returned a non-zero exit code' + process.exit_code = exit_code + self._results_out.put(process) + del self._processes[name] diff --git a/libmuscle/python/libmuscle/native_instantiator/process_manager.py b/libmuscle/python/libmuscle/native_instantiator/process_manager.py new file mode 100644 index 00000000..bfd8f3ca --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/process_manager.py @@ -0,0 +1,68 @@ +import logging +from pathlib import Path +from subprocess import Popen +from typing import Dict, List, Tuple + + +_logger = logging.getLogger(__name__) + + +class ProcessManager: + """Manages a set of running processes.""" + def __init__(self) -> None: + """Create a ProcessManager.""" + self._processes: Dict[str, Popen] = dict() + + def start( + self, name: str, work_dir: Path, args: List[str], env: Dict[str, str], + stdout: Path, stderr: Path) -> None: + """Start a process. + + The files that the output is directed to will be overwritten if they already + exist. 
+ + Args: + name: Name under which this process will be known + work_dir: Working directory in which to start + args: Executable and arguments to run + env: Environment variables to set + stdout: File to redirect stdout to + stderr: File to redirect stderr to + + Raises: + RuntimeError: If there is already a process with the given name. + OSError: If the process could not be started. + """ + if name in self._processes: + raise RuntimeError(f'Process {name} already exists') + _logger.debug(f'Starting process {args} with env {env} in {work_dir}') + with stdout.open('w') as out, stderr.open('w') as err: + self._processes[name] = Popen( + args, cwd=work_dir, env=env, stdout=out, stderr=err) + + def cancel_all(self) -> None: + """Stops all running processes. + + This does not wait for them to terminate, it just sends the signal to kill + them. + """ + for process in self._processes.values(): + process.kill() + + def get_finished(self) -> List[Tuple[str, int]]: + """Returns names and exit codes of finished processes. + + This returns all processes that have finished running since the previous call; + each started process will be returned exactly once. + """ + result: List[Tuple[str, int]] = list() + + for name, process in self._processes.items(): + exit_code = process.poll() + if exit_code is not None: + result.append((name, exit_code)) + + for name, _ in result: + del self._processes[name] + + return result diff --git a/libmuscle/python/libmuscle/native_instantiator/resource_detector.py b/libmuscle/python/libmuscle/native_instantiator/resource_detector.py new file mode 100644 index 00000000..8ff22db9 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/resource_detector.py @@ -0,0 +1,45 @@ +from enum import Enum +import logging +from os import sched_getaffinity + +from libmuscle.native_instantiator import slurm + + +_logger = logging.getLogger(__name__) + + +class Scheduler(Enum): + NONE = 0 + SLURM = 1 + + +class ResourceDetector: + """Detects available compute resources. + + This detects whether we're running locally or in a SLURM allocation, and returns + available resources on request. + """ + def __init__(self) -> None: + """Create a ResourceDetector. + + Detects available resources and initialises the object, which can then be + queried. 
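+
+        On a laptop with four cores (illustrative), this results in
+        nodes == ['localhost'] and cores_per_node == [4]; in a SLURM
+        allocation, the node list and core counts are taken from SLURM
+        instead.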
+ """ + if slurm.in_slurm_allocation(): + _logger.info('Detected a SLURM allocation') + self.scheduler = Scheduler.SLURM + self.nodes = slurm.get_nodes() + self.cores_per_node = slurm.get_cores_per_node() + _logger.info( + f'We have {len(self.nodes)} nodes and a total of' + f' {sum(self.cores_per_node)} cores available') + else: + _logger.info('Running locally without a cluster scheduler') + self.scheduler = Scheduler.NONE + self.nodes = ['localhost'] + self.cores_per_node = [len(sched_getaffinity(0))] + _logger.info(f'We have {sum(self.cores_per_node)} cores available') + + def on_cluster(self) -> bool: + _logger.debug(f'On cluster: {self.scheduler}') + return self.scheduler != Scheduler.NONE diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py new file mode 100644 index 00000000..62aa7f77 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -0,0 +1,244 @@ +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from libmuscle.planner.planner import Resources +from ymmsl import ( + ExecutionModel, Implementation, MPICoresResReq, MPINodesResReq, + ResourceRequirements, ThreadedResReq) + + +def openmpi_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: + """Create resource description for OpenMPI mpirun + + Args: + resources: The resources to describe + + Return: + The contents of the rankfile, and a set of environment variables + """ + ranklines: List[str] = list() + all_cores = ( + (node, core) for node, cores in resources.cores.items() for core in cores) + + for i, (node, core) in enumerate(all_cores): + ranklines.append(f'rank {i}={node} slot={core}') + + rankfile = '\n'.join(ranklines) + '\n' + + return rankfile, dict() + + +def impi_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: + """Create resource description for Intel MPI mpirun + + Args: + resources: The resources to describe + + Return: + The contents of the machinefile, and a set of environment variables + """ + # I_MPI_PIN_PROCESSOR_LIST=0,1,5,6 + # pins rank 0 to core 0, rank 1 to core 1, rank 2 to core 5, rank 3 to core 6 + raise NotImplementedError() + + +def mpich_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: + """Create resource description for MPICH mpirun + + Args: + resources: The resources to describe + + Return: + The contents of the machinefile, and a set of environment variables + """ + # No env vars, but rankfile + raise NotImplementedError() + + +def srun_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: + """Create resource description for srun + + Args: + resources: The resources to describe + + Return: + The contents of the hostfile, and a set of environment variables + """ + # SLURM_HOSTFILE to point to the rankfile + # CPU_BIND=verbose,mask_cpu=0x01,0x02,0x04,0x01 to specify cores 0,1,2,0 for ranks + # 0-3 + raise NotImplementedError() + + +def prep_resources( + model: ExecutionModel, resources: Resources + ) -> Tuple[str, Dict[str, str]]: + """Create resource description for the given execution model. + + Args: + model: The execution model to generate a description for + resources: The resources to describe + + Return: + The contents of the rank/machine/hostfile, and a set of environment variables. 
+ """ + if model == ExecutionModel.DIRECT: + return '', dict() + elif model == ExecutionModel.OPENMPI: + return openmpi_prep_resources(resources) + elif model == ExecutionModel.INTELMPI: + return impi_prep_resources(resources) + elif model == ExecutionModel.SRUNMPI: + return srun_prep_resources(resources) + # elif model == ExecutionModel.MPICH: + # return mpich_prep_resources(resources) + raise RuntimeError( + f'Impossible execution model {model}, please create an issue on GitHub') + + +def num_mpi_tasks(res_req: ResourceRequirements) -> int: + """Determine the number of MPI tasks to be started. + + For non-MPI resource requirements, returns 1. + + Args: + res_req: Resource requirements to analyse. + """ + if isinstance(res_req, ThreadedResReq): + return 1 + elif isinstance(res_req, MPICoresResReq): + return res_req.mpi_processes + elif isinstance(res_req, MPINodesResReq): + return res_req.nodes * res_req.mpi_processes_per_node + raise RuntimeError('Invalid ResourceRequirements') + + +def local_command(implementation: Implementation) -> str: + """Make a format string for the command to run. + + This interprets the execution_model and produces an appropriate shell command to + start the implementation. This function produces commands for running locally: + pinning is disabled and there's only one node. + + Args: + implementation: The implementation to start. + + Return: + A format string with embedded {ntasks} and {rankfile}. + """ + if implementation.execution_model == ExecutionModel.DIRECT: + fstr = '{command} {args}' + elif implementation.execution_model == ExecutionModel.OPENMPI: + # Native name is orterun for older and prterun for newer OpenMPI. + # So we go with mpirun, which works for either. + fstr = 'mpirun -np {{ntasks}} --oversubscribe {command} {args}' + elif implementation.execution_model == ExecutionModel.INTELMPI: + fstr = 'mpirun -n {{ntasks}} {command} {args}' + elif implementation.execution_model == ExecutionModel.SRUNMPI: + fstr = 'srun -n {{ntasks}} -m arbitrary {command} {args}' + # elif implementation.execution_model == ExecutionModel.MPICH + # fstr = 'mpiexec -n {{ntasks}} {command} {args}' + + if implementation.args is None: + args = '' + elif isinstance(implementation.args, str): + args = implementation.args + elif isinstance(implementation.args, list): + args = ' '.join(implementation.args) + + return fstr.format( + command=implementation.executable, + args=args + ) + + +def cluster_command(implementation: Implementation) -> str: + """Make a format string for the command to run. + + This interprets the execution_model and produces an appropriate shell command to + start the implementation. This function produces commands for running on a cluster, + with processes distributed across nodes and CPU pinning enabled. + + Args: + implementation: The implementation to start. + + Return: + A format string with embedded {ntasks} and {rankfile}. + """ + if implementation.execution_model == ExecutionModel.DIRECT: + fstr = '{command} {args}' + elif implementation.execution_model == ExecutionModel.OPENMPI: + # Native name is orterun for older and prterun for newer OpenMPI. + # So we go with mpirun, which works for either. 
+ fstr = ( + 'mpirun -v -np {{ntasks}}' + ' -d --debug-daemons' + ' --rankfile {{rankfile}} --oversubscribe' + # ' --map-by rankfile:file={{rankfile}}:oversubscribe' + ' --display-map --display-allocation {command} {args}') + # ' --bind-to core --display-map --display-allocation {command} {args}') + elif implementation.execution_model == ExecutionModel.INTELMPI: + fstr = 'mpirun -n {{ntasks}} -machinefile {{rankfile}} {command} {args}' + elif implementation.execution_model == ExecutionModel.SRUNMPI: + fstr = 'srun -n {{ntasks}} -m arbitrary {command} {args}' + # elif implementation.execution_model == ExecutionModel.MPICH + # fstr = 'mpiexec -n {{ntasks}} -f {{rankfile}} {command} {args}' + + if implementation.args is None: + args = '' + elif isinstance(implementation.args, str): + args = implementation.args + elif isinstance(implementation.args, list): + args = ' '.join(implementation.args) + + return fstr.format( + command=implementation.executable, + args=args + ) + + +def make_script( + implementation: Implementation, res_req: ResourceRequirements, + local: bool, rankfile: Optional[Path] = None) -> str: + """Make a launch script for a given implementation. + + Args: + implementation: The implementation to launch + res_req: The job's resource requirements + local: Whether this is to run locally (True) or on a cluster (False) + rankfile: Location of the rankfile, if any + + Return: + A string with embedded newlines containing the shell script. + """ + lines: List[str] = list() + + lines.append('#!/bin/bash') + lines.append('') + + # The environment is passed when starting the script, rather than as a set of + # export statements here. + + if implementation.modules: + if isinstance(implementation.modules, str): + lines.append(f'module load {implementation.modules}') + else: + for module in implementation.modules: + lines.append(f'module load {module}') + lines.append('') + + if implementation.virtual_env: + lines.append(f'. {implementation.virtual_env}/bin/activate') + lines.append('') + + if local: + cmd = local_command(implementation) + else: + cmd = cluster_command(implementation) + + ntasks = num_mpi_tasks(res_req) + lines.append(cmd.format(ntasks=ntasks, rankfile=rankfile)) + + lines.append('') + + return '\n'.join(lines) diff --git a/libmuscle/python/libmuscle/native_instantiator/slurm.py b/libmuscle/python/libmuscle/native_instantiator/slurm.py new file mode 100644 index 00000000..59258cc9 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/slurm.py @@ -0,0 +1,280 @@ +from itertools import product +import logging +import os +from parsimonious import Grammar, NodeVisitor +from parsimonious.nodes import Node +from typing import Any, cast, List, Sequence, Tuple + + +_logger = logging.getLogger(__name__) + + +_node_range_expression_grammar = Grammar( + """ + nre = nre_parts ("," nre_parts)* + nre_parts = nre_part+ + nre_part = identifier ("[" index_set "]")? + index_set = index_range ("," index_range)* + index_range = integer ("-" integer)? + identifier = ~"[a-z 0-9 _-]+"i + integer = padded_int / int + int = ~"[0-9]+" + padded_int = ~"0[0-9]+" + """ + ) + + +class NREVisitor(NodeVisitor): + """Processes a parsed NRE and produces a list of nodes. + + Node range expressions are used by SLURM to describe collections of nodes. See + parse_slurm_nodelist() below. 
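To make the grammar above a little less abstract, here is roughly how it gets used (an illustrative snippet, not part of the patch): parsimonious parses the expression into a tree, and the visitor defined below flattens that tree into plain node names.

    tree = _node_range_expression_grammar.parse('tux[1-2],alpha')
    names = NREVisitor().visit(tree)
    # names == ['tux1', 'tux2', 'alpha']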
+ """ + def visit_nre( + self, node: Node, + visited_children: Tuple[List[str], Sequence[Tuple[Any, List[str]]]] + ) -> List[str]: + """Return a list of nodes corresponding to the NRE.""" + nodes = visited_children[0].copy() + for _, more_nodes in visited_children[1]: + nodes.extend(more_nodes) + return nodes + + def visit_nre_parts( + self, node: Node, visited_children: Sequence[Tuple[str, List[str]]] + ) -> List[str]: + """Return a list of node ids for the part.""" + fmt = ''.join([c[0] + '{}' for c in visited_children]) + index_lists = [c[1] for c in visited_children] + return [fmt.format(*idxs) for idxs in product(*index_lists)] + + def visit_nre_part( + self, node: Node, visited_children: Tuple[ + str, Sequence[Tuple[Any, List[str], Any]]] + ) -> Tuple[str, List[str]]: + """Return the identifier part and a list of indexes for the set.""" + identifier = visited_children[0] + if not visited_children[1]: + index_set = [''] + else: + index_set = visited_children[1][0][1] + return identifier, index_set + + def visit_index_set( + self, node: Node, + visited_children: Tuple[List[str], Sequence[Tuple[Any, List[str]]]] + ) -> List[str]: + """Return a list of indexes corresponding to the set.""" + indexes = visited_children[0].copy() + for _, more_indexes in visited_children[1]: + indexes.extend(more_indexes) + return indexes + + def visit_index_range( + self, node: Node, + visited_children: Tuple[ + Tuple[int, int], + Sequence[ + Tuple[Any, Tuple[int, int]] + ]] + ) -> List[str]: + """Return a list of indexes corresponding to the range.""" + + def format_str(width: int) -> str: + if width == -1: + return '{}' + return f'{{:0{width}}}' + + start_value, width = visited_children[0] + if visited_children[1]: + end_value, _ = visited_children[1][0][1] + fmt = format_str(width) + return [fmt.format(i) for i in range(start_value, end_value + 1)] + + fmt = format_str(width) + return [fmt.format(start_value)] + + def visit_identifier(self, node: Node, _: Sequence[Any]) -> str: + return node.text + + def visit_integer( + self, node: Node, visited_children: Sequence[Tuple[int, int]] + ) -> Tuple[int, int]: + """Returns the value of the int, and a field width or -1.""" + return visited_children[0] + + def visit_int(self, node: Node, _: Sequence[Any]) -> Tuple[int, int]: + """Returns the value and a field width of -1.""" + return int(node.text), -1 + + def visit_padded_int(self, node: Node, _: Sequence[Any]) -> Tuple[int, int]: + """Returns the value of the int and the field width.""" + return int(node.text), len(node.text) + + def generic_visit( + self, node: Node, visited_children: Sequence[Any]) -> Sequence[Any]: + return visited_children + + +_nre_visitor = NREVisitor() + + +def parse_slurm_nodelist(s: str) -> List[str]: + """Parse a SLURM node range expression and produce node names. + + Exactly what the syntax is for a "node range expression" isn't entirely + clear. Some examples are given throughout the documentation: + + linux[00-17] + lx[10-20] + tux[2,1-2] + tux[1-2,2] + tux[1-3] + linux[0-64,128] + alpha,beta,gamma + lx[15,18,32-33] + linux[0000-1023] + rack[0-63]_blade[0-41] + + unit[0-31]rack is invalid + + If a range uses leading zeros, then so should the generated indexes. + See _node_range_expression_grammar above for my best guess at the + correct grammar. + + This function takes a string containing an NRE and returns the + corresponding list of node names. 
+ """ + ast = _node_range_expression_grammar.parse(s) + return cast(List[str], _nre_visitor.visit(ast)) + + +_nodes_cores_expression_grammar = Grammar( + """ + nce = nce_run ("," nce_run)* + nce_run = int ("(" run_length ")")? + run_length = "x" int + int = ~"[0-9]+" + """ + ) + + +class NCEVisitor(NodeVisitor): + """Processes a parsed NCE and produces a list of cpu counts per node. + + Nodes cores expressions are used by SLURM to describe cores on a collection of + nodes. See parse_slurm_nodes_cores() below. + """ + def visit_nce( + self, node: Node, + visited_children: Tuple[List[int], Sequence[Tuple[Any, List[int]]]] + ) -> List[int]: + """Return a list of nodes corresponding to the NRE.""" + nodes_cores = visited_children[0].copy() + for _, more_nodes_cores in visited_children[1]: + nodes_cores.extend(more_nodes_cores) + return nodes_cores + + def visit_nce_run( + self, node: Node, + visited_children: Tuple[int, Sequence[Tuple[Any, int, Any]]] + ) -> List[int]: + """Return a list of core counts produced by this run.""" + num_cores = visited_children[0] + result = [num_cores] + + if visited_children[1]: + result *= visited_children[1][0][1] + + return result + + def visit_run_length( + self, node: Node, visited_children: Tuple[str, int]) -> int: + """Return the number of repetitions.""" + return visited_children[1] + + def visit_int(self, node: Node, _: Sequence[Any]) -> int: + """Returns the value as an int""" + return int(node.text) + + def generic_visit( + self, node: Node, visited_children: Sequence[Any]) -> Sequence[Any]: + return visited_children + + +_nce_visitor = NCEVisitor() + + +def parse_slurm_nodes_cores(s: str) -> List[int]: + """Parse a SLURM nodes cores expression and produce node names. + + The sbatch documentation page describes the format under + SLURM_JOB_CPUS_PER_NODE as CPU_count[(xnumber_of_nodes)][,CPU_count + [(xnumber_of_nodes)] ...]. and gives the example of '72(x2),36' describing a set of + three nodes, the first two with 72 cores and the third with 36. + + See _nodes_cores_expression_grammar above for the corresponding grammar. + + This function takes a string containing an NCE and returns the corresponding list of + node names. + """ + ast = _nodes_cores_expression_grammar.parse(s) + return cast(List[int], _nce_visitor.visit(ast)) + + +def in_slurm_allocation() -> bool: + """Check whether we're in a SLURM allocation. + + Returns true iff SLURM was detected. + """ + return 'SLURM_JOB_ID' in os.environ + + +def get_nodes() -> List[str]: + """Get a list of node names from SLURM_JOB_NODELIST. + + This inspects SLURM_JOB_NODELIST or SLURM_NODELIST and returns an + expanded list of node names. + + If SLURM_JOB_NODELIST is "node[020-023]" then this returns + ["node020", "node021", "node022", "node023"]. + """ + nodelist = os.environ.get('SLURM_JOB_NODELIST') + if not nodelist: + nodelist = os.environ.get('SLURM_NODELIST') + if not nodelist: + raise RuntimeError('SLURM_(JOB_)NODELIST not set, are we running locally?') + + _logger.debug(f'SLURM node list: {nodelist}') + + return parse_slurm_nodelist(nodelist) + + +def get_cores_per_node() -> List[int]: + """Return the number of CPU cores per node. + + This returns a list with the number of cores of each node in the result of + get_nodes(), which gets read from SLURM_JOB_CPUS_PER_NODE. 
+ """ + sjcpn = os.environ.get('SLURM_JOB_CPUS_PER_NODE') + _logger.debug(f'SLURM_JOB_CPUS_PER_NODE: {sjcpn}') + + if sjcpn: + return parse_slurm_nodes_cores(sjcpn) + else: + scon = os.environ.get('SLURM_CPUS_ON_NODE') + _logger.debug(f'SLURM_CPUS_ON_NODE: {scon}') + + snn = os.environ.get('SLURM_JOB_NUM_NODES') + if not snn: + snn = os.environ.get('SLURM_NNODES') + _logger.debug(f'SLURM num nodes: {snn}') + + if scon and snn: + return [int(scon)] * int(snn) + + raise RuntimeError( + 'SLURM_JOB_CPUS_PER_NODE is not set in the environment, and also' + ' SLURM_CPUS_ON_NODE is missing or neither SLURM_JOB_NUM_NODES nor' + ' SLURM_NNODES is set. Please create an issue on GitHub with the output' + ' of "sbatch --version" on this cluster.') diff --git a/libmuscle/python/libmuscle/native_instantiator/test/test_process_manager.py b/libmuscle/python/libmuscle/native_instantiator/test/test_process_manager.py new file mode 100644 index 00000000..93dabcfb --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/test/test_process_manager.py @@ -0,0 +1,120 @@ +from time import monotonic, sleep + +import pytest + +from libmuscle.native_instantiator.process_manager import ProcessManager + + +@pytest.fixture +def lpm(): + return ProcessManager() + + +def _poll_completion(lpm, num_jobs): + completed_jobs = list() + while len(completed_jobs) < num_jobs: + done = lpm.get_finished() + while not done: + sleep(0.1) + done = lpm.get_finished() + completed_jobs.extend(done) + + return completed_jobs + + +def test_run_process(lpm, tmp_path): + lpm.start( + 'test', tmp_path, ['bash', '-c', 'exit 0'], {}, + tmp_path / 'out', tmp_path / 'err') + completed_jobs = _poll_completion(lpm, 1) + assert completed_jobs[0] == ('test', 0) + + +def test_existing_process(lpm, tmp_path): + lpm.start( + 'test', tmp_path, ['bash', '-c', 'exit 0'], {}, + tmp_path / 'out', tmp_path / 'err') + with pytest.raises(RuntimeError): + lpm.start( + 'test', tmp_path, ['bash', '-c', 'exit 0'], {}, + tmp_path / 'out', tmp_path / 'err') + + completed_jobs = _poll_completion(lpm, 1) + + assert completed_jobs[0] == ('test', 0) + + +def test_env(lpm, tmp_path): + env = {'ENVVAR': 'TESTING123'} + lpm.start( + 'test', tmp_path, ['bash', '-c', 'echo ${ENVVAR}'], env, + tmp_path / 'out', tmp_path / 'err') + _poll_completion(lpm, 1) + + with (tmp_path / 'out').open('r') as f: + lines = f.readlines() + + assert lines[0] == 'TESTING123\n' + + +def test_exit_code(lpm, tmp_path): + lpm.start( + 'test_exit_code', tmp_path, ['bash', '-c', 'exit 3'], {}, + tmp_path / 'out', tmp_path / 'err') + done = lpm.get_finished() + while not done: + sleep(0.02) + done = lpm.get_finished() + + assert done[0] == ('test_exit_code', 3) + + +def test_multiple(lpm, tmp_path): + for i in range(3): + lpm.start( + f'test_{i}', tmp_path, ['bash', '-c', 'sleep 1'], {}, + tmp_path / f'out{i}', tmp_path / f'err{i}') + + completed_jobs = _poll_completion(lpm, 3) + + assert sorted(completed_jobs) == [('test_0', 0), ('test_1', 0), ('test_2', 0)] + + +def test_cancel_all(lpm, tmp_path): + begin_time = monotonic() + + for i in range(2): + lpm.start( + f'test_{i}', tmp_path, ['bash', '-c', 'sleep 1'], {}, + tmp_path / f'out{i}', tmp_path / f'err{i}') + + lpm.cancel_all() + + completed_jobs = _poll_completion(lpm, 2) + + end_time = monotonic() + + assert sorted(completed_jobs) == [('test_0', -9), ('test_1', -9)] + assert end_time - begin_time < 1.0 + + +def test_output_redirect(lpm, tmp_path): + lpm.start( + 'test', tmp_path, ['bash', '-c', 'ls'], {}, + tmp_path / 'out', tmp_path / 
'err') + _poll_completion(lpm, 1) + with (tmp_path / 'out').open('r') as f: + assert f.readlines() + with (tmp_path / 'err').open('r') as f: + assert f.readlines() == [] + + +def test_error_redirect(lpm, tmp_path): + lpm.start( + 'test', tmp_path, ['bash', '-c', 'ls 1>&2'], {}, + tmp_path / 'out', tmp_path / 'err') + _poll_completion(lpm, 1) + with (tmp_path / 'out').open('r') as f: + assert f.readlines() == [] + with (tmp_path / 'err').open('r') as f: + assert f.readlines() diff --git a/libmuscle/python/libmuscle/native_instantiator/test/test_slurm.py b/libmuscle/python/libmuscle/native_instantiator/test/test_slurm.py new file mode 100644 index 00000000..d3610b65 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/test/test_slurm.py @@ -0,0 +1,72 @@ +from libmuscle.native_instantiator.slurm import ( + parse_slurm_nodelist, parse_slurm_nodes_cores) + +import pytest + + +NRES_ = [ + # from various bits of SLURM documentation + ( + 'linux[00-17]', [ + 'linux00', 'linux01', 'linux02', 'linux03', 'linux04', 'linux05', + 'linux06', 'linux07', 'linux08', 'linux09', 'linux10', 'linux11', + 'linux12', 'linux13', 'linux14', 'linux15', 'linux16', 'linux17']), + ( + 'lx[10-20]', [ + 'lx10', 'lx11', 'lx12', 'lx13', 'lx14', 'lx15', 'lx16', 'lx17', 'lx18', + 'lx19', 'lx20']), + ('tux[2,1-2]', ['tux2', 'tux1', 'tux2']), + ('tux[1-2,2]', ['tux1', 'tux2', 'tux2']), + ('tux[1-3]', ['tux1', 'tux2', 'tux3']), + ( + 'linux[0-64,128]', [ + 'linux0', 'linux1', 'linux2', 'linux3', 'linux4', 'linux5', 'linux6', + 'linux7', 'linux8', 'linux9', 'linux10', 'linux11', 'linux12', + 'linux13', 'linux14', 'linux15', 'linux16', 'linux17', 'linux18', + 'linux19', 'linux20', 'linux21', 'linux22', 'linux23', 'linux24', + 'linux25', 'linux26', 'linux27', 'linux28', 'linux29', 'linux30', + 'linux31', 'linux32', 'linux33', 'linux34', 'linux35', 'linux36', + 'linux37', 'linux38', 'linux39', 'linux40', 'linux41', 'linux42', + 'linux43', 'linux44', 'linux45', 'linux46', 'linux47', 'linux48', + 'linux49', 'linux50', 'linux51', 'linux52', 'linux53', 'linux54', + 'linux55', 'linux56', 'linux57', 'linux58', 'linux59', 'linux60', + 'linux61', 'linux62', 'linux63', 'linux64', 'linux128']), + ('alpha,beta,gamma', ['alpha', 'beta', 'gamma']), + ('lx[15,18,32-33]', ['lx15', 'lx18', 'lx32', 'lx33']), + ('linux[0000-1023]', [f'linux{i:04}' for i in range(1024)]), + ( + 'rack[0-63]_blade[0-41]', [ + f'rack{i}_blade{j}' for i in range(64) for j in range(42)]), + # my additions + ('linux', ['linux']), + ('linux[0]', ['linux0']), + ('linux[0,1]', ['linux0', 'linux1']), + ('linux[0-2]', ['linux0', 'linux1', 'linux2']), + ( + 'rack[00-12,14]_blade[0-2],alpha,tux[1-3,6]', ( + [f'rack{i:02}_blade{j}' for i in range(13) for j in range(3)] + [ + 'rack14_blade0', 'rack14_blade1', 'rack14_blade2', 'alpha', + 'tux1', 'tux2', 'tux3', 'tux6'])), + ('node-0', ['node-0']), + ('node-[0-3]', ['node-0', 'node-1', 'node-2', 'node-3']), + ] + + +@pytest.mark.parametrize('nre,expected', NRES_) +def test_parse_slurm_nodelist(nre, expected): + assert parse_slurm_nodelist(nre) == expected + + +NCES_ = [ + ('8', [8]), + ('8(x2)', [8, 8]), + ('16,24', [16, 24]), + ('16,24(x3)', [16, 24, 24, 24]), + ('1(x1),2', [1, 2]), + ('72(x2),36', [72, 72, 36]) + ] + + +@pytest.mark.parametrize('nce,expected', NCES_) +def test_parse_slurm_nodes_cores(nce, expected): + assert parse_slurm_nodes_cores(nce) == expected diff --git a/setup.py b/setup.py index a8d3fda7..d31fa790 100644 --- a/setup.py +++ b/setup.py @@ -57,6 +57,7 @@ 'matplotlib>=3,<4', 'msgpack>=1,<2', 
'psutil>=5.0.0', + 'parsimonious', "numpy>=1.22", 'qcg-pilotjob==0.13.1', 'typing_extensions>=4.4.0,<5', diff --git a/tox.ini b/tox.ini index 6291e5f5..970daf2d 100644 --- a/tox.ini +++ b/tox.ini @@ -4,11 +4,13 @@ skip_missing_interpreters = true [testenv] deps = + cerulean # not actually used for these non-cluster tests flake8 mypy pytest pytest-cov requests # missing dependency in cerulean... + types-parsimonious types-psutil ymmsl @@ -30,6 +32,7 @@ deps = pytest pytest-cov requests # missing dependency in cerulean... + types-parsimonious types-psutil ymmsl @@ -37,8 +40,8 @@ setenv = MUSCLE_TEST_CLUSTER=1 commands = - pytest -k 'test_cluster' {posargs} - # pytest --log-cli-level=DEBUG -s -k 'test_cluster' {posargs} + pytest -k 'test_cluster' --log-disable=paramiko.transport {posargs} + # pytest --log-cli-level=DEBUG --log-disable=paramiko.transport --log-disable=paramiko.transport.sftp --log-disable=cerulean.copy_files -s -k 'test_cluster' {posargs} [gh-actions] From 8776e99476bd685785eff83a091665979c24bbf5 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 20 Sep 2024 20:46:20 +0200 Subject: [PATCH 09/49] Refactor cluster tests into subdirectory --- integration_test/cluster_test/__init__.py | 0 integration_test/cluster_test/conftest.py | 200 +++++++++++++++++ integration_test/cluster_test/test_cluster.py | 206 +++++++++++++++++ integration_test/conftest.py | 4 - integration_test/test_cluster.py | 207 ------------------ 5 files changed, 406 insertions(+), 211 deletions(-) create mode 100644 integration_test/cluster_test/__init__.py create mode 100644 integration_test/cluster_test/conftest.py create mode 100644 integration_test/cluster_test/test_cluster.py delete mode 100644 integration_test/test_cluster.py diff --git a/integration_test/cluster_test/__init__.py b/integration_test/cluster_test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/integration_test/cluster_test/conftest.py b/integration_test/cluster_test/conftest.py new file mode 100644 index 00000000..a4f5cba4 --- /dev/null +++ b/integration_test/cluster_test/conftest.py @@ -0,0 +1,200 @@ +import logging +import os +from pathlib import Path +from tempfile import TemporaryDirectory +import time + +import cerulean +import pytest + + +logger_ = logging.getLogger(__name__) + + +REMOTE_SHARED = '/home/cerulean/shared' + + +skip_unless_cluster = pytest.mark.skipif( + 'MUSCLE_TEST_CLUSTER' not in os.environ, + reason='Cluster tests were not explicitly enabled') + + +def run_cmd(term, timeout, command): + exit_code, out, err = term.run(timeout, command, []) + if exit_code != 0: + logger_.error(err) + assert exit_code == 0 + return out + + +@pytest.fixture(scope='session') +def local_term(): + return cerulean.LocalTerminal() + + +@pytest.fixture(scope='session') +def local_fs(): + return cerulean.LocalFileSystem() + + +@pytest.fixture(scope='session') +def fake_cluster_image(local_term): + IMAGE_NAME = 'muscle3_test_cluster' + run_cmd(local_term, 5400, ( + f'docker buildx build -t {IMAGE_NAME}' + ' -f integration_test/fake_cluster/Dockerfile .')) + return IMAGE_NAME + + +def ssh_term(timeout_msg): + cred = cerulean.PasswordCredential('cerulean', 'kingfisher') + ready = False + start = time.monotonic() + while not ready: + if (time.monotonic() - start) > 60.0: + raise Exception(timeout_msg) + + try: + term = cerulean.SshTerminal('localhost', 10022, cred) + ready = True + except Exception: + time.sleep(3.0) + + return term + + +@pytest.fixture(scope='session') +def shared_dir(): + # Note that pytest's tmp_path is 
function-scoped, so cannot be used here + with TemporaryDirectory(ignore_cleanup_errors=True) as tmp_dir: + path = Path(tmp_dir) + path.chmod(0o1777) + yield path + + +@pytest.fixture(scope='session') +def cleanup_docker(local_term): + for i in range(5): + node_name = f'muscle3-node-{i}' + run_cmd(local_term, 60, f'docker rm -f {node_name}') + + run_cmd(local_term, 60, 'docker rm -f muscle3-headnode') + run_cmd(local_term, 60, 'docker network rm -f muscle3-net') + + +@pytest.fixture(scope='session') +def fake_cluster_network(local_term, cleanup_docker): + name = 'muscle3-net' + run_cmd(local_term, 60, f'docker network create {name}') + yield name + run_cmd(local_term, 60, 'docker network rm -f muscle3-net') + + +@pytest.fixture(scope='session') +def fake_cluster_nodes( + local_term, fake_cluster_image, fake_cluster_network, shared_dir): + + node_names = list() + + for i in range(5): + node_name = f'muscle3-node-{i}' + ssh_port = 10030 + i + + run_cmd(local_term, 60, ( + f'docker run -d --name={node_name} --hostname={node_name}' + f' --network={fake_cluster_network} -p {ssh_port}:22' + f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}' + f' {fake_cluster_image}')) + + node_names.append(node_name) + + yield None + + run_cmd(local_term, 60, f'docker rm -f {" ".join(node_names)}') + + +@pytest.fixture(scope='session') +def fake_cluster_headnode( + local_term, fake_cluster_image, fake_cluster_network, fake_cluster_nodes, + shared_dir): + + run_cmd(local_term, 60, ( + 'docker run -d --name=muscle3-headnode --hostname=muscle3-headnode' + f' --network={fake_cluster_network} -p 10022:22' + f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}' + f' {fake_cluster_image}')) + + ssh_term('Virtual cluster container start timed out') + yield None + + run_cmd(local_term, 60, 'docker rm -f muscle3-headnode') + + +@pytest.fixture(scope='session') +def setup_connection(fake_cluster_headnode): + # Session-wide connection used for container setup actions only + # Tests each have their own connection, see fake_cluster() below + term = ssh_term('Connection to virtual cluster container timed out') + with cerulean.SftpFileSystem(term, True) as fs: + yield term, fs + + # We abuse this to clean up the contents of the shared directory. + # Because it's been made inside of the container, it has a different owner + # than what we're running with on the host, and the host user cannot remove + # the files. 
+ + run_cmd(term, 60, f'rm -rf {REMOTE_SHARED}/*') + + +@pytest.fixture(scope='session') +def repo_root(local_fs): + root_dir = Path(__file__).parents[2] + return local_fs / str(root_dir) + + +@pytest.fixture(scope='session') +def remote_source(repo_root, setup_connection): + remote_term, remote_fs = setup_connection + + muscle3_tgt = remote_fs / 'home' / 'cerulean' / 'muscle3' + muscle3_tgt.mkdir() + (muscle3_tgt / 'libmuscle').mkdir() + + for f in ( + 'muscle3', 'libmuscle', 'scripts', 'docs', 'setup.py', 'Makefile', + 'MANIFEST.in', 'LICENSE', 'NOTICE', 'VERSION', 'README.rst'): + cerulean.copy( + repo_root / f, muscle3_tgt / f, overwrite='always', copy_into=False) + + return muscle3_tgt + + +@pytest.fixture(scope='session') +def muscle3_venv(repo_root, remote_source, setup_connection): + remote_term, remote_fs = setup_connection + + run_cmd(remote_term, 10, f'python3 -m venv {REMOTE_SHARED}/venv') + in_venv = f'source {REMOTE_SHARED}/venv/bin/activate && ' + + run_cmd(remote_term, 30, ( + f'/bin/bash -c "{in_venv} python3 -m pip install pip wheel setuptools"')) + + run_cmd(remote_term, 60, f'/bin/bash -c "{in_venv} pip install {remote_source}"') + return in_venv + + +@pytest.fixture(scope='session') +def muscle3_native_openmpi(remote_source, setup_connection): + remote_term, remote_fs = setup_connection + + prefix = remote_fs / REMOTE_SHARED / 'muscle3-openmpi' + prefix.mkdir() + + run_cmd(remote_term, 600, ( + f'/bin/bash -l -c "' + f'module load openmpi && ' + f'cd {remote_source} && ' + f'make distclean && ' + f'PREFIX={prefix} make install"')) + + return prefix diff --git a/integration_test/cluster_test/test_cluster.py b/integration_test/cluster_test/test_cluster.py new file mode 100644 index 00000000..d9b1d85f --- /dev/null +++ b/integration_test/cluster_test/test_cluster.py @@ -0,0 +1,206 @@ +import cerulean +import logging +import pytest + +from integration_test.cluster_test.conftest import ( + REMOTE_SHARED, run_cmd, ssh_term, skip_unless_cluster) + + +logger_ = logging.getLogger(__name__) + + +@pytest.fixture(scope='session') +def copy_test_files(repo_root, setup_connection): + remote_term, remote_fs = setup_connection + remote_home = remote_fs / REMOTE_SHARED + + cerulean.copy( + repo_root / 'integration_test' / 'cluster_test', remote_home, + copy_permissions=True) + + return remote_home / 'cluster_test' + + +@pytest.fixture(scope='session') +def build_native_components( + muscle3_native_openmpi, setup_connection, copy_test_files): + remote_term, remote_fs = setup_connection + remote_source = copy_test_files + + run_cmd(remote_term, 30, ( + f"/bin/bash -l -c '" + f"module load openmpi && " + f". 
{muscle3_native_openmpi}/bin/muscle3.env && " + f"make -C {remote_source}'")) + + +@pytest.fixture +def fake_cluster( + fake_cluster_headnode, muscle3_venv, build_native_components, copy_test_files): + term = ssh_term('Connection to virtual cluster container timed out') + with cerulean.SftpFileSystem(term, True) as fs: + local_sched = cerulean.DirectGnuScheduler(term) + slurm_sched = cerulean.SlurmScheduler(term) + yield term, fs, local_sched, slurm_sched + + +@pytest.fixture +def remote_home(fake_cluster): + remote_fs = fake_cluster[1] + return remote_fs / REMOTE_SHARED + + +@pytest.fixture +def remote_test_files(remote_home): + return remote_home / 'cluster_test' + + +@pytest.fixture +def remote_out_dir(remote_home): + return remote_home / 'test_results' + + +def _make_job(name, mode, remote_test_files, remote_out_dir): + job_dir = remote_out_dir / f'test_{name}_{mode}' + job_dir.mkdir(0o755, True, True) + + job = cerulean.JobDescription() + job.name = name + job.working_directory = job_dir + job.command = str(remote_test_files / f'{name}.sh') + job.stdout_file = job_dir / 'stdout.txt' + job.stderr_file = job_dir / 'stderr.txt' + job.queue_name = 'debug' + job.time_reserved = 60 + job.system_out_file = job_dir / 'sysout.txt' + job.system_err_file = job_dir / 'syserr.txt' + + return job + + +def _sched(fake_cluster, mode): + if mode == 'local': + return fake_cluster[2] + else: + return fake_cluster[3] + + +def run_cmd_dir(remote_out_dir, testname, mode): + results_name = f'test_{testname}_{mode}' + + for p in (remote_out_dir / results_name).iterdir(): + if p.name.startswith('run_'): + return p + + +def _get_stdout(remote_out_dir, testname, mode, instance): + run_dir = run_cmd_dir(remote_out_dir, testname, mode) + stdout_file = run_dir / 'instances' / instance / 'stdout.txt' + assert stdout_file.exists() # test output redirection + return stdout_file.read_text() + + +def _get_outfile(remote_out_dir, testname, mode, instance, rank): + run_dir = run_cmd_dir(remote_out_dir, testname, mode) + work_dir = run_dir / 'instances' / instance / 'workdir' + out_file = work_dir / f'out_{rank}.txt' + assert out_file.exists() # test working directory + return out_file.read_text() + + +_SCHED_OVERHEAD = 60 + + +@skip_unless_cluster +@pytest.mark.parametrize('mode', ['local', 'slurm']) +def test_single(fake_cluster, remote_test_files, remote_out_dir, mode): + sched = _sched(fake_cluster, mode) + + job = _make_job('single', mode, remote_test_files, remote_out_dir) + if mode == 'slurm': + job.num_nodes = 1 + job.mpi_processes_per_node = 1 + job.extra_scheduler_options = '--ntasks-per-core=1 --nodelist=muscle3-node-0' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + output = _get_stdout(remote_out_dir, 'single', mode, 'c1') + + if mode == 'local': + assert output == 'muscle3-headnode\n' + else: + assert output == 'muscle3-node-0\n' + + +@skip_unless_cluster +@pytest.mark.parametrize('mode', ['local', 'slurm']) +def test_dispatch(fake_cluster, remote_test_files, remote_out_dir, mode): + sched = _sched(fake_cluster, mode) + + job = _make_job('dispatch', mode, remote_test_files, remote_out_dir) + if mode == 'slurm': + job.num_nodes = 1 + job.mpi_processes_per_node = 1 + job.extra_scheduler_options = '--ntasks-per-core=1 --nodelist=muscle3-node-1' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + c1_out = 
_get_stdout(remote_out_dir, 'dispatch', mode, 'c1') + c2_out = _get_stdout(remote_out_dir, 'dispatch', mode, 'c2') + if mode == 'local': + assert c1_out == 'muscle3-headnode\n' + assert c2_out == 'muscle3-headnode\n' + else: + assert c1_out == 'muscle3-node-1\n' + assert c2_out == 'muscle3-node-1\n' + + +@skip_unless_cluster +@pytest.mark.parametrize('mode', ['local']) +# SLURM mode is not implemented yet +def test_multiple(fake_cluster, remote_test_files, remote_out_dir, mode): + sched = _sched(fake_cluster, mode) + + job = _make_job('multiple', mode, remote_test_files, remote_out_dir) + if mode == 'slurm': + job.num_nodes = 3 + job.extra_scheduler_options = '--nodelist=muscle3-node-[0-2]' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + for i in range(1, 7): + if mode == 'local': + assert _get_stdout( + remote_out_dir, 'multiple', mode, f'c{i}') == 'muscle3-headnode\n' + else: + out = _get_stdout(remote_out_dir, 'multiple', mode, f'c{i}') + assert out == f'muscle3-node-{(i - 1) // 2}\n' + + +@skip_unless_cluster +@pytest.mark.parametrize('mode', ['local', 'slurm']) +def test_double_mpi(fake_cluster, remote_test_files, remote_out_dir, mode): + sched = _sched(fake_cluster, mode) + + job = _make_job('double_mpi', mode, remote_test_files, remote_out_dir) + if mode == 'slurm': + job.num_nodes = 2 + job.extra_scheduler_options = '--nodelist=muscle3-node-[3-4]' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + for i in range(1, 3): + for rank in range(2): + output = _get_outfile(remote_out_dir, 'double_mpi', mode, f'c{i}', rank) + if mode == 'local': + assert output == 'muscle3-headnode\n' + else: + assert output == f'muscle3-node-{i + 2}\n' diff --git a/integration_test/conftest.py b/integration_test/conftest.py index 78ac48e5..18ab5ce4 100644 --- a/integration_test/conftest.py +++ b/integration_test/conftest.py @@ -28,10 +28,6 @@ 'MUSCLE_ENABLE_CPP_MPI' not in os.environ, reason='MPI support was not detected') -skip_unless_cluster = pytest.mark.skipif( - 'MUSCLE_TEST_CLUSTER' not in os.environ, - reason='Cluster tests were not explicitly enabled') - @pytest.fixture def yatiml_log_warning(): diff --git a/integration_test/test_cluster.py b/integration_test/test_cluster.py deleted file mode 100644 index 7cf06112..00000000 --- a/integration_test/test_cluster.py +++ /dev/null @@ -1,207 +0,0 @@ -# This ensures that pytest can import this module in the non-cluster test env -# in which these dependencies don't exist, because these tests won' be run. 
-try: - import cerulean -except ImportError: - pass - -import logging -from pathlib import Path -import pytest -import time - -from .conftest import skip_unless_cluster - - -logger = logging.getLogger(__name__) - - -def _run(term, timeout, command): - exit_code, out, err = term.run(timeout, command, []) - if exit_code != 0: - logger.error(err) - assert exit_code == 0 - return out - - -@pytest.fixture(scope='session') -def local_term(): - return cerulean.LocalTerminal() - - -@pytest.fixture(scope='session') -def local_fs(): - return cerulean.LocalFileSystem() - - -@pytest.fixture(scope='session') -def virtual_cluster_image(local_term): - IMAGE_NAME = 'muscle3_test_cluster' - _run(local_term, 180, ( - f'docker buildx build -t {IMAGE_NAME}' - ' -f integration_test/test_cluster.Dockerfile .')) - return IMAGE_NAME - - -def _ssh_term(timeout_msg): - cred = cerulean.PasswordCredential('cerulean', 'kingfisher') - ready = False - start = time.monotonic() - while not ready: - if (time.monotonic() - start) > 60.0: - raise Exception(timeout_msg) - - try: - term = cerulean.SshTerminal('localhost', 10022, cred) - ready = True - except Exception: - time.sleep(3.0) - - return term - - -@pytest.fixture(scope='session') -def virtual_cluster_container(local_term, virtual_cluster_image): - # clean up stray container from previous run, if any - _run(local_term, 60, 'docker rm -f muscle3_test_slurm') - - _run(local_term, 60, ( - 'docker run -d --name muscle3_test_slurm -p 10022:22' - f' {virtual_cluster_image}')) - - _ssh_term('Virtual cluster container start timed out') - yield None - - # _run(local_term, 60, 'docker rm -f muscle3_test_slurm') - - -@pytest.fixture(scope='session') -def setup_connection(virtual_cluster_container): - # Session-wide connection used for container setup actions only - # Tests each have their own connection, see virtual_cluster() below - term = _ssh_term('Connection to virtual cluster container timed out') - with cerulean.SftpFileSystem(term, True) as fs: - yield term, fs - - -@pytest.fixture(scope='session') -def repo_root(local_fs): - root_dir = Path(__file__).parents[1] - return local_fs / str(root_dir) - - -@pytest.fixture(scope='session') -def muscle3_venv(repo_root, setup_connection): - remote_term, remote_fs = setup_connection - - _run(remote_term, 10, 'python3 -m venv /home/cerulean/venv') - in_venv = 'source /home/cerulean/venv/bin/activate && ' - _run(remote_term, 30, ( - f'/bin/bash -c "{in_venv} python3 -m pip install pip wheel setuptools"')) - - muscle3_tgt = remote_fs / 'home/cerulean/muscle3' - muscle3_tgt.mkdir() - (muscle3_tgt / 'libmuscle').mkdir() - - for f in ( - 'muscle3', 'libmuscle/python', 'setup.py', 'MANIFEST.in', 'LICENSE', - 'NOTICE', 'VERSION', 'README.rst'): - cerulean.copy(repo_root / f, muscle3_tgt / f) - - _run(remote_term, 60, f'/bin/bash -c "{in_venv} pip install ./muscle3"') - return in_venv - - -@pytest.fixture(scope='session') -def create_remote_test_files(repo_root, setup_connection): - remote_term, remote_fs = setup_connection - - remote_home = remote_fs / 'home' / 'cerulean' - - cerulean.copy( - repo_root / 'integration_test' / 'cluster_test', remote_home, - copy_permissions=True) - - -@pytest.fixture -def virtual_cluster(virtual_cluster_container, muscle3_venv, create_remote_test_files): - term = _ssh_term('Connection to vitrual cluster container timed out') - with cerulean.SftpFileSystem(term, True) as fs: - sched = cerulean.SlurmScheduler(term) - yield term, fs, sched - - -@pytest.fixture -def remote_home(virtual_cluster): - _, remote_fs, 
_ = virtual_cluster - return remote_fs / 'home' / 'cerulean' - - -@pytest.fixture -def remote_test_files(remote_home): - return remote_home / 'cluster_test' - - -@pytest.fixture -def remote_out_dir(remote_home): - return remote_home / 'test_results' - - -def _make_job(name, remote_test_files, remote_out_dir): - job_dir = remote_out_dir / f'test_{name}' - - job = cerulean.JobDescription() - job.name = name - job.working_directory = job_dir - job.command = remote_test_files / f'{name}.sh' - job.stdout_file = job_dir / 'stdout.txt' - job.stderr_file = job_dir / 'stderr.txt' - job.queue_name = 'debug' - job.time_reserved = 60 - job.system_out_file = job_dir / 'sysout.txt' - job.system_err_file = job_dir / 'syserr.txt' - - return job - - -_SCHED_OVERHEAD = 60 - - -@skip_unless_cluster -def test_single(virtual_cluster, remote_test_files, remote_out_dir): - remote_term, remote_fs, sched = virtual_cluster - - job = _make_job('single', remote_test_files, remote_out_dir) - job.num_nodes = 1 - job.mpi_processes_per_node = 1 - job.extra_scheduler_options = '--ntasks-per-core=1' - - job_id = sched.submit(job) - assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None - assert sched.get_exit_code(job_id) == 0 - - -@skip_unless_cluster -def test_dispatch(virtual_cluster, remote_test_files, remote_out_dir): - remote_term, remote_fs, sched = virtual_cluster - - job = _make_job('dispatch', remote_test_files, remote_out_dir) - job.num_nodes = 2 - job.mpi_processes_per_node = 1 - job.extra_scheduler_options = '--ntasks-per-core=1' - - job_id = sched.submit(job) - assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None - assert sched.get_exit_code(job_id) == 0 - - -@skip_unless_cluster -def test_multiple(virtual_cluster, remote_test_files, remote_out_dir): - remote_term, remote_fs, sched = virtual_cluster - - job = _make_job('multiple', remote_test_files, remote_out_dir) - job.num_nodes = 3 - - job_id = sched.submit(job) - assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None - assert sched.get_exit_code(job_id) == 0 From 0e2a94aff9bc8c8ca4eaf7f42a9433d0180b135e Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 20 Sep 2024 20:48:52 +0200 Subject: [PATCH 10/49] Add MPI C++ component for cluster test --- integration_test/cluster_test/Makefile | 12 +++ integration_test/cluster_test/component.cpp | 73 +++++++++++++++++++ integration_test/cluster_test/double_mpi.sh | 12 +++ .../cluster_test/double_mpi.ymmsl | 25 +++++++ .../cluster_test/implementations.ymmsl | 9 ++- integration_test/cluster_test/single.ymmsl | 2 +- 6 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 integration_test/cluster_test/Makefile create mode 100644 integration_test/cluster_test/component.cpp create mode 100755 integration_test/cluster_test/double_mpi.sh create mode 100644 integration_test/cluster_test/double_mpi.ymmsl diff --git a/integration_test/cluster_test/Makefile b/integration_test/cluster_test/Makefile new file mode 100644 index 00000000..4ef1fd9e --- /dev/null +++ b/integration_test/cluster_test/Makefile @@ -0,0 +1,12 @@ +.PHONY: all +all: component_openmpi + + +CXXFLAGS += $(shell pkg-config --cflags libmuscle_mpi ymmsl) +LDLIBS += $(shell pkg-config --libs libmuscle_mpi ymmsl) + +CXXFLAGS += -g + +component_openmpi: component.cpp + mpic++ -o $@ $(CXXFLAGS) $^ $(LDLIBS) + diff --git a/integration_test/cluster_test/component.cpp b/integration_test/cluster_test/component.cpp new file mode 100644 index 00000000..42b0cb48 --- /dev/null +++ 
b/integration_test/cluster_test/component.cpp @@ -0,0 +1,73 @@ +#include +#include +#include + +#include + +#include "mpi.h" + +#include "libmuscle/libmuscle.hpp" +#include "ymmsl/ymmsl.hpp" + +using std::ofstream; +using std::to_string; + +using libmuscle::Instance; +using libmuscle::Message; +using ymmsl::Operator; + + +/** A simple dummy component. */ +void component(int argc, char * argv[]) { + const int root_rank = 0; + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + char nodeid[1024]; + gethostname(nodeid, sizeof(nodeid)); + + { + ofstream outfile("out_" + to_string(rank) + ".txt"); + outfile << nodeid << std::endl; + } + + Instance instance(argc, argv, { + {Operator::F_INIT, {"init_in"}}, + {Operator::O_I, {"inter_out"}}, + {Operator::S, {"inter_in"}}, + {Operator::O_F, {"final_out"}}}, + MPI_COMM_WORLD, root_rank); + + // outfile << "Starting reuse loop" << std::endl; + while (instance.reuse_instance()) { + // F_INIT + + int64_t steps = instance.get_setting_as("steps"); + + instance.receive("init_in", Message(0.0)); + + for (int step = 0; step < steps; ++step) { + // O_I + if (rank == root_rank) { + instance.send("inter_out", Message(step)); + } + + // S + instance.receive("inter_in", Message(0.0)); + } + + // O_F + if (rank == root_rank) { + instance.send("final_out", Message(steps)); + } + } +} + + +int main(int argc, char * argv[]) { + MPI_Init(&argc, &argv); + component(argc, argv); + MPI_Finalize(); + return EXIT_SUCCESS; +} + diff --git a/integration_test/cluster_test/double_mpi.sh b/integration_test/cluster_test/double_mpi.sh new file mode 100755 index 00000000..1357283b --- /dev/null +++ b/integration_test/cluster_test/double_mpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/double_mpi.ymmsl $CT/settings.ymmsl $CT/implementations.ymmsl + diff --git a/integration_test/cluster_test/double_mpi.ymmsl b/integration_test/cluster_test/double_mpi.ymmsl new file mode 100644 index 00000000..9d04b238 --- /dev/null +++ b/integration_test/cluster_test/double_mpi.ymmsl @@ -0,0 +1,25 @@ +ymmsl_version: v0.1 + +model: + name: double + components: + c1: + ports: + o_i: inter_out + s: inter_in + implementation: component_cpp_openmpi + c2: + ports: + o_i: inter_out + s: inter_in + implementation: component_cpp_openmpi + + conduits: + c1.inter_out: c2.inter_in + c2.inter_out: c1.inter_in + +resources: + c1: + mpi_processes: 2 + c2: + mpi_processes: 2 diff --git a/integration_test/cluster_test/implementations.ymmsl b/integration_test/cluster_test/implementations.ymmsl index c90db7f9..6dab9d57 100644 --- a/integration_test/cluster_test/implementations.ymmsl +++ b/integration_test/cluster_test/implementations.ymmsl @@ -1,8 +1,15 @@ ymmsl_version: v0.1 implementations: - component: + component_python: virtual_env: /home/cerulean/shared/venv executable: python args: - /home/cerulean/shared/cluster_test/component.py + + component_cpp_openmpi: + modules: openmpi + env: + +LD_LIBRARY_PATH: /home/cerulean/shared/muscle3-openmpi/lib + execution_model: openmpi + executable: /home/cerulean/shared/cluster_test/component_openmpi diff --git a/integration_test/cluster_test/single.ymmsl b/integration_test/cluster_test/single.ymmsl index 304579fc..957023f2 100644 --- a/integration_test/cluster_test/single.ymmsl +++ b/integration_test/cluster_test/single.ymmsl @@ -3,7 +3,7 @@ ymmsl_version: v0.1 model: name: single components: - c1: component + c1: 
component_python resources: c1: From f0b676cb37e7930cf5b1c6d02fe00c2c2d339f07 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 20 Sep 2024 20:51:57 +0200 Subject: [PATCH 11/49] Print where we are running so we can test that --- integration_test/cluster_test/component.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/integration_test/cluster_test/component.py b/integration_test/cluster_test/component.py index e14d0523..aa8dd260 100644 --- a/integration_test/cluster_test/component.py +++ b/integration_test/cluster_test/component.py @@ -1,4 +1,5 @@ import logging +import socket from libmuscle import Instance, Message from ymmsl import Operator @@ -10,6 +11,8 @@ def component() -> None: This sends and receives on all operators, allowing different coupling patterns with a single program. """ + print(socket.gethostname()) + instance = Instance({ Operator.F_INIT: ['init_in'], Operator.O_I: ['inter_out'], From 2c72c9e61ab4701523aec867c3e1e602e666c51a Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 20 Sep 2024 20:52:18 +0200 Subject: [PATCH 12/49] Fix dispatch clustert test case --- integration_test/cluster_test/dispatch.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/integration_test/cluster_test/dispatch.sh b/integration_test/cluster_test/dispatch.sh index 10fb1fb9..aef00e66 100755 --- a/integration_test/cluster_test/dispatch.sh +++ b/integration_test/cluster_test/dispatch.sh @@ -1,12 +1,10 @@ #!/bin/bash -#SBATCH --time=0:1:00 -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=2 - set -e -source /home/cerulean/venv/bin/activate +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test -muscle_manager --log-level=DEBUG --start-all /home/cerulean/cluster_test/dispatch.ymmsl /home/cerulean/cluster_test/settings.ymmsl /home/cerulean/cluster_test/implementations.ymmsl +muscle_manager --log-level=DEBUG --start-all $CT/dispatch.ymmsl $CT/settings.ymmsl $CT/implementations.ymmsl From 8b66edc6f8a979f02ea2b11c8d6274d787ba3779 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Mon, 23 Sep 2024 13:40:24 +0200 Subject: [PATCH 13/49] Remove leftover RequestHandler interface from PostOffice --- libmuscle/python/libmuscle/post_office.py | 25 +---------------------- 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/libmuscle/python/libmuscle/post_office.py b/libmuscle/python/libmuscle/post_office.py index e15057fc..59f6c90a 100644 --- a/libmuscle/python/libmuscle/post_office.py +++ b/libmuscle/python/libmuscle/post_office.py @@ -2,15 +2,12 @@ import time from typing import Dict -import msgpack from ymmsl import Reference -from libmuscle.mcp.protocol import RequestType -from libmuscle.mcp.transport_server import RequestHandler from libmuscle.outbox import Outbox -class PostOffice(RequestHandler): +class PostOffice: """A PostOffice is an object that holds messages to be retrieved. A PostOffice holds outboxes with messages for receivers. It also @@ -23,26 +20,6 @@ def __init__(self) -> None: self._outbox_lock = Lock() - def handle_request(self, request: bytes) -> bytes: - """Handle a request. - - This receives an MCP request and handles it by blocking until - the requested message is available, then returning it. - - Args: - request: A received request - - Returns: - An encoded response - """ - req = msgpack.unpackb(request, raw=False) - if len(req) != 2 or req[0] != RequestType.GET_NEXT_MESSAGE.value: - raise RuntimeError( - 'Invalid request type. 
Did the streams get crossed?') - recv_port = Reference(req[1]) - self._ensure_outbox_exists(recv_port) - return self._outboxes[recv_port].retrieve() - def get_message(self, receiver: Reference) -> bytes: """Get a message from a receiver's outbox. From 4d2665a505cf75104d3e301e53a1018c15a44ede Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 17 Oct 2024 09:46:55 +0200 Subject: [PATCH 14/49] Add MUSCLE Agent Protocol --- libmuscle/python/libmuscle/mcp/protocol.py | 24 ++- .../agent/agent_commands.py | 25 +++ .../native_instantiator/agent/map_client.py | 102 +++++++++++ .../native_instantiator/map_server.py | 172 ++++++++++++++++++ libmuscle/python/libmuscle/post_office.py | 9 + 5 files changed, 328 insertions(+), 4 deletions(-) create mode 100644 libmuscle/python/libmuscle/native_instantiator/agent/agent_commands.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/agent/map_client.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/map_server.py diff --git a/libmuscle/python/libmuscle/mcp/protocol.py b/libmuscle/python/libmuscle/mcp/protocol.py index 5d1217ed..b6f662a2 100644 --- a/libmuscle/python/libmuscle/mcp/protocol.py +++ b/libmuscle/python/libmuscle/mcp/protocol.py @@ -8,10 +8,10 @@ class RequestType(Enum): Call protocol in which a request is sent to the server and a response is sent back to the calling client. In MCP, both of these are chunks of bytes. - The MUSCLE Manager Protocol and MUSCLE Peer Protocol define the encoded - messages sent in those chunks, using MsgPack encoding. To distinguish - different kinds of requests, a request type identifier is used, as - represented by this class. + The MUSCLE Manager Protocol, MUSCLE Peer Protocol and MUSCLE Agent Protocol + define the encoded messages sent in those chunks, using MsgPack encoding. + To distinguish different kinds of requests, a request type identifier is + used, as represented by this class. """ # MUSCLE Manager Protocol REGISTER_INSTANCE = 1 @@ -26,6 +26,11 @@ class RequestType(Enum): # MUSCLE Peer Protocol GET_NEXT_MESSAGE = 21 + # MUSCLE Agent Protocol + REPORT_RESOURCES = 41 + GET_COMMAND = 42 + REPORT_RESULT = 43 + class ResponseType(Enum): """Identifier for different types of response @@ -37,3 +42,14 @@ class ResponseType(Enum): SUCCESS = 0 ERROR = 1 PENDING = 2 + + +class AgentCommandType(Enum): + """Identifier for different types of commands + + These are requested from the manager by the agent, and tell it what to do. Part + of the MUSCLE Agent Protocol, used in the response to RequestType.GET_COMMAND. 
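As an illustration of the wire format this implies (a sketch based on the client and server code added in this patch; every message is a MsgPack-encoded list): an agent polling for work sends a GET_COMMAND request and receives either a pending marker or an encoded command.

    from libmuscle.mcp.protocol import AgentCommandType, RequestType, ResponseType

    request = [RequestType.GET_COMMAND.value, 'muscle3-node-0']   # [42, 'muscle3-node-0']
    # Possible responses:
    #   [ResponseType.PENDING.value]                  i.e. [2], nothing to do yet
    #   [ResponseType.SUCCESS.value, packed_command]  where packed_command unpacks to,
    #       for example, [AgentCommandType.SHUTDOWN.value], i.e. [3]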
+ """ + START = 1 + CANCEL_ALL = 2 + SHUTDOWN = 3 diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/agent_commands.py b/libmuscle/python/libmuscle/native_instantiator/agent/agent_commands.py new file mode 100644 index 00000000..56a830d1 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/agent/agent_commands.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + + +class AgentCommand: + pass + + +@dataclass +class StartCommand(AgentCommand): + name: str + work_dir: Path + args: List[str] + env: Dict[str, str] + stdout: Path + stderr: Path + + +class CancelAllCommand(AgentCommand): + pass + + +class ShutdownCommand(AgentCommand): + pass diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/map_client.py b/libmuscle/python/libmuscle/native_instantiator/agent/map_client.py new file mode 100644 index 00000000..d360b0a5 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/agent/map_client.py @@ -0,0 +1,102 @@ +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import msgpack + +from libmuscle.mcp.protocol import AgentCommandType, RequestType, ResponseType +from libmuscle.mcp.tcp_transport_client import TcpTransportClient +from libmuscle.native_instantiator.agent.agent_commands import ( + AgentCommand, StartCommand, CancelAllCommand, ShutdownCommand) + + +class MAPClient: + """The client for the MUSCLE Agent Protocol. + + This class connects to the AgentManager and communicates with it. + """ + def __init__(self, node_id: str, location: str) -> None: + """Create a MAPClient + + Args: + node_id: Id of the local node + location: A connection string of the form hostname:port + """ + self._node_id = node_id + self._transport_client = TcpTransportClient(location) + + def close(self) -> None: + """Close the connection + + This closes the connection. After this no other member functions can be called. + """ + self._transport_client.close() + + def report_resources(self, resources: Dict[str, Any]) -> None: + """Report local resources + + The only key in the dict is currently 'cpu', and it maps to a list of frozensets + of hwthread ids that we can bind to with taskset or in a rankfile. + + Args: + resources: Available resource ids by type + """ + enc_cpu_resources = [ + list(hwthreads) for hwthreads in resources['cpu']] + request = [ + RequestType.REPORT_RESOURCES.value, + self._node_id, {'cpu': enc_cpu_resources}] + self._call_agent_manager(request) + + def get_command(self) -> Optional[AgentCommand]: + """Get a command from the agent manager. + + Returns: + A command, or None if there are no commands pending. + """ + request = [RequestType.GET_COMMAND.value, self._node_id] + response = self._call_agent_manager(request) + + if response[0] == ResponseType.PENDING.value: + return None + else: + command = msgpack.unpackb(response[1], raw=False) + + if command[0] == AgentCommandType.START.value: + name = command[1] + workdir = Path(command[2]) + args = command[3] + env = command[4] + stdout = Path(command[5]) + stderr = Path(command[6]) + + return StartCommand(name, workdir, args, env, stdout, stderr) + + elif command[0] == AgentCommandType.CANCEL_ALL.value: + return CancelAllCommand() + + elif command[0] == AgentCommandType.SHUTDOWN.value: + return ShutdownCommand() + + raise Exception('Unknown AgentCommand') + + def report_result(self, names_exit_codes: List[Tuple[str, int]]) -> None: + """Report results of finished processes. 
+ + Args: + names_exit_codes: A list of names and exit codes of finished processes. + """ + request = [RequestType.REPORT_RESULT.value, names_exit_codes] + self._call_agent_manager(request) + + def _call_agent_manager(self, request: Any) -> Any: + """Call the manager and do en/decoding. + + Args: + request: The request to encode and send + + Returns: + The decoded response + """ + encoded_request = msgpack.packb(request, use_bin_type=True) + response, _ = self._transport_client.call(encoded_request) + return msgpack.unpackb(response, raw=False) diff --git a/libmuscle/python/libmuscle/native_instantiator/map_server.py b/libmuscle/python/libmuscle/native_instantiator/map_server.py new file mode 100644 index 00000000..6ab847c0 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/map_server.py @@ -0,0 +1,172 @@ +import errno +import logging +from typing import Any, Dict, cast, List, Optional + +import msgpack + +from libmuscle.mcp.protocol import AgentCommandType, RequestType, ResponseType +from libmuscle.mcp.tcp_transport_server import TcpTransportServer +from libmuscle.mcp.transport_server import RequestHandler +from libmuscle.native_instantiator.agent.agent_commands import ( + AgentCommand, CancelAllCommand, ShutdownCommand, StartCommand) +from libmuscle.native_instantiator.iagent_manager import IAgentManager +from libmuscle.post_office import PostOffice + +from ymmsl import Reference + + +_logger = logging.getLogger(__name__) + + +class MAPRequestHandler(RequestHandler): + """Handles Agent requests.""" + def __init__(self, agent_manager: IAgentManager, post_office: PostOffice) -> None: + """Create a MAPRequestHandler. + + Args: + agent_manager: The AgentManager to forward reports to + post_office: The PostOffice to get commands from + """ + self._agent_manager = agent_manager + self._post_office = post_office + + def handle_request(self, request: bytes) -> bytes: + """Handles an agent request. + + Args: + request: The encoded request + + Returns: + response: An encoded response + """ + req_list = msgpack.unpackb(request, raw=False) + req_type = req_list[0] + req_args = req_list[1:] + if req_type == RequestType.REPORT_RESOURCES.value: + response = self._report_resources(*req_args) + elif req_type == RequestType.GET_COMMAND.value: + response = self._get_command(*req_args) + elif req_type == RequestType.REPORT_RESULT.value: + response = self._report_result(*req_args) + + return cast(bytes, msgpack.packb(response, use_bin_type=True)) + + def _report_resources( + self, node_id: str, resources: Dict[str, Any]) -> Any: + """Handle a report resources request. + + This is used by the agent to report available resources on its node when + it starts up. + + Args: + node_id: Hostname (id) of the node + resources: Resource dictionary, containing a single key 'cpu' which + maps to a list of lists of hwthread ids representing cores. + """ + dec_cpu_resources = [frozenset(hwthreads) for hwthreads in resources['cpu']] + self._agent_manager.report_resources(node_id, {'cpu': dec_cpu_resources}) + return [ResponseType.SUCCESS.value] + + def _get_command(self, node_id: str) -> Any: + """Handle a get command request. + + This is used by the agent to ask if there's anything we would like it to do. + Command sounds a bit brusque, but we already have the agent sending requests + to this handler, so I needed a different word to distinguish them. 
Requests + are sent by the agent to the manager (because it's the client in an RPC setup), + commands are returned by the manager to the agent (because it tells it what to + do). + + Args: + node_id: Hostname (id) of the agent's node + """ + node_ref = Reference(node_id.replace('-', '_')) + next_request: Optional[bytes] = None + if self._post_office.have_message(node_ref): + next_request = self._post_office.get_message(node_ref) + + if next_request is not None: + return [ResponseType.SUCCESS.value, next_request] + + return [ResponseType.PENDING.value] + + def _report_result(self, instances: List[List[Any]]) -> Any: + """Handle a report result rquest. + + This is sent by the agent if an instance it launched exited. + + Args: + instances: List of instance descriptions, comprising an id str and exit + code int. Really a List[Tuple[str, int]] but msgpack doesn't know + about tuples. + """ + self._agent_manager.report_result(list(map(tuple, instances))) + return [ResponseType.SUCCESS.value] + + +class MAPServer: + """The MUSCLE Agent Protocol server. + + This class accepts connections from the agents and services them using a + MAPRequestHandler. + """ + def __init__(self, agent_manager: IAgentManager) -> None: + """Create a MAPServer. + + This starts a TCP Transport server and connects it to a MAPRequestHandler, + which uses the given agent manager to service the requests. By default, we + listen on port 9009, unless it's not available in which case we use a random + other one. + + Args: + agent_manager: AgentManager to forward requests to + """ + self._post_office = PostOffice() + self._handler = MAPRequestHandler(agent_manager, self._post_office) + try: + self._server = TcpTransportServer(self._handler, 9009) + except OSError as e: + if e.errno != errno.EADDRINUSE: + raise + self._server = TcpTransportServer(self._handler) + + def get_location(self) -> str: + """Return this server's network location. + + This is a string of the form tcp::. + """ + return self._server.get_location() + + def stop(self) -> None: + """Stop the server. + + This makes the server stop serving requests, and shuts down its + background threads. + """ + self._server.close() + + def deposit_command(self, node_id: str, command: AgentCommand) -> None: + """Deposit a command for the given agent. + + This takes the given command and queues it for the given agent to pick up next + time it asks us for one. + + Args: + node_id: Id of the node whose agent should execute the command + command: The command to send + """ + agent = Reference(node_id.replace('-', '_')) + + if isinstance(command, StartCommand): + command_obj = [ + AgentCommandType.START.value, command.name, str(command.work_dir), + command.args, command.env, str(command.stdout), str(command.stderr) + ] + elif isinstance(command, CancelAllCommand): + command_obj = [AgentCommandType.CANCEL_ALL.value] + elif isinstance(command, ShutdownCommand): + command_obj = [AgentCommandType.SHUTDOWN.value] + + encoded_command = cast(bytes, msgpack.packb(command_obj, use_bin_type=True)) + + self._post_office.deposit(agent, encoded_command) diff --git a/libmuscle/python/libmuscle/post_office.py b/libmuscle/python/libmuscle/post_office.py index 59f6c90a..2ec2056c 100644 --- a/libmuscle/python/libmuscle/post_office.py +++ b/libmuscle/python/libmuscle/post_office.py @@ -20,6 +20,15 @@ def __init__(self) -> None: self._outbox_lock = Lock() + def have_message(self, receiver: Reference) -> bool: + """Return whether there's a message for the given receiver. 
+ + Args: + receiver: The receiver of the message. + """ + self._ensure_outbox_exists(receiver) + return not self._outboxes[receiver].is_empty() + def get_message(self, receiver: Reference) -> bytes: """Get a message from a receiver's outbox. From b10948f66dafab89bc4fcfa232214dc2f1b43827 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 17 Oct 2024 09:59:23 +0200 Subject: [PATCH 15/49] Use sets of hwthreads to designate CPU resources This commit does not pass tests, you need the next one as well. I split them up because it was getting very big, and I didn't clean it up because this has taken enough time already. --- integration_test/fake_cluster/slurm.conf | 20 +- .../python/libmuscle/manager/profile_store.py | 4 +- .../libmuscle/manager/qcgpj_instantiator.py | 3 +- .../manager/test/test_profile_database.py | 10 +- .../native_instantiator.py | 281 ++++++++++++++++-- .../native_instantiator/run_script.py | 76 +++-- libmuscle/python/libmuscle/planner/planner.py | 27 +- .../libmuscle/planner/test/test_planner.py | 128 ++++---- libmuscle/python/libmuscle/test/conftest.py | 9 + muscle3/muscle3.py | 4 +- 10 files changed, 430 insertions(+), 132 deletions(-) diff --git a/integration_test/fake_cluster/slurm.conf b/integration_test/fake_cluster/slurm.conf index 1959f614..647b5315 100644 --- a/integration_test/fake_cluster/slurm.conf +++ b/integration_test/fake_cluster/slurm.conf @@ -60,7 +60,7 @@ SlurmdUser=root StateSaveLocation=/var/spool/slurmctld/state SwitchType=switch/none #TaskEpilog= -TaskPlugin=task/none +TaskPlugin=task/cgroup #TaskPluginParam= #TaskProlog= #TopologyPlugin=topology/tree @@ -98,8 +98,8 @@ Waittime=0 SchedulerTimeSlice=5 SchedulerType=sched/backfill SchedulerParameters=bf_interval=1,bf_resolution=1,sched_interval=1 -SelectType=select/linear -#SelectTypeParameters= +SelectType=select/cons_tres +SelectTypeParameters=CR_Core # # # JOB PRIORITY @@ -133,9 +133,9 @@ JobCompType=jobcomp/filetxt #JobCompUser=root JobAcctGatherFrequency=2 JobAcctGatherType=jobacct_gather/linux -SlurmctldDebug=3 +SlurmctldDebug=debug5 #SlurmctldLogFile= -SlurmdDebug=3 +SlurmdDebug=debug3 SlurmdLogFile=/var/log/slurm/slurmd.%n.log #SlurmSchedLogFile= #SlurmSchedLogLevel= @@ -154,10 +154,10 @@ SlurmdLogFile=/var/log/slurm/slurmd.%n.log # # # COMPUTE NODES -NodeName=muscle3-node-0 Procs=2 NodeAddr=muscle3-node-0 Port=17001 State=UNKNOWN -NodeName=muscle3-node-1 Procs=2 NodeAddr=muscle3-node-1 Port=17002 State=UNKNOWN -NodeName=muscle3-node-2 Procs=2 NodeAddr=muscle3-node-2 Port=17003 State=UNKNOWN -NodeName=muscle3-node-3 Procs=2 NodeAddr=muscle3-node-3 Port=17004 State=UNKNOWN -NodeName=muscle3-node-4 Procs=2 NodeAddr=muscle3-node-4 Port=17005 State=UNKNOWN +NodeName=muscle3-node-0 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 NodeAddr=muscle3-node-0 Port=17001 State=UNKNOWN +NodeName=muscle3-node-1 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 NodeAddr=muscle3-node-1 Port=17002 State=UNKNOWN +NodeName=muscle3-node-2 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 NodeAddr=muscle3-node-2 Port=17003 State=UNKNOWN +NodeName=muscle3-node-3 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 NodeAddr=muscle3-node-3 Port=17004 State=UNKNOWN +NodeName=muscle3-node-4 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 NodeAddr=muscle3-node-4 Port=17005 State=UNKNOWN PartitionName=debug Nodes=muscle3-node-[0-4] Default=YES MaxTime=INFINITE State=UP PartitionName=batch Nodes=muscle3-node-[0-2] Default=NO MaxTime=INFINITE State=UP diff --git a/libmuscle/python/libmuscle/manager/profile_store.py 
b/libmuscle/python/libmuscle/manager/profile_store.py index 036dea85..0fba694e 100644 --- a/libmuscle/python/libmuscle/manager/profile_store.py +++ b/libmuscle/python/libmuscle/manager/profile_store.py @@ -90,9 +90,9 @@ def store_resources(self, resources: Dict[Reference, Resources]) -> None: instance_oid = self._get_instance_oid(cur, instance_id) tuples = [ - (instance_oid, node, core) + (instance_oid, node, hwthread) for node, cores in res.cores.items() - for core in cores] + for core in cores for hwthread in core] cur.executemany( "INSERT INTO assigned_cores (instance_oid, node, core)" diff --git a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py index ae58089b..9130779f 100644 --- a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py +++ b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py @@ -200,7 +200,8 @@ def _send_resources(self) -> None: """Converts and sends QCG available resources.""" resources = Resources() for node in self._qcg_resources.nodes: - resources.cores[node.name] = {int(n.split(',')[0]) for n in node.free_ids} + resources.cores[node.name] = { + frozenset(n.split(',')) for n in node.free_ids} self._resources_out.put(resources) diff --git a/libmuscle/python/libmuscle/manager/test/test_profile_database.py b/libmuscle/python/libmuscle/manager/test/test_profile_database.py index 2d6d472c..33bbb9dd 100644 --- a/libmuscle/python/libmuscle/manager/test/test_profile_database.py +++ b/libmuscle/python/libmuscle/manager/test/test_profile_database.py @@ -8,6 +8,8 @@ from ymmsl import Operator, Port, Reference +from libmuscle.test.conftest import frozenset_of as s + import pytest from pathlib import Path @@ -22,12 +24,12 @@ def db_file(tmp_path) -> Path: store.store_instances([Reference('instance1'), Reference('instance2')]) resources1 = Resources({ - 'node001': {0, 1}, - 'node002': {0, 1}}) + 'node001': {s(0), s(1)}, + 'node002': {s(0), s(1)}}) resources2 = Resources({ - 'node001': {0}, - 'node002': {0, 1, 2}}) + 'node001': {s(0)}, + 'node002': {s(0), s(1), s(2)}}) store.store_resources({ Reference('instance1'): resources1, diff --git a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py index d34d5482..391d89fe 100644 --- a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py +++ b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py @@ -1,3 +1,191 @@ +"""Module for examining resources and instantiating instances on them + +There's a huge comment here because there's a big mess here that took me forever to +figure out, so now I'm going to document it for the future. + + +Identifying hardware resources + +Today's computers all contain multi-core CPUs, often with symmetric multithreading +(SMT), also known as hyperthreading. This means that we have hardware threads +(hwthreads) and also cores, and then there's caches and memory as well but we're not +going into NUMA here. + +Cores and hwthreads are identified by number, but they have multiple different numbers +that are referred to by different names in different contexts, making everything very +confusing. So here are some definitions to disambiguate things. Note that this is still +a rather simplified representation, but it's enough for what we're doing here in +MUSCLE3. + + +Hardware threads + +A *hardware thread (hwthread)* is, at the hardware level, an instruction decoder. 
It +points to wherever in the code we are currently executing, and it can read the next +couple of instructions and figure out how to execute them. It can't actually execute +anything however, because it doesn't have the hardware that does that. + +Intel refers to hwthreads as "logical processors" and so does Linux, hwloc calls them +"processing units" or PUs and so does OpenMPI unless it uses the term hwthread just to +confuse things a bit more. + +Cores + +A *core* contains at least one hwthread, and at least one functional unit, which is a +hardware component that actually does calculations and other data processing. Within a +core, the hwthread(s) read instructions and pass them to the functional units to be +executed. If a core has more than one hwthread, then the CPU supports SMT. + +Intel refers to cores as "physical processors", hwloc calls them cores and so do most +other sources. We'll use cores here. + +Since a hwthread cannot do anything on its own, it's always part of a core. + +CPUs + +The term CPU is used in many ways by various bits of documentation, sometimes referring +to a hwthread or a core, but here we'll take it to mean a collection of cores in a +plastic box. Similar terms are *package* (referring to that plastic box with very many +metal pins) and *socket* (the thing the package mounts into), or *processor*, which was +originally used to refer to all of the above when CPUs still had only one core with only +one hwthread, and has now become ambiguous. + +Weird things can happen here, I've seen CPUs that as far as I can tell are a single +package, but nevertheless claim to have two sockets. I suspect that that's two physical +chips in a single plastic box, but I don't know for sure. + +Here, we're concerned with hwthreads and cores and how to identify them and assign +instances to them. + + +Linux + +On modern operating systems, hardware access is mediated by the operating system, and +we're mainly concerned with Linux here because that is what all the clusters are running +(see the note on macOS below). Information about the CPU(s) can be obtained on Linux +from the /proc/cpuinfo file, or equivalently but more modernly, from the files in +/sys/devices/system/cpu/cpu/topology/. + +Linux collects information about processors because it needs to run processes (programs, +software threads) on them on behalf of the user. Processes are assigned to hwthreads, so +that is what Linux considers a *processor*. /proc/cpuinfo lists all these processors, +and they each have their own directory /sys/devices/system/cpu/cpu. + +On Linux, processors have an id, which is that number in the directory, and is +listed under "processor" in /proc/cpuinfo. Since this number identifies a hwthread and +is assigned by Linux rather than being baked into the hardware, I'm calling it a +"logical hwthread id", this being a logical id of a hwthread, not an id of a logical +hwthread. It's also the id of a logical processor in Intel-speak. + +Hwthreads actually have a second number associated with them, which does come from the +hardware. In /proc/cpuinfo, that's listed under "apicid"; it doesn't seem to be +available from sysfs. Hwloc call this the "physical PU (its name for a hwthread) id", +and OpenMPI's mpirun manpage also refers to it as a "physical processor location". + +There's great potential for confusion here: the "physical PU id" and "physical processor +location" both identify a hardware-specified number (a physical id or a physical +location) for a hwthread. 
This is something completely different than what Intel calls a +"physical processor", which they use to refer to a core. + +MUSCLE3 uses logical hwthread ids everywhere, it does not use physical ids. + +Linux knows about how hwthreads are grouped into bigger things of course. Cores are +identified in Linux using the "core id", which is listed in /proc/cpuinfo and in +/sys/devices/system/cpu/cpu/topology/core_id. So for each hwthread, identified by its +logical id, we can look up which core it is a part of. The core id is a logical id, +assigned by Linux, not by the hardware. While logical hwthread ids seem to always be +consecutive at least on the hardware I've seen so far, core ids may have gaps. + +MUSCLE3 does not use core ids, although it uses groups of hwthread ids that contain all +the hwthreads for a given core. + + +Resource binding + +Running processes need something to run on, a hwthread. The assignment of process to +hwthread is done by the operating system's scheduler: when a process is ready to run, +the scheduler will try to find it a free hwthread to run on. + +The scheduler can be constrained in which hwthreads it considers for a given process, +which is known as binding the process. This may have performance benefits, because +moving a process from one hwthread to another takes time. In MUSCLE3, when running on a +cluster, each process is assigned its own specific set of hwthreads to run on, and we +try to bind the instance to the assigned hwthreads. + +Taskset + +How this is done depends on how the instance is started. For non-MPI instances, we use a +Linux utility named 'taskset' that starts another program with a giving binding. The +binding is expressed as an *affinity mask*, a string of bits that say whether a given +processor (hwthread) can be used by the process or not. Each position in the string of +bits corresponds to the hwthread with that logical id. + +OpenMPI + +OpenMPI can bind cores in various ways, we use a rankfile and the --use-hwthread-cpus +option to specify the logical hwthread ids we want to bind each MPI process (rank) to. +Note that OpenMPI by default binds to cores, and can also bind to various other things +including sockets. + +MPICH + +MPICH doesn't support binding, as far as I can see. + +Intel MPI + +Intel MPI uses logical hwthread ids-based masks, specified in an environment variable, +to go with a machinefile that lists the nodes to put each process on. + +Slurm srun + +Slurm's srun has a CPU_BIND environment variable that likewise contains logical hwthread +ids-based masks, and a hostfile that lists the nodes to put each process on. 
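+
+To make the binding mechanics concrete, here is a rough sketch, not the actual
+implementation (see run_script.py for that), of how a set of logical hwthread ids
+assigned to an instance turns into a taskset affinity mask and an OpenMPI rankfile
+line. The ids and node name below are made up:
+
+```
+# say hwthreads 2 and 3 on node001 were assigned to the instance
+hwthreads = [2, 3]
+
+# taskset wants a hexadecimal affinity mask with bit i set for hwthread i
+mask = sum(1 << i for i in hwthreads)     # 0b1100
+print(format(mask, 'X'))                  # 'C', as in: taskset 0xC <command>
+
+# OpenMPI with --use-hwthread-cpus takes a rankfile with hwthread ids per rank
+print('rank 0=node001 slot=' + ','.join(map(str, hwthreads)))
+# prints: rank 0=node001 slot=2,3
+```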
+ +Here are some disambiguation tables to help with the confusion: + + +``` +MUSCLE3 hwthread logical hwthread id physical hwthread id + +Linux processor processor apicid + (/proc/cpuinfo only) + +cgroups always uses these + +taskset always uses these + +hwloc PU PU L# PU P# + +OpenMPI hwthread used in rankfile if used in rankfile if + --use-hwthread-cpus rmaps_rank_file_physical + is specified MCA param set + +Intel logical logical processor + processor number + +srun used by --bind-to + +psutil logical returned by Process.cpu_affinity() + core counted by psutil.cpu_count(logical=True) +``` + + +``` +MUSCLE3 core (uses list of hwthread ids) + +Linux core core id + +Hwloc core core L# + +OpenMPI core used in rankfile if + --use-hwthread-cpus not + specified + +psutil physical counted by psutil.cpu_count(logical=False) + core +``` + +""" import logging import multiprocessing as mp from os import chdir @@ -11,8 +199,8 @@ from libmuscle.manager.instantiator import ( CancelAllRequest, CrashedResult, create_instance_env, InstantiationRequest, Process, ProcessStatus, reconfigure_logging, ShutdownRequest) -from libmuscle.native_instantiator.process_manager import ProcessManager -from libmuscle.native_instantiator.resource_detector import ResourceDetector +from libmuscle.native_instantiator.agent_manager import AgentManager +from libmuscle.native_instantiator.global_resources import global_resources from libmuscle.native_instantiator.run_script import make_script, prep_resources from libmuscle.planner.planner import Resources from ymmsl import MPICoresResReq, MPINodesResReq, ResourceRequirements, ThreadedResReq @@ -42,8 +230,6 @@ def __init__( self._log_records_out = log_records self._run_dir = run_dir - self._resource_detector = ResourceDetector() - self._process_manager = ProcessManager() self._processes: Dict[str, Process] = dict() def run(self) -> None: @@ -53,6 +239,8 @@ def run(self) -> None: m3_dir.mkdir(exist_ok=True) chdir(m3_dir) + self._agent_manager = AgentManager(m3_dir) + reconfigure_logging(self._log_records_out) self._send_resources() self._main() @@ -81,10 +269,10 @@ def _main(self) -> None: elif isinstance(request, CancelAllRequest): _logger.debug('Got CancelAllRequest') - self._process_manager.cancel_all() - _logger.debug('Done CancelAllRequest') + self._agent_manager.cancel_all() elif isinstance(request, InstantiationRequest): + _logger.debug('Got InstantiationRequest') if not shutting_down: self._instantiate(request) @@ -95,19 +283,65 @@ def _main(self) -> None: self._report_finished_processes() if shutting_down: - _logger.debug(f'Done: {self._processes}') + _logger.debug(f'Remaining processes: {self._processes}') done = not self._processes if not done: sleep(0.1) + self._agent_manager.shutdown() + def _send_resources(self) -> None: - """Detect resources and report them to the manager.""" + """Detect resources and report them to the manager. + + We have potentially two sources of truth here: the Slurm environment variables + and what the agents report based on what they're bound to. These should be + consistent, but we check that and then try to be conservative to try to not + step outside our bounds even if the cluster doesn't constrain processes to their + assigned processors. 
+ """ resources = Resources() - res = zip(self._resource_detector.nodes, self._resource_detector.cores_per_node) - for node, num_cores in res: - resources.cores[node] = set(range(num_cores)) + agent_cores = self._agent_manager.get_resources() + + env_ncores = dict( + zip(global_resources.nodes, global_resources.cores_per_node) + ) + + for node in env_ncores: + if node not in agent_cores: + _logger.warning( + f'The environment suggests we should have node {node},' + ' but no agent reported running on it. We won''t be able' + ' to use this node.') + else: + resources.cores[node] = set(agent_cores[node]) + + env_nncores = env_ncores[node] + ag_nncores = len(agent_cores[node]) + if ag_nncores < env_nncores: + _logger.warning( + f'Node {node} should have {env_nncores} cores available,' + f' but the agent reports only {ag_nncores} available to it.' + f' We\'ll use the {ag_nncores} we seem to have.') + + resources.cores[node] = set(agent_cores[node]) + + elif env_nncores < ag_nncores: + _logger.warning( + f'Node {node} should have {env_nncores} cores available,' + f' but the agent reports {ag_nncores} available to it.' + ' Maybe the cluster does not constrain resources? We\'ll' + f' use the {env_nncores} that we should have got.') + resources.cores[node] = set(agent_cores[node][:env_nncores]) + + for node in agent_cores: + if node not in env_ncores: + _logger.warning( + f'An agent is running on node {node} but the environment' + ' does not list it as ours. It seems that the node\'s' + ' hostname does not match what SLURM calls it. We will not use' + ' this node, because we\'re not sure it\'s really ours.') self._resources_out.put(resources) @@ -119,37 +353,31 @@ def _instantiate(self, request: InstantiationRequest) -> None: self._add_resources(env, request.res_req) rankfile: Optional[Path] = None - if self._resource_detector.on_cluster(): - _logger.debug('On cluster...') + if global_resources.on_cluster(): rankfile_contents, resource_env = prep_resources( request.implementation.execution_model, request.resources) - _logger.debug(f'Rankfile: {rankfile_contents}') - _logger.debug(f'Resource env: {resource_env}') - if rankfile_contents: rankfile = self._write_rankfile(request, rankfile_contents) + env['MUSCLE_RANKFILE'] = str(rankfile) - if resource_env: - env.update(resource_env) - - # env['MUSCLE_THREADS_PER_MPI_PROCESS'] = str( - # request.res_req.threads_per_mpi_process) - # env['MUSCLE_OPENMPI_RANK_FILE'] = str(rank_file) - # env['MUSCLE_INTELMPI_RESOURCES'] = ' '.join(mpi_res_args) + env.update(resource_env) run_script_file = self._write_run_script(request, rankfile) args = [str(run_script_file)] self._processes[name] = Process(request.instance, request.resources) + _logger.debug(f'Instantiating {name} on {request.resources}') try: - self._process_manager.start( + self._agent_manager.start( + next(iter(request.resources.cores.keys())), name, request.work_dir, args, env, request.stdout_path, request.stderr_path) self._processes[name].status = ProcessStatus.RUNNING except Exception as e: + _logger.warning(f'Instance {name} failed to start: {e}') self._processes[name].status = ProcessStatus.ERROR self._processes[name].error_msg = f'Instance failed to start: {e}' @@ -168,12 +396,13 @@ def _write_rankfile(self, request: InstantiationRequest, rankfile: str) -> Path: def _write_run_script( self, request: InstantiationRequest, rankfile: Optional[Path]) -> Path: """Create and write out the run script and return its location.""" + # TODO: Only write out once for each implementation if 
request.implementation.script: run_script = request.implementation.script else: run_script = make_script( request.implementation, request.res_req, - not self._resource_detector.on_cluster(), rankfile) + not global_resources.on_cluster(), rankfile) run_script_file = request.instance_dir / 'run_script.sh' @@ -217,7 +446,7 @@ def _report_failed_processes(self) -> None: def _report_finished_processes(self) -> None: """Get finished processes and report back their status.""" - for name, exit_code in self._process_manager.get_finished(): + for name, exit_code in self._agent_manager.get_finished(): process = self._processes[name] if process.status == ProcessStatus.RUNNING: if exit_code == 0: diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py index 62aa7f77..1c615823 100644 --- a/libmuscle/python/libmuscle/native_instantiator/run_script.py +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -7,6 +7,32 @@ ResourceRequirements, ThreadedResReq) +def direct_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: + """Create resources for a non-MPI program with taskset. + + Taskset expects a set of cores on the command line, which we put into a + MUSCLE_CORES environment variable here. + + Args: + resources: The resources to describe + + Return: + No rank file, and a set of environment variables. + """ + env: Dict[str, str] = dict() + only_node_hwthreads_list = [ + hwthread + for core in next(iter(resources.cores.values())) + for hwthread in core] + + env['MUSCLE_BIND_LIST'] = ','.join(map(str, only_node_hwthreads_list)) + + mask_int = sum((1 << c for c in only_node_hwthreads_list)) + env['MUSCLE_BIND_MASK'] = format(mask_int, 'X') + + return '', env + + def openmpi_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: """Create resource description for OpenMPI mpirun @@ -18,10 +44,12 @@ def openmpi_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: """ ranklines: List[str] = list() all_cores = ( - (node, core) for node, cores in resources.cores.items() for core in cores) + (node, ','.join(sorted(map(str, hwthreads)))) + for node, cores in resources.cores.items() + for hwthreads in cores) - for i, (node, core) in enumerate(all_cores): - ranklines.append(f'rank {i}={node} slot={core}') + for i, (node, hwthreads) in enumerate(all_cores): + ranklines.append(f'rank {i}={node} slot={hwthreads}') rankfile = '\n'.join(ranklines) + '\n' @@ -83,7 +111,7 @@ def prep_resources( The contents of the rank/machine/hostfile, and a set of environment variables. """ if model == ExecutionModel.DIRECT: - return '', dict() + return direct_prep_resources(resources) elif model == ExecutionModel.OPENMPI: return openmpi_prep_resources(resources) elif model == ExecutionModel.INTELMPI: @@ -131,11 +159,11 @@ def local_command(implementation: Implementation) -> str: elif implementation.execution_model == ExecutionModel.OPENMPI: # Native name is orterun for older and prterun for newer OpenMPI. # So we go with mpirun, which works for either. 
- fstr = 'mpirun -np {{ntasks}} --oversubscribe {command} {args}' + fstr = 'mpirun -np $MUSCLE_MPI_PROCESSES --oversubscribe {command} {args}' elif implementation.execution_model == ExecutionModel.INTELMPI: - fstr = 'mpirun -n {{ntasks}} {command} {args}' + fstr = 'mpirun -n $MUSCLE_MPI_PROCESSES {command} {args}' elif implementation.execution_model == ExecutionModel.SRUNMPI: - fstr = 'srun -n {{ntasks}} -m arbitrary {command} {args}' + fstr = 'srun -n $MUSCLE_MPI_PROCESSES -m arbitrary {command} {args}' # elif implementation.execution_model == ExecutionModel.MPICH # fstr = 'mpiexec -n {{ntasks}} {command} {args}' @@ -163,26 +191,31 @@ def cluster_command(implementation: Implementation) -> str: implementation: The implementation to start. Return: - A format string with embedded {ntasks} and {rankfile}. + A string with the command to use to start the implementation. """ + # TODO: enable debug options iff the manager log level is set to DEBUG + # TODO: don't use taskset if it's not available if implementation.execution_model == ExecutionModel.DIRECT: - fstr = '{command} {args}' + fstr = 'taskset $MUSCLE_BIND_MASK {command} {args}' elif implementation.execution_model == ExecutionModel.OPENMPI: # Native name is orterun for older and prterun for newer OpenMPI. # So we go with mpirun, which works for either. fstr = ( - 'mpirun -v -np {{ntasks}}' + 'mpirun -v -np $MUSCLE_MPI_PROCESSES' ' -d --debug-daemons' - ' --rankfile {{rankfile}} --oversubscribe' - # ' --map-by rankfile:file={{rankfile}}:oversubscribe' - ' --display-map --display-allocation {command} {args}') - # ' --bind-to core --display-map --display-allocation {command} {args}') + ' --rankfile $MUSCLE_RANKFILE --use-hwthread-cpus --oversubscribe' + # ' --map-by rankfile:file=$MUSCLE_RANKFILE:oversubscribe' + # ' --display-map --display-allocation {command} {args}' + ' --bind-to core --display-map --display-allocation {command} {args}' + ) elif implementation.execution_model == ExecutionModel.INTELMPI: - fstr = 'mpirun -n {{ntasks}} -machinefile {{rankfile}} {command} {args}' + fstr = ( + 'mpirun -n $MUSCLE_MPI_PROCESSES -machinefile $MUSCLE_RANKFILE' + ' {command} {args}') elif implementation.execution_model == ExecutionModel.SRUNMPI: - fstr = 'srun -n {{ntasks}} -m arbitrary {command} {args}' + fstr = 'srun -n $MUSCLE_MPI_PROCESSES -m arbitrary {command} {args}' # elif implementation.execution_model == ExecutionModel.MPICH - # fstr = 'mpiexec -n {{ntasks}} -f {{rankfile}} {command} {args}' + # fstr = 'mpiexec -n $MUSCLE_MPI_PROCESSES -f $MUSCLE_RANKFILE {command} {args}' if implementation.args is None: args = '' @@ -200,7 +233,7 @@ def cluster_command(implementation: Implementation) -> str: def make_script( implementation: Implementation, res_req: ResourceRequirements, local: bool, rankfile: Optional[Path] = None) -> str: - """Make a launch script for a given implementation. + """Make a run script for a given implementation. 
Args: implementation: The implementation to launch @@ -232,12 +265,9 @@ def make_script( lines.append('') if local: - cmd = local_command(implementation) + lines.append(local_command(implementation)) else: - cmd = cluster_command(implementation) - - ntasks = num_mpi_tasks(res_req) - lines.append(cmd.format(ntasks=ntasks, rankfile=rankfile)) + lines.append(cluster_command(implementation)) lines.append('') diff --git a/libmuscle/python/libmuscle/planner/planner.py b/libmuscle/python/libmuscle/planner/planner.py index 47d4b903..2d63828e 100644 --- a/libmuscle/python/libmuscle/planner/planner.py +++ b/libmuscle/python/libmuscle/planner/planner.py @@ -1,6 +1,6 @@ from copy import copy, deepcopy import logging -from typing import Dict, Iterable, List, Mapping, Optional, Set, Tuple +from typing import Dict, Iterable, FrozenSet, List, Mapping, Optional, Set, Tuple from ymmsl import ( Component, Configuration, Model, MPICoresResReq, MPINodesResReq, @@ -391,17 +391,17 @@ class Resources: resources we're talking about. Attributes: - cores: A dictionary mapping designated nodes to designated - cores on them. + cores: A dictionary mapping designated nodes to designated cores on them. Cores + are represented by sets of hwthreads they have. """ - def __init__(self, cores: Optional[Dict[str, Set[int]]] = None) -> None: + def __init__(self, cores: Optional[Dict[str, Set[FrozenSet[int]]]] = None) -> None: """Create a Resources object with the given cores. Args: cores: Cores to be designated by this object. """ if cores is None: - self.cores: Dict[str, Set[int]] = {} + self.cores: Dict[str, Set[FrozenSet[int]]] = {} else: self.cores = cores @@ -444,22 +444,22 @@ def __isub__(self, other: 'Resources') -> 'Resources': def __str__(self) -> str: """Return a human-readable string representation.""" - def collapse_ranges(cores: Set[int]) -> str: + def collapse_ranges(cores: Set[FrozenSet[int]]) -> str: if len(cores) == 0: return '' result = list() - scores = sorted(cores) + hwthreads = sorted((hwthread for core in cores for hwthread in core)) start = 0 i = 1 - while i <= len(scores): - if (i == len(scores)) or (scores[i-1] != scores[i] - 1): + while i <= len(hwthreads): + if (i == len(hwthreads)) or (hwthreads[i-1] != hwthreads[i] - 1): if start == i - 1: # run of one - result.append(str(scores[i-1])) + result.append(str(hwthreads[i-1])) else: # run of at least two - result.append(f'{scores[start]}-{scores[i-1]}') + result.append(f'{hwthreads[start]}-{hwthreads[i-1]}') start = i i += 1 return ','.join(result) @@ -477,7 +477,7 @@ def nodes(self) -> Iterable[str]: return self.cores.keys() def total_cores(self) -> int: - """Returns the total number of cores designated.""" + """Returns the total number of cores (not hwthreads) designated.""" return sum([len(cs) for cs in self.cores.values()]) def isdisjoint(self, other: 'Resources') -> bool: @@ -701,7 +701,8 @@ def _expand_resources( f' {req.threads_per_mpi_process} threads per process,' f' which is impossible with {num_cores} cores per' ' node.') - self._all_resources.cores[new_node] = set(range(num_cores)) + self._all_resources.cores[new_node] = { + frozenset([i]) for i in range(num_cores)} def _allocate_instance( self, instance: Reference, component: Component, diff --git a/libmuscle/python/libmuscle/planner/test/test_planner.py b/libmuscle/python/libmuscle/planner/test/test_planner.py index 95e8e7fb..25883aab 100644 --- a/libmuscle/python/libmuscle/planner/test/test_planner.py +++ b/libmuscle/python/libmuscle/planner/test/test_planner.py @@ -9,13 +9,15 @@ 
Component, Conduit, Configuration, Implementation, Model, MPICoresResReq, Ports, Reference, ResourceRequirements, ThreadedResReq) +from libmuscle.test.conftest import frozenset_of as s + @pytest.fixture def all_resources() -> Resources: return Resources({ - 'node001': {1, 2, 3, 4}, - 'node002': {1, 2, 3, 4}, - 'node003': {1, 2, 3, 4}}) + 'node001': {s(1), s(2), s(3), s(4)}, + 'node002': {s(1), s(2), s(3), s(4)}, + 'node003': {s(1), s(2), s(3), s(4)}}) @pytest.fixture @@ -96,42 +98,48 @@ def test_model_graph( def test_resources(all_resources: Resources) -> None: res1 = all_resources assert res1.cores == { - 'node001': {1, 2, 3, 4}, - 'node002': {1, 2, 3, 4}, - 'node003': {1, 2, 3, 4}} + 'node001': {s(1), s(2), s(3), s(4)}, + 'node002': {s(1), s(2), s(3), s(4)}, + 'node003': {s(1), s(2), s(3), s(4)}} assert set(res1.nodes()) == {'node001', 'node002', 'node003'} res2 = Resources({ - 'node004': {1, 2, 3, 4, 5, 6}, 'node005': {1, 2, 3, 4, 5, 6}}) + 'node004': {s(1), s(2), s(3), s(4), s(5), s(6)}, + 'node005': {s(1), s(2), s(3), s(4), s(5), s(6)}}) res1 += res2 assert res1.cores == { - 'node001': {1, 2, 3, 4}, 'node002': {1, 2, 3, 4}, - 'node003': {1, 2, 3, 4}, 'node004': {1, 2, 3, 4, 5, 6}, - 'node005': {1, 2, 3, 4, 5, 6}} - - res3 = Resources({'node003': {1, 2, 3, 4}, 'node005': {4, 5, 6}}) + 'node001': {s(1), s(2), s(3), s(4)}, + 'node002': {s(1), s(2), s(3), s(4)}, + 'node003': {s(1), s(2), s(3), s(4)}, + 'node004': {s(1), s(2), s(3), s(4), s(5), s(6)}, + 'node005': {s(1), s(2), s(3), s(4), s(5), s(6)}} + + res3 = Resources({ + 'node003': {s(1), s(2), s(3), s(4)}, 'node005': {s(4), s(5), s(6)}}) res1 -= res3 assert res1.cores == { - 'node001': {1, 2, 3, 4}, 'node002': {1, 2, 3, 4}, - 'node004': {1, 2, 3, 4, 5, 6}, 'node005': {1, 2, 3}} + 'node001': {s(1), s(2), s(3), s(4)}, + 'node002': {s(1), s(2), s(3), s(4)}, + 'node004': {s(1), s(2), s(3), s(4), s(5), s(6)}, + 'node005': {s(1), s(2), s(3)}} assert res1.nodes() == { 'node001', 'node002', 'node004', 'node005'} res4 = copy(res3) - res4.cores['node003'] = {8} + res4.cores['node003'] = {s(8)} - assert res3.cores['node003'] == {1, 2, 3, 4} - assert res4.cores['node003'] == {8} + assert res3.cores['node003'] == {s(1), s(2), s(3), s(4)} + assert res4.cores['node003'] == {s(8)} all_resources = Resources.union([res1, res2, res3, res4]) - assert all_resources.cores['node001'] == {1, 2, 3, 4} - assert all_resources.cores['node002'] == {1, 2, 3, 4} - assert all_resources.cores['node003'] == {1, 2, 3, 4, 8} - assert all_resources.cores['node004'] == {1, 2, 3, 4, 5, 6} - assert all_resources.cores['node005'] == {1, 2, 3, 4, 5, 6} + assert all_resources.cores['node001'] == {s(1), s(2), s(3), s(4)} + assert all_resources.cores['node002'] == {s(1), s(2), s(3), s(4)} + assert all_resources.cores['node003'] == {s(1), s(2), s(3), s(4), s(8)} + assert all_resources.cores['node004'] == {s(1), s(2), s(3), s(4), s(5), s(6)} + assert all_resources.cores['node005'] == {s(1), s(2), s(3), s(4), s(5), s(6)} def test_planner( @@ -139,9 +147,12 @@ def test_planner( planner = Planner(all_resources) allocations = planner.allocate_all(configuration) - assert allocations[Reference('init')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('macro')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('micro')].cores == {'node001': {1, 2, 3, 4}} + assert allocations[Reference('init')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} + assert allocations[Reference('macro')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} + assert 
allocations[Reference('micro')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} def test_planner_exclusive_macro( @@ -151,9 +162,12 @@ def test_planner_exclusive_macro( False) allocations = planner.allocate_all(configuration) - assert allocations[Reference('init')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('macro')].cores == {'node002': {1, 2, 3, 4}} - assert allocations[Reference('micro')].cores == {'node001': {1, 2, 3, 4}} + assert allocations[Reference('init')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} + assert allocations[Reference('macro')].cores == { + 'node002': {s(1), s(2), s(3), s(4)}} + assert allocations[Reference('micro')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} def test_planner_exclusive_predecessor( @@ -163,9 +177,12 @@ def test_planner_exclusive_predecessor( False) allocations = planner.allocate_all(configuration) - assert allocations[Reference('init')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('macro')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('micro')].cores == {'node001': {1, 2, 3, 4}} + assert allocations[Reference('init')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} + assert allocations[Reference('macro')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} + assert allocations[Reference('micro')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} def test_oversubscribe( @@ -177,33 +194,38 @@ def test_oversubscribe( planner = Planner(all_resources) allocations = planner.allocate_all(configuration) - assert allocations[Reference('init[0]')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('init[1]')].cores == {'node002': {1, 2, 3, 4}} - assert allocations[Reference('init[2]')].cores == {'node003': {1, 2, 3, 4}} - assert allocations[Reference('init[3]')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('init[4]')].cores == {'node002': {1, 2, 3, 4}} + assert allocations[Reference('init[0]')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} + assert allocations[Reference('init[1]')].cores == { + 'node002': {s(1), s(2), s(3), s(4)}} + assert allocations[Reference('init[2]')].cores == { + 'node003': {s(1), s(2), s(3), s(4)}} + assert allocations[Reference('init[3]')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} + assert allocations[Reference('init[4]')].cores == { + 'node002': {s(1), s(2), s(3), s(4)}} assert allocations[Reference('macro[0]')].cores == { - 'node001': {1, 2, 3, 4}} + 'node001': {s(1), s(2), s(3), s(4)}} assert allocations[Reference('macro[1]')].cores == { - 'node002': {1, 2, 3, 4}} + 'node002': {s(1), s(2), s(3), s(4)}} assert allocations[Reference('macro[2]')].cores == { - 'node003': {1, 2, 3, 4}} + 'node003': {s(1), s(2), s(3), s(4)}} assert allocations[Reference('macro[3]')].cores == { - 'node001': {1, 2, 3, 4}} + 'node001': {s(1), s(2), s(3), s(4)}} assert allocations[Reference('macro[4]')].cores == { - 'node002': {1, 2, 3, 4}} + 'node002': {s(1), s(2), s(3), s(4)}} assert allocations[Reference('micro[0]')].cores == { - 'node001': {1, 2, 3, 4}} + 'node001': {s(1), s(2), s(3), s(4)}} assert allocations[Reference('micro[1]')].cores == { - 'node002': {1, 2, 3, 4}} + 'node002': {s(1), s(2), s(3), s(4)}} assert allocations[Reference('micro[2]')].cores == { - 'node003': {1, 2, 3, 4}} + 'node003': {s(1), s(2), s(3), s(4)}} assert allocations[Reference('micro[3]')].cores == { - 'node001': {1, 2, 3, 4}} + 'node001': {s(1), s(2), s(3), s(4)}} assert allocations[Reference('micro[4]')].cores == { - 'node002': {1, 2, 3, 4}} + 
'node002': {s(1), s(2), s(3), s(4)}} def test_oversubscribe_single_instance_threaded() -> None: @@ -213,12 +235,13 @@ def test_oversubscribe_single_instance_threaded() -> None: Reference('x'): ThreadedResReq(Reference('x'), 24)} config = Configuration(model, None, impl, reqs) - res = Resources({'node001': {1, 2, 3, 4}}) + res = Resources({'node001': {s(1), s(2), s(3), s(4)}}) planner = Planner(res) allocations = planner.allocate_all(config) - assert allocations[Reference('x')].cores == {'node001': {1, 2, 3, 4}} + assert allocations[Reference('x')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} def test_oversubscribe_single_instance_mpi() -> None: @@ -228,12 +251,13 @@ def test_oversubscribe_single_instance_mpi() -> None: Reference('x'): MPICoresResReq(Reference('x'), 24)} config = Configuration(model, None, impl, reqs) - res = Resources({'node001': {1, 2, 3, 4}}) + res = Resources({'node001': {s(1), s(2), s(3), s(4)}}) planner = Planner(res) allocations = planner.allocate_all(config) - assert allocations[Reference('x')].cores == {'node001': {1, 2, 3, 4}} + assert allocations[Reference('x')].cores == { + 'node001': {s(1), s(2), s(3), s(4)}} def test_virtual_allocation() -> None: @@ -243,7 +267,7 @@ def test_virtual_allocation() -> None: Reference('x'): MPICoresResReq(Reference('x'), 13)} config = Configuration(model, None, impl, reqs) - res = Resources({'node000001': {1, 2, 3, 4}}) + res = Resources({'node000001': {s(1), s(2), s(3), s(4)}}) planner = Planner(res) allocations = planner.allocate_all(config, virtual=True) @@ -260,7 +284,7 @@ def test_impossible_virtual_allocation() -> None: Reference('x'): ThreadedResReq(Reference('x'), 13)} config = Configuration(model, None, impl, reqs) - res = Resources({'node000001': {1, 2, 3, 4}}) + res = Resources({'node000001': {s(1), s(2), s(3), s(4)}}) planner = Planner(res) with pytest.raises(InsufficientResourcesAvailable): diff --git a/libmuscle/python/libmuscle/test/conftest.py b/libmuscle/python/libmuscle/test/conftest.py index 2802061d..3215517f 100644 --- a/libmuscle/python/libmuscle/test/conftest.py +++ b/libmuscle/python/libmuscle/test/conftest.py @@ -99,3 +99,12 @@ def port_exists(name): port_manager.list_ports.return_value = declared_ports port_manager.port_exists = port_exists return port_manager + + +def frozenset_of(*args): + """Create a frozenset containing the arguments. + + This is a helper to shorten notation used in some of the planning and + launching-related tests. 
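+
+    For example, frozenset_of(1) returns frozenset({1}), which the planner tests use
+    to represent a core holding only hwthread 1; frozenset_of(2, 3) would stand for
+    a core with two hwthreads.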
+ """ + return frozenset(args) diff --git a/muscle3/muscle3.py b/muscle3/muscle3.py index 04a8c3a8..c4f39af1 100644 --- a/muscle3/muscle3.py +++ b/muscle3/muscle3.py @@ -138,7 +138,9 @@ def resources( click.echo(_RESOURCES_INCOMPLETE_MODEL, err=True) sys.exit(1) - resources = Resources({'node000001': set(range(cores_per_node))}) + resources = Resources({ + 'node000001': {frozenset([r]) for r in range(cores_per_node)}}) + planner = Planner(resources) try: allocations = planner.allocate_all(config, True) From 68ea7da41b84ce5b6aa8c0a061e7577d3bbad4b5 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 17 Oct 2024 10:07:13 +0200 Subject: [PATCH 16/49] Add agents for the native instantiator --- .../native_instantiator/agent/__init__.py | 0 .../native_instantiator/agent/__main__.py | 164 ++++++++++++++ .../native_instantiator/agent_manager.py | 205 ++++++++++++++++++ .../native_instantiator/global_resources.py | 72 ++++++ .../native_instantiator/iagent_manager.py | 29 +++ .../native_instantiator/resource_detector.py | 45 ---- .../libmuscle/native_instantiator/slurm.py | 11 + 7 files changed, 481 insertions(+), 45 deletions(-) create mode 100644 libmuscle/python/libmuscle/native_instantiator/agent/__init__.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/agent/__main__.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/agent_manager.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/global_resources.py create mode 100644 libmuscle/python/libmuscle/native_instantiator/iagent_manager.py delete mode 100644 libmuscle/python/libmuscle/native_instantiator/resource_detector.py diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/__init__.py b/libmuscle/python/libmuscle/native_instantiator/agent/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py b/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py new file mode 100644 index 00000000..712da253 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py @@ -0,0 +1,164 @@ +from itertools import groupby +import logging +import os +import psutil +from socket import gethostname +import sys +from time import sleep +from typing import Any, Dict, Set + +from libmuscle.native_instantiator.process_manager import ProcessManager +from libmuscle.native_instantiator.agent.map_client import MAPClient +from libmuscle.native_instantiator.agent.agent_commands import ( + CancelAllCommand, ShutdownCommand, StartCommand) + + +_logger = logging.getLogger(__name__) + + +class Agent: + """Runs on a compute node and starts processes there.""" + def __init__(self, node_id: str, server_location: str) -> None: + """Create an Agent. 
+ + Args: + node_id: Id (hostname) of this node + server_location: MAP server of the manager to connect to + """ + _logger.info(f'Agent at {node_id} starting') + + self._process_manager = ProcessManager() + + self._node_id = node_id + + _logger.info(f'Connecting to manager at {server_location}') + self._server = MAPClient(self._node_id, server_location) + _logger.info('Connected to manager') + + def run(self) -> None: + """Execute commands and monitor processes.""" + _logger.info('Reporting resources') + self._server.report_resources(self._inspect_resources()) + + shutting_down = False + while not shutting_down: + command = self._server.get_command() + if isinstance(command, StartCommand): + _logger.info(f'Starting process {command.name}') + _logger.debug(f'Args: {command.args}') + _logger.debug(f'Env: {command.env}') + + self._process_manager.start( + command.name, command.work_dir, command.args, command.env, + command.stdout, command.stderr) + elif isinstance(command, CancelAllCommand): + _logger.info('Cancelling all instances') + self._process_manager.cancel_all() + + elif isinstance(command, ShutdownCommand): + # check that nothing is running + shutting_down = True + _logger.info('Agent shutting down') + + finished = self._process_manager.get_finished() + if finished: + for name, exit_code in finished: + _logger.info(f'Process {name} finished with exit code {exit_code}') + self._server.report_result(finished) + + sleep(0.1) + + def _inspect_resources(self) -> Dict[str, Any]: + """Inspect the node to find resources and report on them. + + The only resource type for now is 'cpu'. The returned dict will have that key + mapping to a list of sets of logical hwthread ids, with each set designating + a set of hwthreads that share a core. + + The terminology for identifying processors gets very convoluted, with Linux, + Slurm, OpenMPI and IntelMPI all using different terms, or sometimes the same + terms for different things. See the comment in native_instantiator.py for what + is what and how we use it. + + Returns: + A dict mapping resource types to resource descriptions. + """ + if hasattr(os, 'sched_getaffinity'): + hwthreads_by_core: Dict[int, Set[int]] = dict() + + # these are the logical hwthread ids that we can use + hwthread_ids = list(os.sched_getaffinity(0)) + + for i in hwthread_ids: + with open(f'/sys/devices/system/cpu/cpu{i}/topology/core_id', 'r') as f: + # this gets the logical core id for the hwthread + core_id = int(f.read()) + hwthreads_by_core.setdefault(core_id, set()).add(i) + + cpu_resources = sorted( + map(frozenset, hwthreads_by_core.values()), key=sorted) + + else: + # MacOS doesn't support thread affinity, but older Macs with Intel + # processors do have SMT. Getting the hwthread to core mapping is not so + # easy, and if we're running on macOS then we're not on a cluster and don't + # do binding anyway. So we're going to get the number of hwthreads and the + # number of cores here, and synthesise a mapping that may be wrong, but will + # at least represent the number of cores and threads per core correctly. + nhwthreads = psutil.cpu_count(logical=True) + ncores = psutil.cpu_count(logical=False) + + hwthreads_per_core = nhwthreads // ncores + + if ncores * hwthreads_per_core != nhwthreads: + # As far as I know, there are no Macs with heterogeneous SMT, like in + # the latest Intel CPUs. + _logger.warning( + 'Only some cores seem to have SMT, core ids are probably' + ' wrong. 
If this is a cluster then this will cause problems,' + ' please report an issue on GitHub and report the machine and' + ' what kind of OS and hardware it has. If we\'re running on a' + ' local machine, then this won\'t affect the run, but I\'d' + ' still appreciate an issue, because it is unexpected for sure.' + ) + + hwthread_ids = list(range(nhwthreads)) + cpu_resources = [ + frozenset(g) + for _, g in groupby( + hwthread_ids, lambda i: i // hwthreads_per_core)] + + _logger.info(f'Found CPU resources: {cpu_resources}') + return {'cpu': cpu_resources} + + +def configure_logging(node_id: str, log_level: int) -> None: + """Make us output logs to a custom log file.""" + fmt = '%(asctime)s %(levelname)s %(message)s' + formatter = logging.Formatter(fmt) + + handler = logging.FileHandler(f'muscle3_agent_{node_id}.log', mode='w') + handler.setFormatter(formatter) + + # Find and remove default handler to disable automatic console output + # Testing for 'stderr' in the stringified version is not nice, but + # seems reliable, and doesn't mess up pytest's caplog mechanism while + # it also doesn't introduce a runtime dependency on pytest. + logging.getLogger().handlers = [ + h for h in logging.getLogger().handlers + if 'stderr' not in str(h)] + + logging.getLogger().addHandler(handler) + + logging.getLogger().setLevel(log_level) + + +if __name__ == '__main__': + node_id = gethostname() + server_location = sys.argv[1] + log_level = int(sys.argv[2]) + + configure_logging(node_id, log_level) + + agent = Agent(node_id, server_location) + agent.run() diff --git a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py new file mode 100644 index 00000000..2e5aa361 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py @@ -0,0 +1,205 @@ +import logging +from pathlib import Path +from subprocess import Popen, TimeoutExpired +import sys +from threading import Lock +from time import sleep +from typing import Any, Dict, FrozenSet, List, Tuple + +from libmuscle.native_instantiator.agent.agent_commands import ( + CancelAllCommand, StartCommand, ShutdownCommand) +from libmuscle.native_instantiator.iagent_manager import IAgentManager +from libmuscle.native_instantiator.map_server import MAPServer +from libmuscle.native_instantiator.global_resources import global_resources + + +_logger = logging.getLogger(__name__) + + +class AgentManager(IAgentManager): + """Manage the node agents. + + Each node of our allocated resources gets an agent, which launches and monitors + processes or that node. This class launches those agents across the nodes, + and communicates with them. + + The AgentManager sits in between the NativeInstantiator and the MAPServer. It gets + called by NativeInstantiator with requests for resources and commands to start and + cancel processes on nodes, and it gets called by MAPServer with requests from the + agents. + """ + def __init__(self, agent_dir: Path) -> None: + """Create an AgentManager. + + Create the object, then launch the agents and wait for them to connect and send + information about the available resources. + + Args: + agent_dir: Directory in which agents can write log files. 
+ """ + self._nodes: List[str] = list() + self._resources: Dict[str, Dict[str, Any]] = dict() + self._resources_lock = Lock() # protects _nodes and _resources + + self._finished_processes: List[Tuple[str, int]] = list() + self._finished_processes_lock = Lock() + + self._server = MAPServer(self) + + _logger.info('Launching MUSCLE agents...') + self._agents_process = self._launch_agents( + agent_dir, self._server.get_location()) + + expected_nodes = global_resources.nodes + + resources_complete = False + while not resources_complete: + sleep(0.1) + with self._resources_lock: + resources_complete = len(self._nodes) == len(expected_nodes) + _logger.debug(f'{len(self._resources)} agents up') + + if self._agents_process.poll() is not None: + msg = ( + 'Agents unexpectedly stopped running. This is not supposed' + ' to happen. Please see the agent log for more information,' + ' and please file an issue on GitHub.') + _logger.error(msg) + raise RuntimeError(msg) + + _logger.info(f'All agents running on {self._nodes}') + + if sorted(expected_nodes) != sorted(self._nodes): + _logger.error( + 'Agent-reported node hostnames do not match what we got from the' + ' resource manager.') + _logger.error( + 'According to the resource manager, we have' + f' {sorted(expected_nodes)}') + _logger.error( + f'The agents are reporting {sorted(self._nodes)}') + + def get_resources(self) -> Dict[str, List[FrozenSet[int]]]: + """Return detected resources. + + This returns a list of tuples of logical hwthread ids for each core per node. + + Called by NativeInstantiator. + """ + # no need to lock, _resources is already in its final state + return {node_id: res['cpu'] for node_id, res in self._resources.items()} + + def start( + self, node_id: str, name: str, work_dir: Path, args: List[str], + env: Dict[str, str], stdout: Path, stderr: Path) -> None: + """Start a process on a node. + + The files that the output is directed to will be overwritten if they already + exist. + + Args: + node_id: Id of the node to run the process on + name: Name under which this process will be known + work_dir: Working directory in which to start + args: Executable and arguments to run + env: Environment variables to set + stdout: File to redirect stdout to + stderr: File to redirect stderr to + """ + command = StartCommand(name, work_dir, args, env, stdout, stderr) + self._server.deposit_command(node_id, command) + + def cancel_all(self) -> None: + """Cancel all processes. + + This tells the agents to stop all running processes they've started. + + Called by NativeInstantiator. + """ + for node_id in self._nodes: + self._server.deposit_command(node_id, CancelAllCommand()) + + def get_finished(self) -> List[Tuple[str, int]]: + """Returns names and exit codes of finished processes. + + This returns all processes that have finished running since the previous call; + each started process will be returned exactly once. The names are the ones + passed to start(). + + Called by NativeInstantiator. 
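+
+        For example, a return value of [('macro', 0), ('micro', 1)] would mean that
+        the instance started under the name 'macro' exited successfully, while
+        'micro' failed with exit code 1.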
+ """ + with self._finished_processes_lock: + next_batch = self._finished_processes + self._finished_processes = list() + + return next_batch + + def shutdown(self) -> None: + """Shut down the manager and its agents.""" + command = ShutdownCommand() + for node_id in self._nodes: + self._server.deposit_command(node_id, command) + + try: + self._agents_process.wait(60) + except TimeoutExpired: + _logger.warning( + 'Agents did not shut down within one minute, sending signal...') + self._agents_process.kill() + + try: + self._agents_process.wait(10) + except TimeoutExpired: + _logger.warning('Agents still not down, continuing shutdown anyway.') + + self._server.stop() + + def report_resources(self, node_id: str, resources: Dict[str, Any]) -> None: + """Report resources found on a node. + + Called by MAPServer from a server thread. + + Args: + node_id: Id of the node these resources are on + resources: Dict mapping resource type to resource ids + """ + with self._resources_lock: + self._nodes.append(node_id) + self._resources[node_id] = resources + + def report_result(self, names_exit_codes: List[Tuple[str, int]]) -> None: + """Report results of finished processes. + + Called by MAPServer from a server thread. + + Args: + names_exit_codes: A list of names and exit codes of finished processes. + """ + with self._finished_processes_lock: + self._finished_processes.extend(names_exit_codes) + + def _launch_agents(self, agent_dir: Path, server_location: str) -> Popen: + """Actually launch the agents. + + This runs a local process, either to start a single agent locally, or on a + cluster to start all of them in one go. + + Args: + agent_dir: Working directory for the agents + server_location: MAPServer network location string for the agents to + connect to + """ + python = sys.executable + if not python: + raise RuntimeError( + 'Could not launch agents because sys.executable is not set.') + + log_level = logging.getLogger('libmuscle').getEffectiveLevel() + + args = [ + sys.executable, '-m', 'libmuscle.native_instantiator.agent', + server_location, str(log_level)] + + args = global_resources.agent_launch_command(args) + + return Popen(args, cwd=agent_dir) diff --git a/libmuscle/python/libmuscle/native_instantiator/global_resources.py b/libmuscle/python/libmuscle/native_instantiator/global_resources.py new file mode 100644 index 00000000..08d294a3 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/global_resources.py @@ -0,0 +1,72 @@ +from enum import Enum +import logging +from socket import gethostname +from typing import List + +import psutil + +from libmuscle.native_instantiator import slurm + + +_logger = logging.getLogger(__name__) + + +class Scheduler(Enum): + NONE = 0 + SLURM = 1 + + +class GlobalResources: + """Detects available compute resources. + + This detects whether we're running locally or in a SLURM allocation, and returns + available resources on request. This class describes all the available resources, + not the ones local to a node. + + Attributes: + scheduler: The HPC scheduler we're running under, if any. + nodes: List of hostnames of available nodes to run on. + cores_per_node: Number of cores available on each node. List alongside nodes. + """ + def __init__(self) -> None: + """Create a GlobalResources. + + Detects available resources and initialises the object, which can then be + queried. 
+ """ + if slurm.in_slurm_allocation(): + _logger.info('Detected a SLURM allocation') + self.scheduler = Scheduler.SLURM + self.nodes = slurm.get_nodes() + self.cores_per_node = slurm.get_cores_per_node() + _logger.info( + f'We have {len(self.nodes)} nodes and a total of' + f' {sum(self.cores_per_node)} cores available') + else: + _logger.info('Running locally without a cluster scheduler') + self.scheduler = Scheduler.NONE + self.nodes = [gethostname()] + self.cores_per_node = [psutil.cpu_count(logical=False)] + _logger.info(f'We have {self.cores_per_node[0]} cores available') + + def on_cluster(self) -> bool: + """Return whether we're running on a cluster.""" + return self.scheduler != Scheduler.NONE + + def agent_launch_command(self, agent_cmd: List[str]) -> List[str]: + """Return a command for launching one agent on each node. + + Args: + agent_cmd: A command that will start the agent. + """ + if self.scheduler == Scheduler.SLURM: + return slurm.agent_launch_command(agent_cmd) + return agent_cmd + + +global_resources = GlobalResources() +"""Global resources object. + +This is a singleton, and that's fine because it's created once and then read-only. Also, +it's used in two places, and making two objects logs everything twice which is annoying. +""" diff --git a/libmuscle/python/libmuscle/native_instantiator/iagent_manager.py b/libmuscle/python/libmuscle/native_instantiator/iagent_manager.py new file mode 100644 index 00000000..93d063f8 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/iagent_manager.py @@ -0,0 +1,29 @@ +from typing import Any, Dict, List, Tuple + + +class IAgentManager: + """Interface for Agent Managers. + + Only implemented by AgentManager, and only exists to avoid a circular dependency + between AgentManager, MAPServer, and MAPRequestHandler. Ugh. + """ + def report_resources(self, node_id: str, resources: Dict[str, Any]) -> None: + """Report resources found on a node. + + Called by MAPServer from a server thread. + + Args: + node_id: Id of the node these resources are on + resources: Dict mapping resource type to resource ids + """ + raise NotImplementedError() + + def report_result(self, names_exit_codes: List[Tuple[str, int]]) -> None: + """Report results of finished processes. + + Called by MAPServer from a server thread. + + Args: + names_exit_codes: A list of names and exit codes of finished processes. + """ + raise NotImplementedError() diff --git a/libmuscle/python/libmuscle/native_instantiator/resource_detector.py b/libmuscle/python/libmuscle/native_instantiator/resource_detector.py deleted file mode 100644 index 8ff22db9..00000000 --- a/libmuscle/python/libmuscle/native_instantiator/resource_detector.py +++ /dev/null @@ -1,45 +0,0 @@ -from enum import Enum -import logging -from os import sched_getaffinity - -from libmuscle.native_instantiator import slurm - - -_logger = logging.getLogger(__name__) - - -class Scheduler(Enum): - NONE = 0 - SLURM = 1 - - -class ResourceDetector: - """Detects available compute resources. - - This detects whether we're running locally or in a SLURM allocation, and returns - available resources on request. - """ - def __init__(self) -> None: - """Create a ResourceDetector. - - Detects available resources and initialises the object, which can then be - queried. 
- """ - if slurm.in_slurm_allocation(): - _logger.info('Detected a SLURM allocation') - self.scheduler = Scheduler.SLURM - self.nodes = slurm.get_nodes() - self.cores_per_node = slurm.get_cores_per_node() - _logger.info( - f'We have {len(self.nodes)} nodes and a total of' - f' {sum(self.cores_per_node)} cores available') - else: - _logger.info('Running locally without a cluster scheduler') - self.scheduler = Scheduler.NONE - self.nodes = ['localhost'] - self.cores_per_node = [len(sched_getaffinity(0))] - _logger.info(f'We have {sum(self.cores_per_node)} cores available') - - def on_cluster(self) -> bool: - _logger.debug(f'On cluster: {self.scheduler}') - return self.scheduler != Scheduler.NONE diff --git a/libmuscle/python/libmuscle/native_instantiator/slurm.py b/libmuscle/python/libmuscle/native_instantiator/slurm.py index 59258cc9..d9685687 100644 --- a/libmuscle/python/libmuscle/native_instantiator/slurm.py +++ b/libmuscle/python/libmuscle/native_instantiator/slurm.py @@ -278,3 +278,14 @@ def get_cores_per_node() -> List[int]: ' SLURM_CPUS_ON_NODE is missing or neither SLURM_JOB_NUM_NODES nor' ' SLURM_NNODES is set. Please create an issue on GitHub with the output' ' of "sbatch --version" on this cluster.') + + +def agent_launch_command(agent_cmd: List[str]) -> List[str]: + """Return a command for launching one agent on each node. + + Args: + agent_cmd: A command that will start the agent. + """ + # TODO: On the latest Slurm, there's a special command for this that we should use + # if we have that. + return ['srun', '--ntasks-per-node', '1'] + agent_cmd From af4ab522c3fca0d3b484940e6000c562a42df06f Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 17 Oct 2024 10:07:44 +0200 Subject: [PATCH 17/49] Add affinity checks to test --- integration_test/cluster_test/component.cpp | 32 ++++++++++-- integration_test/cluster_test/component.py | 11 +++- integration_test/cluster_test/conftest.py | 34 ++++++++++++- integration_test/cluster_test/test_cluster.py | 51 ++++++++++++------- 4 files changed, 103 insertions(+), 25 deletions(-) diff --git a/integration_test/cluster_test/component.cpp b/integration_test/cluster_test/component.cpp index 42b0cb48..0cc9726f 100644 --- a/integration_test/cluster_test/component.cpp +++ b/integration_test/cluster_test/component.cpp @@ -2,6 +2,9 @@ #include #include +// This is a Linux-specific API, but this test always runs on Linux so that's okay. +#define _GNU_SOURCE +#include #include #include "mpi.h" @@ -17,19 +20,41 @@ using libmuscle::Message; using ymmsl::Operator; -/** A simple dummy component. */ -void component(int argc, char * argv[]) { - const int root_rank = 0; +/** Log where we are running so that the test can check for it. */ +void log_location() { int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); char nodeid[1024]; gethostname(nodeid, sizeof(nodeid)); + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + sched_getaffinity(0, sizeof(cpu_set_t), &cpu_set); + { ofstream outfile("out_" + to_string(rank) + ".txt"); outfile << nodeid << std::endl; + + bool first = true; + for (int i = 0; i < CPU_SETSIZE; ++i) { + if (CPU_ISSET(i, &cpu_set)) { + if (!first) + outfile << ","; + outfile << i; + first = false; + } + } + outfile << std::endl; } +} + + +/** A simple dummy component. 
*/ +void component(int argc, char * argv[]) { + const int root_rank = 0; + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); Instance instance(argc, argv, { {Operator::F_INIT, {"init_in"}}, @@ -66,6 +91,7 @@ void component(int argc, char * argv[]) { int main(int argc, char * argv[]) { MPI_Init(&argc, &argv); + log_location(); component(argc, argv); MPI_Finalize(); return EXIT_SUCCESS; diff --git a/integration_test/cluster_test/component.py b/integration_test/cluster_test/component.py index aa8dd260..a22c7d96 100644 --- a/integration_test/cluster_test/component.py +++ b/integration_test/cluster_test/component.py @@ -1,18 +1,23 @@ import logging +import os import socket from libmuscle import Instance, Message from ymmsl import Operator +def log_location() -> None: + """Log where we are running so that the test can check for it.""" + print(socket.gethostname()) + print(','.join(map(str, sorted(os.sched_getaffinity(0))))) + + def component() -> None: """A simple dummy component. This sends and receives on all operators, allowing different coupling patterns with a single program. """ - print(socket.gethostname()) - instance = Instance({ Operator.F_INIT: ['init_in'], Operator.O_I: ['inter_out'], @@ -39,4 +44,6 @@ def component() -> None: if __name__ == '__main__': logging.basicConfig() logging.getLogger().setLevel(logging.INFO) + + log_location() component() diff --git a/integration_test/cluster_test/conftest.py b/integration_test/cluster_test/conftest.py index a4f5cba4..ec066556 100644 --- a/integration_test/cluster_test/conftest.py +++ b/integration_test/cluster_test/conftest.py @@ -142,7 +142,6 @@ def setup_connection(fake_cluster_headnode): # Because it's been made inside of the container, it has a different owner # than what we're running with on the host, and the host user cannot remove # the files. - run_cmd(term, 60, f'rm -rf {REMOTE_SHARED}/*') @@ -198,3 +197,36 @@ def muscle3_native_openmpi(remote_source, setup_connection): f'PREFIX={prefix} make install"')) return prefix + + +@pytest.fixture(scope='session') +def hwthread_to_core(): + """Translates hwthreads to core ids. + + In our tests, we use sched_getaffinity to check which cores we're bound to. This + returns numbers identifying hwthreads, but our planner binds swthreads and processes + to entire cores. So we get a comma-separated list of hwthread ids and want to + compare that to a list of core ids. + + This reads /proc/cpuinfo to get the mapping between hwthreads and cores, and returns + a function that takes a comma-separated list of hwthread ids and returns a list of + corresponding core ids. 
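+
+    For example (illustrative values only), on a node where /proc/cpuinfo lists
+    processor 0 and processor 4 both with core id 0, the returned function maps
+    the string '0,4' to [0].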
+ """ + with open('/proc/cpuinfo', 'r') as f: + cpuinfo = f.readlines() + + def get_values(cpuinfo, field): + return [ + int(line.split(':')[1].strip()) + for line in cpuinfo if line.startswith(field)] + + hwthread_ids = get_values(cpuinfo, 'processor') + core_ids = get_values(cpuinfo, 'core id') + + table = dict(zip(hwthread_ids, core_ids)) + + def convert(aff_ids): + cores = {table[i] for i in map(int, aff_ids.split(','))} + return sorted(cores) + + return convert diff --git a/integration_test/cluster_test/test_cluster.py b/integration_test/cluster_test/test_cluster.py index d9b1d85f..b25ccb5a 100644 --- a/integration_test/cluster_test/test_cluster.py +++ b/integration_test/cluster_test/test_cluster.py @@ -113,7 +113,8 @@ def _get_outfile(remote_out_dir, testname, mode, instance, rank): @skip_unless_cluster @pytest.mark.parametrize('mode', ['local', 'slurm']) -def test_single(fake_cluster, remote_test_files, remote_out_dir, mode): +def test_single( + fake_cluster, remote_test_files, remote_out_dir, mode, hwthread_to_core): sched = _sched(fake_cluster, mode) job = _make_job('single', mode, remote_test_files, remote_out_dir) @@ -129,14 +130,17 @@ def test_single(fake_cluster, remote_test_files, remote_out_dir, mode): output = _get_stdout(remote_out_dir, 'single', mode, 'c1') if mode == 'local': - assert output == 'muscle3-headnode\n' + assert output.split('\n')[0] == 'muscle3-headnode' else: - assert output == 'muscle3-node-0\n' + node, hwthreads, _ = output.split('\n') + assert node == 'muscle3-node-0' + assert hwthread_to_core(hwthreads) == [0] @skip_unless_cluster @pytest.mark.parametrize('mode', ['local', 'slurm']) -def test_dispatch(fake_cluster, remote_test_files, remote_out_dir, mode): +def test_dispatch( + fake_cluster, remote_test_files, remote_out_dir, mode, hwthread_to_core): sched = _sched(fake_cluster, mode) job = _make_job('dispatch', mode, remote_test_files, remote_out_dir) @@ -152,17 +156,22 @@ def test_dispatch(fake_cluster, remote_test_files, remote_out_dir, mode): c1_out = _get_stdout(remote_out_dir, 'dispatch', mode, 'c1') c2_out = _get_stdout(remote_out_dir, 'dispatch', mode, 'c2') if mode == 'local': - assert c1_out == 'muscle3-headnode\n' - assert c2_out == 'muscle3-headnode\n' + assert c1_out.split('\n')[0] == 'muscle3-headnode' + assert c2_out.split('\n')[0] == 'muscle3-headnode' else: - assert c1_out == 'muscle3-node-1\n' - assert c2_out == 'muscle3-node-1\n' + node, hwthreads, _ = c1_out.split('\n') + assert node == 'muscle3-node-1' + assert hwthread_to_core(hwthreads) == [0] + + node, hwthreads, _ = c2_out.split('\n') + assert node == 'muscle3-node-1' + assert hwthread_to_core(hwthreads) == [0] @skip_unless_cluster -@pytest.mark.parametrize('mode', ['local']) -# SLURM mode is not implemented yet -def test_multiple(fake_cluster, remote_test_files, remote_out_dir, mode): +@pytest.mark.parametrize('mode', ['local', 'slurm']) +def test_multiple( + fake_cluster, remote_test_files, remote_out_dir, mode, hwthread_to_core): sched = _sched(fake_cluster, mode) job = _make_job('multiple', mode, remote_test_files, remote_out_dir) @@ -175,17 +184,19 @@ def test_multiple(fake_cluster, remote_test_files, remote_out_dir, mode): assert sched.get_exit_code(job_id) == 0 for i in range(1, 7): + out = _get_stdout(remote_out_dir, 'multiple', mode, f'c{i}') if mode == 'local': - assert _get_stdout( - remote_out_dir, 'multiple', mode, f'c{i}') == 'muscle3-headnode\n' + assert out.split('\n')[0] == 'muscle3-headnode' else: - out = _get_stdout(remote_out_dir, 'multiple', mode, f'c{i}') - 
assert out == f'muscle3-node-{(i - 1) // 2}\n' + node, hwthreads, _ = out.split('\n') + assert node == f'muscle3-node-{(i - 1) // 2}' + assert hwthread_to_core(hwthreads) == [(i - 1) % 2] @skip_unless_cluster @pytest.mark.parametrize('mode', ['local', 'slurm']) -def test_double_mpi(fake_cluster, remote_test_files, remote_out_dir, mode): +def test_double_mpi( + fake_cluster, remote_test_files, remote_out_dir, mode, hwthread_to_core): sched = _sched(fake_cluster, mode) job = _make_job('double_mpi', mode, remote_test_files, remote_out_dir) @@ -199,8 +210,10 @@ def test_double_mpi(fake_cluster, remote_test_files, remote_out_dir, mode): for i in range(1, 3): for rank in range(2): - output = _get_outfile(remote_out_dir, 'double_mpi', mode, f'c{i}', rank) + out = _get_outfile(remote_out_dir, 'double_mpi', mode, f'c{i}', rank) if mode == 'local': - assert output == 'muscle3-headnode\n' + assert out.split('\n')[0] == 'muscle3-headnode' else: - assert output == f'muscle3-node-{i + 2}\n' + node, hwthreads, _ = out.split('\n') + assert node == f'muscle3-node-{i + 2}' + assert hwthread_to_core(hwthreads) == [rank] From 33b65fff97ba92dd741fbecda78dd7ee558a31d4 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 17 Oct 2024 10:19:47 +0200 Subject: [PATCH 18/49] Improve handling of crashing agents --- .../python/libmuscle/manager/instance_manager.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py index bc6e8edd..d29356b0 100644 --- a/libmuscle/python/libmuscle/manager/instance_manager.py +++ b/libmuscle/python/libmuscle/manager/instance_manager.py @@ -94,7 +94,17 @@ def __init__( self._log_handler.start() self._allocations: Optional[Dict[Reference, Resources]] = None - self._planner = Planner(self._resources_in.get()) + + resources = self._resources_in.get() + _logger.debug(f'Got resources {resources}') + if isinstance(resources, CrashedResult): + msg = ( + 'Instantiator crashed. 
This should not happen, please file a bug' + ' report.') + _logger.error(msg) + raise RuntimeError(msg) + + self._planner = Planner(resources) self._num_running = 0 def set_manager_location(self, location: str) -> None: From 7dc20b5b319e174bd9aa275ea5711b3cb611f35a Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 18 Oct 2024 11:38:41 +0200 Subject: [PATCH 19/49] Comment out temporarily unused import to make CI pass --- libmuscle/python/libmuscle/manager/instance_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py index d29356b0..31fc5edb 100644 --- a/libmuscle/python/libmuscle/manager/instance_manager.py +++ b/libmuscle/python/libmuscle/manager/instance_manager.py @@ -13,7 +13,7 @@ CancelAllRequest, CrashedResult, InstantiatorRequest, InstantiationRequest, Process, ProcessStatus, ShutdownRequest) from libmuscle.manager.logger import last_lines -from libmuscle.manager.qcgpj_instantiator import QCGPJInstantiator +# from libmuscle.manager.qcgpj_instantiator import QCGPJInstantiator from libmuscle.manager.run_dir import RunDir from libmuscle.native_instantiator.native_instantiator import NativeInstantiator from libmuscle.planner.planner import Planner, Resources From 3c04f03618110663f2c8047056ec4354c7bc76d4 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 22 Nov 2024 14:28:06 +0100 Subject: [PATCH 20/49] Switch to new Cerulean fake cluster Docker images --- integration_test/cluster_test/conftest.py | 26 ++- integration_test/cluster_test/test_cluster.py | 30 ++-- integration_test/fake_cluster/Dockerfile | 46 ++--- integration_test/fake_cluster/cgroup.conf | 6 + integration_test/fake_cluster/slurm.conf | 163 ------------------ .../fake_cluster/start-services.sh | 70 -------- .../native_instantiator/global_resources.py | 2 +- .../libmuscle/native_instantiator/slurm.py | 6 +- 8 files changed, 58 insertions(+), 291 deletions(-) create mode 100644 integration_test/fake_cluster/cgroup.conf delete mode 100644 integration_test/fake_cluster/slurm.conf delete mode 100644 integration_test/fake_cluster/start-services.sh diff --git a/integration_test/cluster_test/conftest.py b/integration_test/cluster_test/conftest.py index ec066556..f6d6e6d4 100644 --- a/integration_test/cluster_test/conftest.py +++ b/integration_test/cluster_test/conftest.py @@ -14,6 +14,10 @@ REMOTE_SHARED = '/home/cerulean/shared' +# Shut down the containers after running the tests. Set to False to debug. 
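+# When this is False, the headnode and node containers and the docker network are
+# left running after the test session, so the headnode can be reached over ssh as
+# user cerulean on the forwarded port to inspect logs and Slurm state. Clean up
+# manually afterwards with 'docker rm -f' and 'docker network rm'.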
+CLEAN_UP_CONTAINERS = True + + skip_unless_cluster = pytest.mark.skipif( 'MUSCLE_TEST_CLUSTER' not in os.environ, reason='Cluster tests were not explicitly enabled') @@ -75,10 +79,10 @@ def shared_dir(): @pytest.fixture(scope='session') def cleanup_docker(local_term): for i in range(5): - node_name = f'muscle3-node-{i}' + node_name = f'node-{i}' run_cmd(local_term, 60, f'docker rm -f {node_name}') - run_cmd(local_term, 60, 'docker rm -f muscle3-headnode') + run_cmd(local_term, 60, 'docker rm -f headnode') run_cmd(local_term, 60, 'docker network rm -f muscle3-net') @@ -87,7 +91,9 @@ def fake_cluster_network(local_term, cleanup_docker): name = 'muscle3-net' run_cmd(local_term, 60, f'docker network create {name}') yield name - run_cmd(local_term, 60, 'docker network rm -f muscle3-net') + + if CLEAN_UP_CONTAINERS: + run_cmd(local_term, 60, 'docker network rm -f muscle3-net') @pytest.fixture(scope='session') @@ -97,12 +103,13 @@ def fake_cluster_nodes( node_names = list() for i in range(5): - node_name = f'muscle3-node-{i}' + node_name = f'node-{i}' ssh_port = 10030 + i run_cmd(local_term, 60, ( f'docker run -d --name={node_name} --hostname={node_name}' f' --network={fake_cluster_network} -p {ssh_port}:22' + f' --cap-add=CAP_SYS_NICE' f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}' f' {fake_cluster_image}')) @@ -110,7 +117,8 @@ def fake_cluster_nodes( yield None - run_cmd(local_term, 60, f'docker rm -f {" ".join(node_names)}') + if CLEAN_UP_CONTAINERS: + run_cmd(local_term, 60, f'docker rm -f {" ".join(node_names)}') @pytest.fixture(scope='session') @@ -119,7 +127,7 @@ def fake_cluster_headnode( shared_dir): run_cmd(local_term, 60, ( - 'docker run -d --name=muscle3-headnode --hostname=muscle3-headnode' + 'docker run -d --name=headnode --hostname=headnode' f' --network={fake_cluster_network} -p 10022:22' f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}' f' {fake_cluster_image}')) @@ -127,7 +135,8 @@ def fake_cluster_headnode( ssh_term('Virtual cluster container start timed out') yield None - run_cmd(local_term, 60, 'docker rm -f muscle3-headnode') + if CLEAN_UP_CONTAINERS: + run_cmd(local_term, 60, 'docker rm -f headnode') @pytest.fixture(scope='session') @@ -142,7 +151,8 @@ def setup_connection(fake_cluster_headnode): # Because it's been made inside of the container, it has a different owner # than what we're running with on the host, and the host user cannot remove # the files. 
- run_cmd(term, 60, f'rm -rf {REMOTE_SHARED}/*') + if CLEAN_UP_CONTAINERS: + run_cmd(term, 60, f'rm -rf {REMOTE_SHARED}/*') @pytest.fixture(scope='session') diff --git a/integration_test/cluster_test/test_cluster.py b/integration_test/cluster_test/test_cluster.py index b25ccb5a..c65717dc 100644 --- a/integration_test/cluster_test/test_cluster.py +++ b/integration_test/cluster_test/test_cluster.py @@ -74,6 +74,7 @@ def _make_job(name, mode, remote_test_files, remote_out_dir): job.time_reserved = 60 job.system_out_file = job_dir / 'sysout.txt' job.system_err_file = job_dir / 'syserr.txt' + job.extra_scheduler_options = '--ntasks-per-node=4' return job @@ -121,7 +122,7 @@ def test_single( if mode == 'slurm': job.num_nodes = 1 job.mpi_processes_per_node = 1 - job.extra_scheduler_options = '--ntasks-per-core=1 --nodelist=muscle3-node-0' + job.extra_scheduler_options += ' --nodelist=node-0' job_id = sched.submit(job) assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None @@ -130,10 +131,10 @@ def test_single( output = _get_stdout(remote_out_dir, 'single', mode, 'c1') if mode == 'local': - assert output.split('\n')[0] == 'muscle3-headnode' + assert output.split('\n')[0] == 'headnode' else: node, hwthreads, _ = output.split('\n') - assert node == 'muscle3-node-0' + assert node == 'node-0' assert hwthread_to_core(hwthreads) == [0] @@ -147,7 +148,7 @@ def test_dispatch( if mode == 'slurm': job.num_nodes = 1 job.mpi_processes_per_node = 1 - job.extra_scheduler_options = '--ntasks-per-core=1 --nodelist=muscle3-node-1' + job.extra_scheduler_options += ' --nodelist=node-1' job_id = sched.submit(job) assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None @@ -156,15 +157,15 @@ def test_dispatch( c1_out = _get_stdout(remote_out_dir, 'dispatch', mode, 'c1') c2_out = _get_stdout(remote_out_dir, 'dispatch', mode, 'c2') if mode == 'local': - assert c1_out.split('\n')[0] == 'muscle3-headnode' - assert c2_out.split('\n')[0] == 'muscle3-headnode' + assert c1_out.split('\n')[0] == 'headnode' + assert c2_out.split('\n')[0] == 'headnode' else: node, hwthreads, _ = c1_out.split('\n') - assert node == 'muscle3-node-1' + assert node == 'node-1' assert hwthread_to_core(hwthreads) == [0] node, hwthreads, _ = c2_out.split('\n') - assert node == 'muscle3-node-1' + assert node == 'node-1' assert hwthread_to_core(hwthreads) == [0] @@ -177,7 +178,7 @@ def test_multiple( job = _make_job('multiple', mode, remote_test_files, remote_out_dir) if mode == 'slurm': job.num_nodes = 3 - job.extra_scheduler_options = '--nodelist=muscle3-node-[0-2]' + job.extra_scheduler_options += ' --nodelist=node-[0-2]' job_id = sched.submit(job) assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None @@ -186,10 +187,10 @@ def test_multiple( for i in range(1, 7): out = _get_stdout(remote_out_dir, 'multiple', mode, f'c{i}') if mode == 'local': - assert out.split('\n')[0] == 'muscle3-headnode' + assert out.split('\n')[0] == 'headnode' else: node, hwthreads, _ = out.split('\n') - assert node == f'muscle3-node-{(i - 1) // 2}' + assert node == f'node-{(i - 1) // 2}' assert hwthread_to_core(hwthreads) == [(i - 1) % 2] @@ -197,12 +198,13 @@ def test_multiple( @pytest.mark.parametrize('mode', ['local', 'slurm']) def test_double_mpi( fake_cluster, remote_test_files, remote_out_dir, mode, hwthread_to_core): + sched = _sched(fake_cluster, mode) job = _make_job('double_mpi', mode, remote_test_files, remote_out_dir) if mode == 'slurm': job.num_nodes = 2 - job.extra_scheduler_options = 
'--nodelist=muscle3-node-[3-4]' + job.extra_scheduler_options += ' --nodelist=node-[3-4]' job_id = sched.submit(job) assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None @@ -212,8 +214,8 @@ def test_double_mpi( for rank in range(2): out = _get_outfile(remote_out_dir, 'double_mpi', mode, f'c{i}', rank) if mode == 'local': - assert out.split('\n')[0] == 'muscle3-headnode' + assert out.split('\n')[0] == 'headnode' else: node, hwthreads, _ = out.split('\n') - assert node == f'muscle3-node-{i + 2}' + assert node == f'node-{i + 2}' assert hwthread_to_core(hwthreads) == [rank] diff --git a/integration_test/fake_cluster/Dockerfile b/integration_test/fake_cluster/Dockerfile index 523b137e..bc1db68d 100644 --- a/integration_test/fake_cluster/Dockerfile +++ b/integration_test/fake_cluster/Dockerfile @@ -1,39 +1,19 @@ -FROM ghcr.io/naturalhpc/cerulean-test-docker-images/cerulean-fake-slurm-23-11:latest +FROM ghcr.io/naturalhpc/cerulean-fake-slurm-23-11:latest +# FROM naturalhpc/cerulean-fake-slurm-23-11:latest -RUN apt-get update && \ - apt-get remove -y openmpi-bin && \ - apt-get install -y python3-venv gcc g++ gfortran git build-essential xz-utils \ - bzip2 cmake +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi+legacylaunchers+pmi schedulers=slurm ^pmix@3.2.3 ^slurm/dckfty -RUN cd /opt && \ - git clone --depth=100 --branch=releases/v0.22 https://github.com/spack/spack.git +# RUN . /opt/spack/share/spack/setup-env.sh && \ +# . $(spack location -i lmod)/lmod/lmod/init/bash && \ +# spack install mpich+slurm pmi=pmix ^pmix@3.2.3 -RUN . /opt/spack/share/spack/setup-env.sh && \ - spack config add "modules:default:enable:[tcl]" && \ - spack install lmod && \ - echo >>/etc/profile && \ - echo ". $(spack location -i lmod)/lmod/lmod/init/bash" >>/etc/profile && \ - echo ". /opt/spack/share/spack/setup-env.sh" >>/etc/profile - -# OpenMPI uses libmunge from munge, which needs to look for the munge unix socket -# in /run because that's where the apt-get installed munge we're actually running -# puts it. Munge doesn't have a configuration file, but it does have a compiled-in -# constant that can be set when building. So that's what we do here. -RUN bash -l -c 'spack install munge localstatedir=/' -RUN bash -l -c 'spack install openmpi+legacylaunchers+pmi schedulers=slurm' -RUN bash -l -c 'spack install mpich+slurm' -RUN bash -l -c 'spack install intel-oneapi-mpi' - -# Enable Spack when running ssh -c -RUN echo >>/etc/ssh/sshd_config && \ - echo 'SetEnv BASH_ENV=/etc/profile' >>/etc/ssh/sshd_config - -# Point workers to muscle3-headnode -COPY integration_test/fake_cluster/slurm.conf /usr/local/etc/slurm/slurm.conf - -# Replace start-up scripts so we can run nodes separately -COPY integration_test/fake_cluster/start-services.sh /etc/start-services.sh -RUN chmod +x /etc/start-services.sh +# RUN . /opt/spack/share/spack/setup-env.sh && \ +# . 
$(spack location -i lmod)/lmod/lmod/init/bash && \ +# spack install intel-oneapi-mpi ^pmix@3.2.3 + +COPY integration_test/fake_cluster/cgroup.conf /etc/slurm/cgroup.conf # Disable ssh debug output RUN sed -i -e 's/^LogLevel DEBUG3$//' /etc/ssh/sshd_config diff --git a/integration_test/fake_cluster/cgroup.conf b/integration_test/fake_cluster/cgroup.conf new file mode 100644 index 00000000..4c11eb00 --- /dev/null +++ b/integration_test/fake_cluster/cgroup.conf @@ -0,0 +1,6 @@ +IgnoreSystemd=yes +CgroupPlugin=cgroup/v1 +ConstrainSwapSpace=no +ConstrainCores=yes +# ConstrainDevices=yes + diff --git a/integration_test/fake_cluster/slurm.conf b/integration_test/fake_cluster/slurm.conf deleted file mode 100644 index 647b5315..00000000 --- a/integration_test/fake_cluster/slurm.conf +++ /dev/null @@ -1,163 +0,0 @@ -# slurm.conf file generated by configurator.html. -# Put this file on all nodes of your cluster. -# See the slurm.conf man page for more information. -# -ControlMachine=muscle3-headnode -#ControlAddr= -#BackupController= -#BackupAddr= -# -AuthType=auth/munge -#CheckpointType=checkpoint/none -CredType=cred/none -CryptoType=crypto/openssl -JobCredentialPrivateKey=/usr/local/etc/slurm/slurm.key -JobCredentialPublicCertificate=/usr/local/etc/slurm/slurm.cert -#DisableRootJobs=NO -#EnforcePartLimits=NO -#Epilog= -#EpilogSlurmctld= -#FirstJobId=1 -#MaxJobId=999999 -#GresTypes= -#GroupUpdateForce=0 -GroupUpdateTime=2 -#JobCheckpointDir=/var/slurm/checkpoint -#JobCredentialPrivateKey= -#JobCredentialPublicCertificate= -#JobFileAppend=0 -#JobRequeue=1 -#JobSubmitPlugins=1 -#KillOnBadExit=0 -#Licenses=foo*4,bar -# don't send any emails: -MailProg=/bin/true -#MaxJobCount=5000 -#MaxStepCount=40000 -#MaxTasksPerNode=128 -MpiDefault=none -#MpiParams=ports=#-# -#PluginDir= -#PlugStackConfig= -#PrivateData=jobs -ProctrackType=proctrack/linuxproc -#Prolog= -#PrologSlurmctld= -#PropagatePrioProcess=0 -#PropagateResourceLimits= -#PropagateResourceLimitsExcept= -ReturnToService=1 -#SallocDefaultCommand= -#SlurmctldPidFile=/var/run/slurmctld.pid -SlurmctldPort=6817 -SlurmdPidFile=/var/run/slurmd.%n.pid -SlurmdPort=6818 -SlurmdSpoolDir=/var/spool/slurmd.%n -SlurmUser=root -SlurmdUser=root -#SrunEpilog= -#SrunProlog= -StateSaveLocation=/var/spool/slurmctld/state -SwitchType=switch/none -#TaskEpilog= -TaskPlugin=task/cgroup -#TaskPluginParam= -#TaskProlog= -#TopologyPlugin=topology/tree -#TmpFs=/tmp -#TrackWCKey=no -#TreeWidth= -#UnkillableStepProgram= -#UsePAM=0 -# -# -# TIMERS -BatchStartTimeout=2 -#CompleteWait=0 -EpilogMsgTime=1 -#GetEnvTimeout=2 -#HealthCheckInterval=0 -#HealthCheckProgram= -InactiveLimit=0 -KillWait=2 -MessageTimeout=2 -#ResvOverRun=0 -MinJobAge=2 -#OverTimeLimit=0 -SlurmctldTimeout=2 -SlurmdTimeout=2 -#UnkillableStepTimeout=60 -#VSizeFactor=0 -Waittime=0 -# -# -# SCHEDULING -#DefMemPerCPU=0 -#MaxMemPerCPU=0 -#SchedulerRootFilter=1 -SchedulerTimeSlice=5 -SchedulerType=sched/backfill -SchedulerParameters=bf_interval=1,bf_resolution=1,sched_interval=1 -SelectType=select/cons_tres -SelectTypeParameters=CR_Core -# -# -# JOB PRIORITY -#PriorityType=priority/basic -#PriorityDecayHalfLife= -#PriorityCalcPeriod= -#PriorityFavorSmall= -#PriorityMaxAge= -#PriorityUsageResetPeriod= -#PriorityWeightAge= -#PriorityWeightFairshare= -#PriorityWeightJobSize= -#PriorityWeightPartition= -#PriorityWeightQOS= -# -# -# LOGGING AND ACCOUNTING -#AccountingStorageEnforce=0 -AccountingStorageType=accounting_storage/slurmdbd -AccountingStoragePort=6819 -AccountingStorageUser=root -AccountingStoreFlags=job_comment 
-ClusterName=mycluster -#DebugFlags= -#JobCompHost=localhost -#JobCompLoc=slurm_acct_db -JobCompLoc=/var/log/slurm/job_completions -JobCompType=jobcomp/filetxt -#JobCompPass=xenon-slurm-pw -#JobCompPort= -#JobCompUser=root -JobAcctGatherFrequency=2 -JobAcctGatherType=jobacct_gather/linux -SlurmctldDebug=debug5 -#SlurmctldLogFile= -SlurmdDebug=debug3 -SlurmdLogFile=/var/log/slurm/slurmd.%n.log -#SlurmSchedLogFile= -#SlurmSchedLogLevel= -# -# -# POWER SAVE SUPPORT FOR IDLE NODES (optional) -#SuspendProgram= -#ResumeProgram= -#SuspendTimeout= -#ResumeTimeout= -#ResumeRate= -#SuspendExcNodes= -#SuspendExcParts= -#SuspendRate= -#SuspendTime= -# -# -# COMPUTE NODES -NodeName=muscle3-node-0 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 NodeAddr=muscle3-node-0 Port=17001 State=UNKNOWN -NodeName=muscle3-node-1 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 NodeAddr=muscle3-node-1 Port=17002 State=UNKNOWN -NodeName=muscle3-node-2 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 NodeAddr=muscle3-node-2 Port=17003 State=UNKNOWN -NodeName=muscle3-node-3 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 NodeAddr=muscle3-node-3 Port=17004 State=UNKNOWN -NodeName=muscle3-node-4 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 NodeAddr=muscle3-node-4 Port=17005 State=UNKNOWN -PartitionName=debug Nodes=muscle3-node-[0-4] Default=YES MaxTime=INFINITE State=UP -PartitionName=batch Nodes=muscle3-node-[0-2] Default=NO MaxTime=INFINITE State=UP diff --git a/integration_test/fake_cluster/start-services.sh b/integration_test/fake_cluster/start-services.sh deleted file mode 100644 index 4f131964..00000000 --- a/integration_test/fake_cluster/start-services.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -echo -e "\nstarting syslog-ng..." -syslog-ng - - -echo -e "\nstarting munged..." -setuser munge /usr/sbin/munged --foreground > /var/log/munged.out.log 2> /var/log/munged.err.log & - -echo -n -e "\nwaiting for munged to start..." -while [ ! -e /run/munge/munge.socket.2 ] ; do - sleep 1 - echo '.' -done -echo - - -NODENAME=$(hostname) - -if [ "a${NODENAME}" == "amuscle3-headnode" ] ; then - # Run as a headnode - echo -e "\nstarting mariadb..." - setuser mysql /usr/bin/mariadbd-safe >/var/log/mariadb.out.log 2>/var/log/mariadb.err.log & - - echo -n -e "\nwaiting for mariadb to start..." - while ! nc -z localhost 3306 ; do - sleep 1 - echo '.' - done - echo - - - echo -e "\nstarting slurmdbd..." - /usr/local/sbin/slurmdbd -D >/var/log/slurmdbd.out.log 2>/var/log/slurmdbd.err.log & - - echo -n -e "\nwaiting for slurmdbd to start..." - while ! nc -z localhost 6819 ; do - sleep 1 - echo '.' - done - echo - - - echo -e "\nstarting slurmctld..." - /usr/local/sbin/slurmctld -D -c -vvvv > /var/log/slurmctld.out.log 2> /var/log/slurmctld.err.log & - - echo -n -e "\nwaiting for slurmctld to start..." - while ! nc -z localhost 6817 ; do - sleep 1 - echo '.' - done - echo - - - echo -e "\nmaking accounting readable to users..." - /bin/chmod -R og+rX /var/log/slurm - -else - # Run as a compute node - - echo -e "\nstarting compute node..." - /usr/local/sbin/slurmd -D -N ${NODENAME} > /var/log/slurmd.out.log 2> /var/log/slurmd.err.log & -fi - -echo -e "\nstarting sshd..." 
-/usr/sbin/sshd -De > /var/log/sshd.out.log 2> /var/log/sshd.err.log & - -echo -e "\nStartup complete" - -sleep infinity - diff --git a/libmuscle/python/libmuscle/native_instantiator/global_resources.py b/libmuscle/python/libmuscle/native_instantiator/global_resources.py index 08d294a3..aea612e1 100644 --- a/libmuscle/python/libmuscle/native_instantiator/global_resources.py +++ b/libmuscle/python/libmuscle/native_instantiator/global_resources.py @@ -60,7 +60,7 @@ def agent_launch_command(self, agent_cmd: List[str]) -> List[str]: agent_cmd: A command that will start the agent. """ if self.scheduler == Scheduler.SLURM: - return slurm.agent_launch_command(agent_cmd) + return slurm.agent_launch_command(agent_cmd, len(self.nodes)) return agent_cmd diff --git a/libmuscle/python/libmuscle/native_instantiator/slurm.py b/libmuscle/python/libmuscle/native_instantiator/slurm.py index d9685687..f11a0cba 100644 --- a/libmuscle/python/libmuscle/native_instantiator/slurm.py +++ b/libmuscle/python/libmuscle/native_instantiator/slurm.py @@ -280,7 +280,7 @@ def get_cores_per_node() -> List[int]: ' of "sbatch --version" on this cluster.') -def agent_launch_command(agent_cmd: List[str]) -> List[str]: +def agent_launch_command(agent_cmd: List[str], nnodes: int) -> List[str]: """Return a command for launching one agent on each node. Args: @@ -288,4 +288,6 @@ def agent_launch_command(agent_cmd: List[str]) -> List[str]: """ # TODO: On the latest Slurm, there's a special command for this that we should use # if we have that. - return ['srun', '--ntasks-per-node', '1'] + agent_cmd + return [ + 'srun', f'--ntasks={nnodes}', '--ntasks-per-node=1', '--cpu-bind=none' + ] + agent_cmd From a482b3e646ce200604d3ca2a5a9edf538f3476c6 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 22 Nov 2024 15:57:22 +0100 Subject: [PATCH 21/49] Add srun test --- .../{double_mpi.ymmsl => double.ymmsl} | 4 +- integration_test/cluster_test/double_mpi.sh | 12 ----- .../cluster_test/double_openmpi.sh | 12 +++++ .../cluster_test/double_srunmpi.sh | 12 +++++ .../cluster_test/implementations.ymmsl | 7 --- .../implementations_openmpi.ymmsl | 9 ++++ .../implementations_srunmpi.ymmsl | 9 ++++ integration_test/cluster_test/test_cluster.py | 42 ++++++++++++----- libmuscle/python/libmuscle/errors.py | 2 + .../libmuscle/manager/instance_manager.py | 13 ++++-- .../python/libmuscle/manager/instantiator.py | 3 +- .../native_instantiator.py | 31 ++++++------- .../native_instantiator/run_script.py | 46 +++++++++++++++---- 13 files changed, 137 insertions(+), 65 deletions(-) rename integration_test/cluster_test/{double_mpi.ymmsl => double.ymmsl} (80%) delete mode 100755 integration_test/cluster_test/double_mpi.sh create mode 100755 integration_test/cluster_test/double_openmpi.sh create mode 100755 integration_test/cluster_test/double_srunmpi.sh create mode 100644 integration_test/cluster_test/implementations_openmpi.ymmsl create mode 100644 integration_test/cluster_test/implementations_srunmpi.ymmsl create mode 100644 libmuscle/python/libmuscle/errors.py diff --git a/integration_test/cluster_test/double_mpi.ymmsl b/integration_test/cluster_test/double.ymmsl similarity index 80% rename from integration_test/cluster_test/double_mpi.ymmsl rename to integration_test/cluster_test/double.ymmsl index 9d04b238..16f9094f 100644 --- a/integration_test/cluster_test/double_mpi.ymmsl +++ b/integration_test/cluster_test/double.ymmsl @@ -7,12 +7,12 @@ model: ports: o_i: inter_out s: inter_in - implementation: component_cpp_openmpi + implementation: component_cpp c2: 
ports: o_i: inter_out s: inter_in - implementation: component_cpp_openmpi + implementation: component_cpp conduits: c1.inter_out: c2.inter_in diff --git a/integration_test/cluster_test/double_mpi.sh b/integration_test/cluster_test/double_mpi.sh deleted file mode 100755 index 1357283b..00000000 --- a/integration_test/cluster_test/double_mpi.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -set -e - -env - -source /home/cerulean/shared/venv/bin/activate - -CT=/home/cerulean/shared/cluster_test - -muscle_manager --log-level=DEBUG --start-all $CT/double_mpi.ymmsl $CT/settings.ymmsl $CT/implementations.ymmsl - diff --git a/integration_test/cluster_test/double_openmpi.sh b/integration_test/cluster_test/double_openmpi.sh new file mode 100755 index 00000000..12e117b8 --- /dev/null +++ b/integration_test/cluster_test/double_openmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/double.ymmsl $CT/settings.ymmsl $CT/implementations_openmpi.ymmsl + diff --git a/integration_test/cluster_test/double_srunmpi.sh b/integration_test/cluster_test/double_srunmpi.sh new file mode 100755 index 00000000..2e7dbbf4 --- /dev/null +++ b/integration_test/cluster_test/double_srunmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/double.ymmsl $CT/settings.ymmsl $CT/implementations_srunmpi.ymmsl + diff --git a/integration_test/cluster_test/implementations.ymmsl b/integration_test/cluster_test/implementations.ymmsl index 6dab9d57..df88e24d 100644 --- a/integration_test/cluster_test/implementations.ymmsl +++ b/integration_test/cluster_test/implementations.ymmsl @@ -6,10 +6,3 @@ implementations: executable: python args: - /home/cerulean/shared/cluster_test/component.py - - component_cpp_openmpi: - modules: openmpi - env: - +LD_LIBRARY_PATH: /home/cerulean/shared/muscle3-openmpi/lib - execution_model: openmpi - executable: /home/cerulean/shared/cluster_test/component_openmpi diff --git a/integration_test/cluster_test/implementations_openmpi.ymmsl b/integration_test/cluster_test/implementations_openmpi.ymmsl new file mode 100644 index 00000000..4a0d1dab --- /dev/null +++ b/integration_test/cluster_test/implementations_openmpi.ymmsl @@ -0,0 +1,9 @@ +ymmsl_version: v0.1 + +implementations: + component_cpp: + modules: openmpi + env: + +LD_LIBRARY_PATH: /home/cerulean/shared/muscle3-openmpi/lib + execution_model: openmpi + executable: /home/cerulean/shared/cluster_test/component_openmpi diff --git a/integration_test/cluster_test/implementations_srunmpi.ymmsl b/integration_test/cluster_test/implementations_srunmpi.ymmsl new file mode 100644 index 00000000..0ccf1265 --- /dev/null +++ b/integration_test/cluster_test/implementations_srunmpi.ymmsl @@ -0,0 +1,9 @@ +ymmsl_version: v0.1 + +implementations: + component_cpp: + modules: openmpi + env: + +LD_LIBRARY_PATH: /home/cerulean/shared/muscle3-openmpi/lib + execution_model: srunmpi + executable: /home/cerulean/shared/cluster_test/component_openmpi diff --git a/integration_test/cluster_test/test_cluster.py b/integration_test/cluster_test/test_cluster.py index c65717dc..b350edbc 100644 --- a/integration_test/cluster_test/test_cluster.py +++ b/integration_test/cluster_test/test_cluster.py @@ -60,14 +60,13 @@ def remote_out_dir(remote_home): return remote_home / 'test_results' -def _make_job(name, 
mode, remote_test_files, remote_out_dir): - job_dir = remote_out_dir / f'test_{name}_{mode}' +def _make_base_job(name, remote_out_dir, dir_name): + job_dir = remote_out_dir / dir_name job_dir.mkdir(0o755, True, True) job = cerulean.JobDescription() job.name = name job.working_directory = job_dir - job.command = str(remote_test_files / f'{name}.sh') job.stdout_file = job_dir / 'stdout.txt' job.stderr_file = job_dir / 'stderr.txt' job.queue_name = 'debug' @@ -79,6 +78,18 @@ def _make_job(name, mode, remote_test_files, remote_out_dir): return job +def _make_job(name, mode, remote_test_files, remote_out_dir): + job = _make_base_job(name, remote_out_dir, f'test_{name}_{mode}') + job.command = str(remote_test_files / f'{name}.sh') + return job + + +def _make_mpi_job(name, mode, execution_model, remote_test_files, remote_out_dir): + job = _make_base_job(name, remote_out_dir, f'test_{name}_{mode}_{execution_model}') + job.command = str(remote_test_files / f'{name}_{execution_model}.sh') + return job + + def _sched(fake_cluster, mode): if mode == 'local': return fake_cluster[2] @@ -86,8 +97,10 @@ def _sched(fake_cluster, mode): return fake_cluster[3] -def run_cmd_dir(remote_out_dir, testname, mode): +def _run_cmd_dir(remote_out_dir, testname, mode, execution_model=None): results_name = f'test_{testname}_{mode}' + if execution_model is not None: + results_name += f'_{execution_model}' for p in (remote_out_dir / results_name).iterdir(): if p.name.startswith('run_'): @@ -95,14 +108,14 @@ def run_cmd_dir(remote_out_dir, testname, mode): def _get_stdout(remote_out_dir, testname, mode, instance): - run_dir = run_cmd_dir(remote_out_dir, testname, mode) + run_dir = _run_cmd_dir(remote_out_dir, testname, mode) stdout_file = run_dir / 'instances' / instance / 'stdout.txt' assert stdout_file.exists() # test output redirection return stdout_file.read_text() -def _get_outfile(remote_out_dir, testname, mode, instance, rank): - run_dir = run_cmd_dir(remote_out_dir, testname, mode) +def _get_outfile(remote_out_dir, testname, mode, execution_model, instance, rank): + run_dir = _run_cmd_dir(remote_out_dir, testname, mode, execution_model) work_dir = run_dir / 'instances' / instance / 'workdir' out_file = work_dir / f'out_{rank}.txt' assert out_file.exists() # test working directory @@ -196,12 +209,18 @@ def test_multiple( @skip_unless_cluster @pytest.mark.parametrize('mode', ['local', 'slurm']) -def test_double_mpi( - fake_cluster, remote_test_files, remote_out_dir, mode, hwthread_to_core): +@pytest.mark.parametrize('execution_model', ['openmpi', 'srunmpi']) +def test_double( + fake_cluster, remote_test_files, remote_out_dir, hwthread_to_core, + mode, execution_model): + + if mode == 'local' and execution_model == 'srunmpi': + pytest.skip('srun does not work without slurm') sched = _sched(fake_cluster, mode) - job = _make_job('double_mpi', mode, remote_test_files, remote_out_dir) + job = _make_mpi_job( + 'double', mode, execution_model, remote_test_files, remote_out_dir) if mode == 'slurm': job.num_nodes = 2 job.extra_scheduler_options += ' --nodelist=node-[3-4]' @@ -212,7 +231,8 @@ def test_double_mpi( for i in range(1, 3): for rank in range(2): - out = _get_outfile(remote_out_dir, 'double_mpi', mode, f'c{i}', rank) + out = _get_outfile( + remote_out_dir, 'double', mode, execution_model, f'c{i}', rank) if mode == 'local': assert out.split('\n')[0] == 'headnode' else: diff --git a/libmuscle/python/libmuscle/errors.py b/libmuscle/python/libmuscle/errors.py new file mode 100644 index 00000000..9e819602 --- /dev/null 
+++ b/libmuscle/python/libmuscle/errors.py @@ -0,0 +1,2 @@ +class ConfigurationError(Exception): + """Signals an issue with the user's configuration.""" diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py index 31fc5edb..9d7cf90d 100644 --- a/libmuscle/python/libmuscle/manager/instance_manager.py +++ b/libmuscle/python/libmuscle/manager/instance_manager.py @@ -8,6 +8,7 @@ from ymmsl import Configuration, Reference +from libmuscle.errors import ConfigurationError from libmuscle.manager.instance_registry import InstanceRegistry from libmuscle.manager.instantiator import ( CancelAllRequest, CrashedResult, InstantiatorRequest, @@ -160,8 +161,7 @@ def get_resources(self) -> Dict[Reference, Resources]: """ if self._allocations is None: raise RuntimeError( - 'Tried to get resources but we are running without' - ' --start-all') + 'Tried to get resources but we are running without --start-all') return self._allocations @@ -182,9 +182,12 @@ def cancel_all() -> None: result = self._results_in.get() if isinstance(result, CrashedResult): - _logger.error( - 'Instantiator crashed. This should not happen, please file' - ' a bug report.') + if isinstance(result.exception, ConfigurationError): + _logger.error(str(result.exception)) + else: + _logger.error( + 'Instantiator crashed. This should not happen, please file' + ' a bug report.') return False results.append(result) diff --git a/libmuscle/python/libmuscle/manager/instantiator.py b/libmuscle/python/libmuscle/manager/instantiator.py index db83a52d..798482e0 100644 --- a/libmuscle/python/libmuscle/manager/instantiator.py +++ b/libmuscle/python/libmuscle/manager/instantiator.py @@ -113,7 +113,8 @@ class CancelAllRequest(InstantiatorRequest): class CrashedResult: """Signals that the instantiator process crashed.""" - pass + def __init__(self, exception: Optional[Exception] = None) -> None: + self.exception = exception class QueueingLogHandler(logging.Handler): diff --git a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py index 391d89fe..0de23936 100644 --- a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py +++ b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py @@ -196,6 +196,7 @@ import traceback from typing import Dict, List, Optional +from libmuscle.errors import ConfigurationError from libmuscle.manager.instantiator import ( CancelAllRequest, CrashedResult, create_instance_env, InstantiationRequest, Process, ProcessStatus, reconfigure_logging, ShutdownRequest) @@ -245,11 +246,16 @@ def run(self) -> None: self._send_resources() self._main() + except ConfigurationError as e: + self._results_out.put(CrashedResult(e)) + except: # noqa for line in traceback.format_exception(*sys.exc_info()): _logger.error(line) - self._resources_out.put(CrashedResult()) - self._results_out.put(CrashedResult()) + + result = CrashResult(sys.exc_info()[1]) + self._resources_out.put(result) + self._results_out.put(result) def _main(self) -> None: """Main function for the background process. 
@@ -352,13 +358,16 @@ def _instantiate(self, request: InstantiationRequest) -> None: env = create_instance_env(request.instance, request.implementation.env) self._add_resources(env, request.res_req) - rankfile: Optional[Path] = None + rankfile = request.instance_dir / 'rankfile' + if global_resources.on_cluster(): rankfile_contents, resource_env = prep_resources( - request.implementation.execution_model, request.resources) + request.implementation.execution_model, request.resources, + rankfile) if rankfile_contents: - rankfile = self._write_rankfile(request, rankfile_contents) + with rankfile.open('w') as f: + f.write(rankfile_contents) env['MUSCLE_RANKFILE'] = str(rankfile) env.update(resource_env) @@ -381,18 +390,6 @@ def _instantiate(self, request: InstantiationRequest) -> None: self._processes[name].status = ProcessStatus.ERROR self._processes[name].error_msg = f'Instance failed to start: {e}' - def _write_rankfile(self, request: InstantiationRequest, rankfile: str) -> Path: - """Create and write out the rankfile and return its location. - - Also known as a machinefile or hostfile depending on the MPI implementation. - """ - rankfile_file = request.instance_dir / 'rankfile' - - with rankfile_file.open('w') as f: - f.write(rankfile) - - return rankfile_file - def _write_run_script( self, request: InstantiationRequest, rankfile: Optional[Path]) -> Path: """Create and write out the run script and return its location.""" diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py index 1c615823..cb8c002f 100644 --- a/libmuscle/python/libmuscle/native_instantiator/run_script.py +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -1,6 +1,7 @@ from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, FrozenSet, List, Optional, Tuple +from libmuscle.errors import ConfigurationError from libmuscle.planner.planner import Resources from ymmsl import ( ExecutionModel, Implementation, MPICoresResReq, MPINodesResReq, @@ -83,29 +84,45 @@ def mpich_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: raise NotImplementedError() -def srun_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: +def srun_prep_resources( + resources: Resources, rankfile_location: Path) -> Tuple[str, Dict[str, str]]: """Create resource description for srun Args: resources: The resources to describe + rankfile_location: Location where the rankfile will be written Return: The contents of the hostfile, and a set of environment variables """ - # SLURM_HOSTFILE to point to the rankfile - # CPU_BIND=verbose,mask_cpu=0x01,0x02,0x04,0x01 to specify cores 0,1,2,0 for ranks - # 0-3 - raise NotImplementedError() + hostfile = '\n'.join(( + node for node, cores in resources.cores.items() for _ in cores)) + + env = {'SLURM_HOSTFILE': str(rankfile_location)} + + bind_list = [ + core for _, cores in resources.cores.items() for core in cores] + + def core_mask(core: FrozenSet[int]) -> str: + mask = sum((1 << hwthread) for hwthread in core) + return format(mask, '#x') + + bind_str = ','.join(map(core_mask, bind_list)) + + env['SLURM_CPU_BIND'] = f'verbose,mask_cpu:{bind_str}' + + return hostfile, env def prep_resources( - model: ExecutionModel, resources: Resources + model: ExecutionModel, resources: Resources, rankfile_location: Path ) -> Tuple[str, Dict[str, str]]: """Create resource description for the given execution model. 
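+
+    For the SRUNMPI model, for example (illustrative values only), a component
+    whose two processes are pinned to cores 0 and 1 of node-1 gets a hostfile
+    listing node-1 twice and SLURM_CPU_BIND masks 0x1,0x2.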
Args: model: The execution model to generate a description for resources: The resources to describe + rankfile_location: Path to where the rankfile will be written Return: The contents of the rank/machine/hostfile, and a set of environment variables. @@ -117,7 +134,7 @@ def prep_resources( elif model == ExecutionModel.INTELMPI: return impi_prep_resources(resources) elif model == ExecutionModel.SRUNMPI: - return srun_prep_resources(resources) + return srun_prep_resources(resources, rankfile_location) # elif model == ExecutionModel.MPICH: # return mpich_prep_resources(resources) raise RuntimeError( @@ -163,7 +180,13 @@ def local_command(implementation: Implementation) -> str: elif implementation.execution_model == ExecutionModel.INTELMPI: fstr = 'mpirun -n $MUSCLE_MPI_PROCESSES {command} {args}' elif implementation.execution_model == ExecutionModel.SRUNMPI: - fstr = 'srun -n $MUSCLE_MPI_PROCESSES -m arbitrary {command} {args}' + raise ConfigurationError( + f'Could not start {implementation.name} because the SRUNMPI execution' + ' method only works in a SLURM allocation, and we are running locally.' + ' Please switch this implementation to a different execution method' + ' in the configuration file. You will probably want OPENMPI or' + ' INTELMPI depending on which MPI implementation this code was' + ' compiled with.') # elif implementation.execution_model == ExecutionModel.MPICH # fstr = 'mpiexec -n {{ntasks}} {command} {args}' @@ -213,7 +236,10 @@ def cluster_command(implementation: Implementation) -> str: 'mpirun -n $MUSCLE_MPI_PROCESSES -machinefile $MUSCLE_RANKFILE' ' {command} {args}') elif implementation.execution_model == ExecutionModel.SRUNMPI: - fstr = 'srun -n $MUSCLE_MPI_PROCESSES -m arbitrary {command} {args}' + # TODO: set SLURM_CPU_BIND_VERBOSE for verbose output + fstr = ( + 'srun -n $MUSCLE_MPI_PROCESSES -m arbitrary --overlap' + ' --cpu-bind=$SLURM_CPU_BIND {command} {args}') # elif implementation.execution_model == ExecutionModel.MPICH # fstr = 'mpiexec -n $MUSCLE_MPI_PROCESSES -f $MUSCLE_RANKFILE {command} {args}' From cbee20270410e50e313d5b8d91edabf0cd7e0afd Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 22 Nov 2024 15:57:47 +0100 Subject: [PATCH 22/49] Improve agent manager debug output --- .../python/libmuscle/native_instantiator/agent_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py index 2e5aa361..c3a29fcc 100644 --- a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py +++ b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py @@ -57,7 +57,7 @@ def __init__(self, agent_dir: Path) -> None: sleep(0.1) with self._resources_lock: resources_complete = len(self._nodes) == len(expected_nodes) - _logger.debug(f'{len(self._resources)} agents up') + _logger.debug(f'{len(self._nodes)} agents up of {len(expected_nodes)}') if self._agents_process.poll() is not None: msg = ( @@ -163,6 +163,7 @@ def report_resources(self, node_id: str, resources: Dict[str, Any]) -> None: node_id: Id of the node these resources are on resources: Dict mapping resource type to resource ids """ + _logger.debug(f'Agent on {node_id} reported {resources}') with self._resources_lock: self._nodes.append(node_id) self._resources[node_id] = resources @@ -202,4 +203,5 @@ def _launch_agents(self, agent_dir: Path, server_location: str) -> Popen: args = global_resources.agent_launch_command(args) + _logger.debug(f'Launching 
agents using {args}') return Popen(args, cwd=agent_dir) From edb63ac6ac4c8d52571d2cf49a1d01695f24cc87 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 29 Nov 2024 13:20:17 +0100 Subject: [PATCH 23/49] Rearrange tests to enable multiple SLURM and MPI versions --- integration_test/cluster_test/conftest.py | 217 +++++++++++------- integration_test/cluster_test/test_cluster.py | 31 +-- integration_test/fake_cluster/Dockerfile | 32 ++- 3 files changed, 171 insertions(+), 109 deletions(-) diff --git a/integration_test/cluster_test/conftest.py b/integration_test/cluster_test/conftest.py index f6d6e6d4..19eb7ebe 100644 --- a/integration_test/cluster_test/conftest.py +++ b/integration_test/cluster_test/conftest.py @@ -11,8 +11,11 @@ logger_ = logging.getLogger(__name__) +IMAGE_NAME = 'muscle3_test_cluster' + REMOTE_SHARED = '/home/cerulean/shared' +IDX_SLURM_VERSIONS = list(enumerate(['23-11'])) # Shut down the containers after running the tests. Set to False to debug. CLEAN_UP_CONTAINERS = True @@ -41,16 +44,20 @@ def local_fs(): return cerulean.LocalFileSystem() +@pytest.fixture(scope='session') +def repo_root(local_fs): + root_dir = Path(__file__).parents[2] + return local_fs / str(root_dir) + + @pytest.fixture(scope='session') def fake_cluster_image(local_term): - IMAGE_NAME = 'muscle3_test_cluster' run_cmd(local_term, 5400, ( f'docker buildx build -t {IMAGE_NAME}' ' -f integration_test/fake_cluster/Dockerfile .')) - return IMAGE_NAME -def ssh_term(timeout_msg): +def ssh_term(port, timeout_msg): cred = cerulean.PasswordCredential('cerulean', 'kingfisher') ready = False start = time.monotonic() @@ -59,7 +66,7 @@ def ssh_term(timeout_msg): raise Exception(timeout_msg) try: - term = cerulean.SshTerminal('localhost', 10022, cred) + term = cerulean.SshTerminal('localhost', port, cred) ready = True except Exception: time.sleep(3.0) @@ -78,93 +85,55 @@ def shared_dir(): @pytest.fixture(scope='session') def cleanup_docker(local_term): - for i in range(5): - node_name = f'node-{i}' - run_cmd(local_term, 60, f'docker rm -f {node_name}') + for _, slurm_version in IDX_SLURM_VERSIONS: + _clean_up_base_cluster(local_term, slurm_version) - run_cmd(local_term, 60, 'docker rm -f headnode') - run_cmd(local_term, 60, 'docker network rm -f muscle3-net') - -@pytest.fixture(scope='session') -def fake_cluster_network(local_term, cleanup_docker): - name = 'muscle3-net' +def _create_network(local_term, slurm_version): + name = f'muscle3-net-{slurm_version}' run_cmd(local_term, 60, f'docker network create {name}') - yield name - - if CLEAN_UP_CONTAINERS: - run_cmd(local_term, 60, 'docker network rm -f muscle3-net') + return name -@pytest.fixture(scope='session') -def fake_cluster_nodes( - local_term, fake_cluster_image, fake_cluster_network, shared_dir): - - node_names = list() - +def _start_nodes(local_term, slurm_version, net_name, shared_dir): for i in range(5): node_name = f'node-{i}' - ssh_port = 10030 + i run_cmd(local_term, 60, ( - f'docker run -d --name={node_name} --hostname={node_name}' - f' --network={fake_cluster_network} -p {ssh_port}:22' - f' --cap-add=CAP_SYS_NICE' + f'docker run -d --name={node_name}-{slurm_version} --hostname={node_name}' + f' --network={net_name} --cap-add=CAP_SYS_NICE' + f' --env SLURM_VERSION={slurm_version}' f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}' - f' {fake_cluster_image}')) - - node_names.append(node_name) + f' {IMAGE_NAME}')) - yield None - - if CLEAN_UP_CONTAINERS: - run_cmd(local_term, 60, f'docker rm -f {" ".join(node_names)}') - - 
-@pytest.fixture(scope='session') -def fake_cluster_headnode( - local_term, fake_cluster_image, fake_cluster_network, fake_cluster_nodes, - shared_dir): +def _start_headnode(local_term, slurm_version, net_name, shared_dir, headnode_port): run_cmd(local_term, 60, ( - 'docker run -d --name=headnode --hostname=headnode' - f' --network={fake_cluster_network} -p 10022:22' + f'docker run -d --name=headnode-{slurm_version} --hostname=headnode' + f' --network={net_name} -p {headnode_port}:22' + f' --env SLURM_VERSION={slurm_version}' f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}' - f' {fake_cluster_image}')) + f' {IMAGE_NAME}')) - ssh_term('Virtual cluster container start timed out') - yield None + ssh_term(headnode_port, 'Virtual cluster container start timed out') - if CLEAN_UP_CONTAINERS: - run_cmd(local_term, 60, 'docker rm -f headnode') +def _start_base_cluster(local_term, idx_slurm_version, shared_dir): + slurm_index, slurm_version = idx_slurm_version -@pytest.fixture(scope='session') -def setup_connection(fake_cluster_headnode): - # Session-wide connection used for container setup actions only - # Tests each have their own connection, see fake_cluster() below - term = ssh_term('Connection to virtual cluster container timed out') - with cerulean.SftpFileSystem(term, True) as fs: - yield term, fs - - # We abuse this to clean up the contents of the shared directory. - # Because it's been made inside of the container, it has a different owner - # than what we're running with on the host, and the host user cannot remove - # the files. - if CLEAN_UP_CONTAINERS: - run_cmd(term, 60, f'rm -rf {REMOTE_SHARED}/*') + headnode_port = 10022 + slurm_index + net_name = _create_network(local_term, slurm_version) + _start_nodes(local_term, slurm_version, net_name, shared_dir) + _start_headnode(local_term, slurm_version, net_name, shared_dir, headnode_port) -@pytest.fixture(scope='session') -def repo_root(local_fs): - root_dir = Path(__file__).parents[2] - return local_fs / str(root_dir) + term = ssh_term(headnode_port, 'Connection to virtual cluster container timed out') + fs = cerulean.SftpFileSystem(term, False) + return term, fs, headnode_port -@pytest.fixture(scope='session') -def remote_source(repo_root, setup_connection): - remote_term, remote_fs = setup_connection +def _install_remote_source(repo_root, remote_term, remote_fs): muscle3_tgt = remote_fs / 'home' / 'cerulean' / 'muscle3' muscle3_tgt.mkdir() (muscle3_tgt / 'libmuscle').mkdir() @@ -178,10 +147,7 @@ def remote_source(repo_root, setup_connection): return muscle3_tgt -@pytest.fixture(scope='session') -def muscle3_venv(repo_root, remote_source, setup_connection): - remote_term, remote_fs = setup_connection - +def _create_muscle3_venv(remote_term, remote_source): run_cmd(remote_term, 10, f'python3 -m venv {REMOTE_SHARED}/venv') in_venv = f'source {REMOTE_SHARED}/venv/bin/activate && ' @@ -192,21 +158,116 @@ def muscle3_venv(repo_root, remote_source, setup_connection): return in_venv -@pytest.fixture(scope='session') -def muscle3_native_openmpi(remote_source, setup_connection): - remote_term, remote_fs = setup_connection - +def _install_muscle3_native_openmpi( + remote_source, remote_term, remote_fs, slurm_version): prefix = remote_fs / REMOTE_SHARED / 'muscle3-openmpi' prefix.mkdir() + openmpi_hash = run_cmd(remote_term, 600, ( + '/bin/bash -c "' + 'for phash in $(/opt/spack/bin/spack find --format \\"{hash}\\" openmpi' + ' | tr \'\\n\' \' \') ; do' + ' if /opt/spack/bin/spack find --deps /\\${phash} |' + f' grep -q 
slurm@{slurm_version} ; then' + ' echo \\${phash} ;' + ' fi ;' + 'done' + '"')) + + openmpi_version = run_cmd(remote_term, 600, ( + '/bin/bash -c "' + f'/opt/spack/bin/spack find --format \\"{{version}}\\" /{openmpi_hash}' + '"')).strip() + + module_name = f'openmpi/{openmpi_version}-gcc-11.4.0-{openmpi_hash[:7]}' + + logger_.info(f'Slurm {slurm_version} and module {module_name}') + run_cmd(remote_term, 600, ( f'/bin/bash -l -c "' - f'module load openmpi && ' + f'module load {module_name} && ' f'cd {remote_source} && ' f'make distclean && ' f'PREFIX={prefix} make install"')) - return prefix + return prefix, module_name + + +def _install_muscle3(repo_root, remote_term, remote_fs, slurm_version): + remote_source = _install_remote_source(repo_root, remote_term, remote_fs) + in_venv = _create_muscle3_venv(remote_term, remote_source) + return _install_muscle3_native_openmpi( + remote_source, remote_term, remote_fs, slurm_version) + + +def _install_tests(repo_root, remote_term, remote_fs, remote_m3_openmpi): + remote_home = remote_fs / REMOTE_SHARED + remote_m3, openmpi_module = remote_m3_openmpi + + cerulean.copy( + repo_root / 'integration_test' / 'cluster_test', remote_home, + copy_permissions=True) + + remote_source = remote_home / 'cluster_test' + + run_cmd(remote_term, 30, ( + '/bin/bash -c "' + f'sed -i \\"s^modules: openmpi^modules: {openmpi_module}^\\"' + f' {remote_source}/implementations_openmpi.ymmsl' + '"')) + + run_cmd(remote_term, 30, ( + '/bin/bash -c "' + f'sed -i \\"s^modules: openmpi^modules: {openmpi_module}^\\"' + f' {remote_source}/implementations_srunmpi.ymmsl' + '"')) + + run_cmd(remote_term, 30, ( + f'/bin/bash -l -c "' + f'module load {openmpi_module} && ' + f'. {remote_m3}/bin/muscle3.env && ' + f'make -C {remote_source}"')) + + +def _clean_up_base_cluster(local_term, slurm_version): + node_names = [f'node-{i}-{slurm_version}' for i in range(5)] + run_cmd(local_term, 60, f'docker rm -f {" ".join(node_names)}') + + run_cmd(local_term, 60, f'docker rm -f headnode-{slurm_version}') + + net_name = f'muscle3-net-{slurm_version}' + run_cmd(local_term, 60, f'docker network rm -f {net_name}') + + +@pytest.fixture(scope='session', params=IDX_SLURM_VERSIONS) +def installed_cluster( + request, cleanup_docker, fake_cluster_image, shared_dir, + repo_root, local_term): + + slurm_version = request.param[1] + local_shared_dir = shared_dir / slurm_version + local_shared_dir.mkdir() + local_shared_dir.chmod(0o1777) + + remote_term, remote_fs, headnode_port = _start_base_cluster( + local_term, request.param, local_shared_dir) + remote_m3_openmpi = _install_muscle3( + repo_root, remote_term, remote_fs, slurm_version) + _install_tests(repo_root, remote_term, remote_fs, remote_m3_openmpi) + + yield headnode_port + + # Because it's been made inside of the container, the shared directory has a + # different owner than what we're running with on the host, and the host user cannot + # remove the files. 
So we do it here from inside the container + if CLEAN_UP_CONTAINERS: + run_cmd(remote_term, 60, f'rm -rf {REMOTE_SHARED}/*') + + remote_fs.close() + remote_term.close() + + if CLEAN_UP_CONTAINERS: + _clean_up_base_cluster(local_term, slurm_version) @pytest.fixture(scope='session') diff --git a/integration_test/cluster_test/test_cluster.py b/integration_test/cluster_test/test_cluster.py index b350edbc..b9b08b6e 100644 --- a/integration_test/cluster_test/test_cluster.py +++ b/integration_test/cluster_test/test_cluster.py @@ -9,35 +9,10 @@ logger_ = logging.getLogger(__name__) -@pytest.fixture(scope='session') -def copy_test_files(repo_root, setup_connection): - remote_term, remote_fs = setup_connection - remote_home = remote_fs / REMOTE_SHARED - - cerulean.copy( - repo_root / 'integration_test' / 'cluster_test', remote_home, - copy_permissions=True) - - return remote_home / 'cluster_test' - - -@pytest.fixture(scope='session') -def build_native_components( - muscle3_native_openmpi, setup_connection, copy_test_files): - remote_term, remote_fs = setup_connection - remote_source = copy_test_files - - run_cmd(remote_term, 30, ( - f"/bin/bash -l -c '" - f"module load openmpi && " - f". {muscle3_native_openmpi}/bin/muscle3.env && " - f"make -C {remote_source}'")) - - @pytest.fixture -def fake_cluster( - fake_cluster_headnode, muscle3_venv, build_native_components, copy_test_files): - term = ssh_term('Connection to virtual cluster container timed out') +def fake_cluster(installed_cluster): + headnode_port = installed_cluster + term = ssh_term(headnode_port, 'Connection to virtual cluster container timed out') with cerulean.SftpFileSystem(term, True) as fs: local_sched = cerulean.DirectGnuScheduler(term) slurm_sched = cerulean.SlurmScheduler(term) diff --git a/integration_test/fake_cluster/Dockerfile b/integration_test/fake_cluster/Dockerfile index bc1db68d..16561062 100644 --- a/integration_test/fake_cluster/Dockerfile +++ b/integration_test/fake_cluster/Dockerfile @@ -1,9 +1,35 @@ -FROM ghcr.io/naturalhpc/cerulean-fake-slurm-23-11:latest -# FROM naturalhpc/cerulean-fake-slurm-23-11:latest +FROM ghcr.io/naturalhpc/cerulean-fake-slurm-base:latest +# FROM naturalhpc/cerulean-fake-slurm-base:latest RUN . /opt/spack/share/spack/setup-env.sh && \ . $(spack location -i lmod)/lmod/lmod/init/bash && \ - spack install openmpi+legacylaunchers+pmi schedulers=slurm ^pmix@3.2.3 ^slurm/dckfty + spack install openmpi@3.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@20-11 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@20-11) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@3.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@21-08 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@21-08) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@3.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@22-05 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@22-05) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@3.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@23-02 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@23-02) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . 
$(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@4.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@23-11 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@23-11) # RUN . /opt/spack/share/spack/setup-env.sh && \ # . $(spack location -i lmod)/lmod/lmod/init/bash && \ From 75909083299f5f9ba15704ad7335a09c58430b5e Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sat, 30 Nov 2024 19:27:55 +0100 Subject: [PATCH 24/49] Fix mpirun/srun/agents resource collision on SLURM <= 21-08 --- .../python/libmuscle/native_instantiator/run_script.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py index cb8c002f..042f897d 100644 --- a/libmuscle/python/libmuscle/native_instantiator/run_script.py +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -229,6 +229,13 @@ def cluster_command(implementation: Implementation) -> str: ' --rankfile $MUSCLE_RANKFILE --use-hwthread-cpus --oversubscribe' # ' --map-by rankfile:file=$MUSCLE_RANKFILE:oversubscribe' # ' --display-map --display-allocation {command} {args}' + + # This adds the given option to the srun command used by mpirun to + # launch its daemons. mpirun specifies --exclusive, which on SLURM <= + # 21-08 causes SLURM to wait for our agents to quit, as it considers + # them to be occupying the cores, causing a deadlock. Fortunately, it + # seems that adding --overlap overrides the --exclusive and it works. + ' -mca plm_slurm_args "--overlap"' ' --bind-to core --display-map --display-allocation {command} {args}' ) elif implementation.execution_model == ExecutionModel.INTELMPI: From cda6c6501107ccb091dac59ee8923d9d2f4158a0 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sat, 30 Nov 2024 19:44:32 +0100 Subject: [PATCH 25/49] Fix number of agents launched on SLURM <= 23-02 --- .../python/libmuscle/native_instantiator/slurm.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/slurm.py b/libmuscle/python/libmuscle/native_instantiator/slurm.py index f11a0cba..0c726cd2 100644 --- a/libmuscle/python/libmuscle/native_instantiator/slurm.py +++ b/libmuscle/python/libmuscle/native_instantiator/slurm.py @@ -287,7 +287,12 @@ def agent_launch_command(agent_cmd: List[str], nnodes: int) -> List[str]: agent_cmd: A command that will start the agent. """ # TODO: On the latest Slurm, there's a special command for this that we should use - # if we have that. + # if we have that, --external-launcher. Poorly documented though, so will require + # some experimentation. + + # On SLURM <= 23-02, the number of tasks is inherited by srun from sbatch rather + # than calculated anew from --nodes and --ntasks-per-node, so we specify it + # explicitly to avoid getting an agent per logical cpu rather than per node. 
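    # Illustrative usage (not part of this patch), with placeholder values for the
    # agent command and its arguments, of the list assembled just below:
    #
    #     agent_cmd = ['python3', '-m', 'libmuscle.native_instantiator.agent',
    #                  '<server_location>', '<log_level>']
    #     ' '.join(agent_launch_command(agent_cmd, 4))
    #     # 'srun --nodes=4 --ntasks=4 --ntasks-per-node=1 --overlap python3 -m ...'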
return [ - 'srun', f'--ntasks={nnodes}', '--ntasks-per-node=1', '--cpu-bind=none' - ] + agent_cmd + 'srun', f'--nodes={nnodes}', f'--ntasks={nnodes}', '--ntasks-per-node=1', + '--overlap'] + agent_cmd From 12dedf60d7995122066de180bd13cab311e24812 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sat, 30 Nov 2024 20:02:27 +0100 Subject: [PATCH 26/49] Improve agent launch logging and error handling --- .../native_instantiator/agent_manager.py | 87 +++++++++++-------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py index c3a29fcc..c42b96c0 100644 --- a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py +++ b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py @@ -45,39 +45,7 @@ def __init__(self, agent_dir: Path) -> None: self._finished_processes_lock = Lock() self._server = MAPServer(self) - - _logger.info('Launching MUSCLE agents...') - self._agents_process = self._launch_agents( - agent_dir, self._server.get_location()) - - expected_nodes = global_resources.nodes - - resources_complete = False - while not resources_complete: - sleep(0.1) - with self._resources_lock: - resources_complete = len(self._nodes) == len(expected_nodes) - _logger.debug(f'{len(self._nodes)} agents up of {len(expected_nodes)}') - - if self._agents_process.poll() is not None: - msg = ( - 'Agents unexpectedly stopped running. This is not supposed' - ' to happen. Please see the agent log for more information,' - ' and please file an issue on GitHub.') - _logger.error(msg) - raise RuntimeError(msg) - - _logger.info(f'All agents running on {self._nodes}') - - if sorted(expected_nodes) != sorted(self._nodes): - _logger.error( - 'Agent-reported node hostnames do not match what we got from the' - ' resource manager.') - _logger.error( - 'According to the resource manager, we have' - f' {sorted(expected_nodes)}') - _logger.error( - f'The agents are reporting {sorted(self._nodes)}') + self._launch_agents(agent_dir, self._server.get_location()) def get_resources(self) -> Dict[str, List[FrozenSet[int]]]: """Return detected resources. @@ -149,6 +117,8 @@ def shutdown(self) -> None: try: self._agents_process.wait(10) + self._agents_stdout.close() + self._agents_stderr.close() except TimeoutExpired: _logger.warning('Agents still not down, continuing shutdown anyway.') @@ -179,7 +149,7 @@ def report_result(self, names_exit_codes: List[Tuple[str, int]]) -> None: with self._finished_processes_lock: self._finished_processes.extend(names_exit_codes) - def _launch_agents(self, agent_dir: Path, server_location: str) -> Popen: + def _launch_agents(self, agent_dir: Path, server_location: str) -> None: """Actually launch the agents. 
This runs a local process, either to start a single agent locally, or on a @@ -190,6 +160,8 @@ def _launch_agents(self, agent_dir: Path, server_location: str) -> Popen: server_location: MAPServer network location string for the agents to connect to """ + _logger.info('Launching MUSCLE agents...') + python = sys.executable if not python: raise RuntimeError( @@ -203,5 +175,50 @@ def _launch_agents(self, agent_dir: Path, server_location: str) -> Popen: args = global_resources.agent_launch_command(args) + self._agents_stdout = (agent_dir / 'agent_launch.out').open('a') + self._agents_stderr = (agent_dir / 'agent_launch.err').open('a') + _logger.debug(f'Launching agents using {args}') - return Popen(args, cwd=agent_dir) + self._agents_process = Popen( + args, cwd=agent_dir, stdout=self._agents_stdout, + stderr=self._agents_stderr) + + expected_nodes = global_resources().nodes + + resources_complete = False + while not resources_complete: + sleep(0.1) + with self._resources_lock: + resources_complete = len(self._nodes) == len(expected_nodes) + too_many_agents = len(self._nodes) > len(expected_nodes) + + _logger.debug(f'{len(self._nodes)} agents up of {len(expected_nodes)}') + + if self._agents_process.poll() is not None: + msg = ( + 'Agents unexpectedly stopped running. This is not supposed' + ' to happen. Please see the agent log for more information,' + ' and please file an issue on GitHub.') + _logger.error(msg) + raise RuntimeError(msg) + + if too_many_agents: + msg = ( + 'More agents were started than MUSCLE3 asked for. This is not' + ' supposed to happen. Please file an issue on GitHub, with the' + ' SLURM version (use "sbatch -v") and the sbatch command used' + ' to submit the job.') + _logger.error(msg) + raise RuntimeError(msg) + + _logger.info(f'All agents running on {self._nodes}') + + if sorted(expected_nodes) != sorted(self._nodes): + _logger.error( + 'Agent-reported node hostnames do not match what we got from the' + ' resource manager.') + _logger.error( + 'According to the resource manager, we have' + f' {sorted(expected_nodes)}') + _logger.error( + f'The agents are reporting {sorted(self._nodes)}') From 9a94ed1fc54fdf20d132db9ab385df70bc1fb2dd Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sat, 30 Nov 2024 20:03:48 +0100 Subject: [PATCH 27/49] Fix global resources log output --- .../native_instantiator/agent_manager.py | 2 +- .../native_instantiator/global_resources.py | 21 +++++++++++++++++-- .../native_instantiator.py | 6 +++--- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py index c42b96c0..a6e249a6 100644 --- a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py +++ b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py @@ -173,7 +173,7 @@ def _launch_agents(self, agent_dir: Path, server_location: str) -> None: sys.executable, '-m', 'libmuscle.native_instantiator.agent', server_location, str(log_level)] - args = global_resources.agent_launch_command(args) + args = global_resources().agent_launch_command(args) self._agents_stdout = (agent_dir / 'agent_launch.out').open('a') self._agents_stderr = (agent_dir / 'agent_launch.err').open('a') diff --git a/libmuscle/python/libmuscle/native_instantiator/global_resources.py b/libmuscle/python/libmuscle/native_instantiator/global_resources.py index aea612e1..e3c12e02 100644 --- a/libmuscle/python/libmuscle/native_instantiator/global_resources.py +++ 
b/libmuscle/python/libmuscle/native_instantiator/global_resources.py @@ -1,7 +1,7 @@ from enum import Enum import logging from socket import gethostname -from typing import List +from typing import List, Optional import psutil @@ -64,9 +64,26 @@ def agent_launch_command(self, agent_cmd: List[str]) -> List[str]: return agent_cmd -global_resources = GlobalResources() +_global_resources: Optional[GlobalResources] = None """Global resources object. This is a singleton, and that's fine because it's created once and then read-only. Also, it's used in two places, and making two objects logs everything twice which is annoying. """ + + +def global_resources() -> GlobalResources: + """Wrapper for _global_resources. + + This is here to ensure that the object gets created after we've configured logging, + so that the log output it generates actually ends up in the manager log. + + The users are all in the main thread of the NativeInstantiator background process, + so there's no need for a lock right now. + """ + global _global_resources + + if _global_resources is None: + _global_resources = GlobalResources() + + return _global_resources diff --git a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py index 0de23936..bc90cb3b 100644 --- a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py +++ b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py @@ -311,7 +311,7 @@ def _send_resources(self) -> None: agent_cores = self._agent_manager.get_resources() env_ncores = dict( - zip(global_resources.nodes, global_resources.cores_per_node) + zip(global_resources().nodes, global_resources().cores_per_node) ) for node in env_ncores: @@ -360,7 +360,7 @@ def _instantiate(self, request: InstantiationRequest) -> None: rankfile = request.instance_dir / 'rankfile' - if global_resources.on_cluster(): + if global_resources().on_cluster(): rankfile_contents, resource_env = prep_resources( request.implementation.execution_model, request.resources, rankfile) @@ -399,7 +399,7 @@ def _write_run_script( else: run_script = make_script( request.implementation, request.res_req, - not global_resources.on_cluster(), rankfile) + not global_resources().on_cluster(), rankfile) run_script_file = request.instance_dir / 'run_script.sh' From 21e68b4db48cdadf18811ea14f771a9e9e9a8848 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sat, 30 Nov 2024 20:04:28 +0100 Subject: [PATCH 28/49] Fix planner predictability and add some logging --- libmuscle/python/libmuscle/planner/planner.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/planner/planner.py b/libmuscle/python/libmuscle/planner/planner.py index 2d63828e..612a89a2 100644 --- a/libmuscle/python/libmuscle/planner/planner.py +++ b/libmuscle/python/libmuscle/planner/planner.py @@ -550,6 +550,8 @@ def allocate_all( """ result: Dict[Reference, Resources] = {} + _logger.debug(f'Planning on resources {self._all_resources}') + # Analyse model model = ModelGraph(configuration.model) requirements = configuration.resources @@ -570,6 +572,7 @@ def allocate_all( unallocated_instances, requirements) for instance in to_allocate: + _logger.debug(f'Placing {instance}') component = model.component(instance.without_trailing_ints()) conflicting_names = self._conflicting_names( model, exclusive, component, instance) @@ -735,6 +738,7 @@ def _allocate_instance( if other in simultaneous_instances: free_resources -= 
self._allocations[other] + _logger.debug(f'Free resources: {free_resources}') try: if isinstance(requirements, ThreadedResReq): allocation = self._allocate_thread_block( @@ -788,7 +792,9 @@ def _allocate_thread_block( """ for node in free_resources.nodes(): if len(free_resources.cores[node]) >= threads: - available_cores = sorted(free_resources.cores[node]) + available_cores = sorted(free_resources.cores[node], key=sorted) + _logger.debug(f'available cores: {available_cores}') to_reserve = set(available_cores[:threads]) + _logger.debug(f'assigned {to_reserve}') return Resources({node: to_reserve}) raise InsufficientResourcesAvailable() From 8e5e7eb1705af56be29c20bdb783db15f9346d7f Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sat, 30 Nov 2024 20:05:16 +0100 Subject: [PATCH 29/49] Improve assertion precision if it fails --- integration_test/cluster_test/test_cluster.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/integration_test/cluster_test/test_cluster.py b/integration_test/cluster_test/test_cluster.py index b9b08b6e..57ef408c 100644 --- a/integration_test/cluster_test/test_cluster.py +++ b/integration_test/cluster_test/test_cluster.py @@ -173,13 +173,14 @@ def test_multiple( assert sched.get_exit_code(job_id) == 0 for i in range(1, 7): - out = _get_stdout(remote_out_dir, 'multiple', mode, f'c{i}') + instance = f'c{i}' + out = _get_stdout(remote_out_dir, 'multiple', mode, instance) if mode == 'local': assert out.split('\n')[0] == 'headnode' else: node, hwthreads, _ = out.split('\n') - assert node == f'node-{(i - 1) // 2}' - assert hwthread_to_core(hwthreads) == [(i - 1) % 2] + assert (instance, node) == (instance, f'node-{(i - 1) // 2}') + assert (instance, hwthread_to_core(hwthreads)) == (instance, [(i - 1) % 2]) @skip_unless_cluster From 2601d877ad8c5a2a7f0f4d0db0475e0a13393704 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sat, 30 Nov 2024 22:07:57 +0100 Subject: [PATCH 30/49] Fix global/local core/cpu confusion on SMT systems --- .../native_instantiator/agent_manager.py | 2 +- .../native_instantiator/global_resources.py | 12 ++++--- .../native_instantiator.py | 33 ++++++++++++------- .../libmuscle/native_instantiator/slurm.py | 4 +-- 4 files changed, 32 insertions(+), 19 deletions(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py index a6e249a6..39d9a648 100644 --- a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py +++ b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py @@ -50,7 +50,7 @@ def __init__(self, agent_dir: Path) -> None: def get_resources(self) -> Dict[str, List[FrozenSet[int]]]: """Return detected resources. - This returns a list of tuples of logical hwthread ids for each core per node. + This returns a list of sets of logical hwthread ids per core, per node. Called by NativeInstantiator. """ diff --git a/libmuscle/python/libmuscle/native_instantiator/global_resources.py b/libmuscle/python/libmuscle/native_instantiator/global_resources.py index e3c12e02..1053a717 100644 --- a/libmuscle/python/libmuscle/native_instantiator/global_resources.py +++ b/libmuscle/python/libmuscle/native_instantiator/global_resources.py @@ -26,7 +26,8 @@ class GlobalResources: Attributes: scheduler: The HPC scheduler we're running under, if any. nodes: List of hostnames of available nodes to run on. - cores_per_node: Number of cores available on each node. List alongside nodes. 
+ logical_cpus_per_node: Number of cores available on each node. + List alongside nodes. """ def __init__(self) -> None: """Create a GlobalResources. @@ -38,16 +39,17 @@ def __init__(self) -> None: _logger.info('Detected a SLURM allocation') self.scheduler = Scheduler.SLURM self.nodes = slurm.get_nodes() - self.cores_per_node = slurm.get_cores_per_node() + self.logical_cpus_per_node = slurm.get_logical_cpus_per_node() _logger.info( f'We have {len(self.nodes)} nodes and a total of' - f' {sum(self.cores_per_node)} cores available') + f' {sum(self.logical_cpus_per_node)} logical CPUs available') else: _logger.info('Running locally without a cluster scheduler') self.scheduler = Scheduler.NONE self.nodes = [gethostname()] - self.cores_per_node = [psutil.cpu_count(logical=False)] - _logger.info(f'We have {self.cores_per_node[0]} cores available') + self.logical_cpus_per_node = [psutil.cpu_count(logical=True)] + _logger.info( + f'We have {self.logical_cpus_per_node[0]} logical CPUS available') def on_cluster(self) -> bool: """Return whether we're running on a cluster.""" diff --git a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py index bc90cb3b..bccaabbc 100644 --- a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py +++ b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py @@ -306,15 +306,16 @@ def _send_resources(self) -> None: step outside our bounds even if the cluster doesn't constrain processes to their assigned processors. """ + already_logged_smt = False resources = Resources() agent_cores = self._agent_manager.get_resources() - env_ncores = dict( - zip(global_resources().nodes, global_resources().cores_per_node) + env_ncpus = dict( + zip(global_resources().nodes, global_resources().logical_cpus_per_node) ) - for node in env_ncores: + for node in env_ncpus: if node not in agent_cores: _logger.warning( f'The environment suggests we should have node {node},' @@ -323,26 +324,36 @@ def _send_resources(self) -> None: else: resources.cores[node] = set(agent_cores[node]) - env_nncores = env_ncores[node] + env_nncpus = env_ncpus[node] ag_nncores = len(agent_cores[node]) - if ag_nncores < env_nncores: + ag_nnthreads = sum((len(ts) for ts in agent_cores[node])) + + if ag_nncores != ag_nnthreads and ag_nnthreads == env_nncpus: + if not already_logged_smt: + _logger.info( + 'Detected SMT (hyperthreading) as available and' + ' enabled. Note that MUSCLE3 will assign whole cores to' + ' each thread or MPI process.') + already_logged_smt = True + + elif ag_nncores < env_nncpus: _logger.warning( - f'Node {node} should have {env_nncores} cores available,' + f'Node {node} should have {env_nncpus} cores available,' f' but the agent reports only {ag_nncores} available to it.' f' We\'ll use the {ag_nncores} we seem to have.') resources.cores[node] = set(agent_cores[node]) - elif env_nncores < ag_nncores: + elif env_nncpus < ag_nncores: _logger.warning( - f'Node {node} should have {env_nncores} cores available,' + f'Node {node} should have {env_nncpus} cores available,' f' but the agent reports {ag_nncores} available to it.' ' Maybe the cluster does not constrain resources? 
We\'ll' - f' use the {env_nncores} that we should have got.') - resources.cores[node] = set(agent_cores[node][:env_nncores]) + f' use the {env_nncpus} that we should have got.') + resources.cores[node] = set(agent_cores[node][:env_nncpus]) for node in agent_cores: - if node not in env_ncores: + if node not in env_ncpus: _logger.warning( f'An agent is running on node {node} but the environment' ' does not list it as ours. It seems that the node\'s' diff --git a/libmuscle/python/libmuscle/native_instantiator/slurm.py b/libmuscle/python/libmuscle/native_instantiator/slurm.py index 0c726cd2..f61a02e8 100644 --- a/libmuscle/python/libmuscle/native_instantiator/slurm.py +++ b/libmuscle/python/libmuscle/native_instantiator/slurm.py @@ -250,8 +250,8 @@ def get_nodes() -> List[str]: return parse_slurm_nodelist(nodelist) -def get_cores_per_node() -> List[int]: - """Return the number of CPU cores per node. +def get_logical_cpus_per_node() -> List[int]: + """Return the number of logical CPU cores per node. This returns a list with the number of cores of each node in the result of get_nodes(), which gets read from SLURM_JOB_CPUS_PER_NODE. From 39a3c4402b5d70501a101991471d773225a6eb9d Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 1 Dec 2024 17:23:24 +0100 Subject: [PATCH 31/49] Improve compatibility with older SLURM versions --- .../native_instantiator/global_resources.py | 2 +- .../native_instantiator/run_script.py | 51 +++--- .../libmuscle/native_instantiator/slurm.py | 168 ++++++++++++------ 3 files changed, 143 insertions(+), 78 deletions(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/global_resources.py b/libmuscle/python/libmuscle/native_instantiator/global_resources.py index 1053a717..4b1e28c7 100644 --- a/libmuscle/python/libmuscle/native_instantiator/global_resources.py +++ b/libmuscle/python/libmuscle/native_instantiator/global_resources.py @@ -5,7 +5,7 @@ import psutil -from libmuscle.native_instantiator import slurm +from libmuscle.native_instantiator.slurm import slurm _logger = logging.getLogger(__name__) diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py index 042f897d..8be23d3d 100644 --- a/libmuscle/python/libmuscle/native_instantiator/run_script.py +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -2,6 +2,7 @@ from typing import Dict, FrozenSet, List, Optional, Tuple from libmuscle.errors import ConfigurationError +from libmuscle.native_instantiator.slurm import slurm from libmuscle.planner.planner import Resources from ymmsl import ( ExecutionModel, Implementation, MPICoresResReq, MPINodesResReq, @@ -221,32 +222,42 @@ def cluster_command(implementation: Implementation) -> str: if implementation.execution_model == ExecutionModel.DIRECT: fstr = 'taskset $MUSCLE_BIND_MASK {command} {args}' elif implementation.execution_model == ExecutionModel.OPENMPI: - # Native name is orterun for older and prterun for newer OpenMPI. - # So we go with mpirun, which works for either. - fstr = ( - 'mpirun -v -np $MUSCLE_MPI_PROCESSES' - ' -d --debug-daemons' - ' --rankfile $MUSCLE_RANKFILE --use-hwthread-cpus --oversubscribe' - # ' --map-by rankfile:file=$MUSCLE_RANKFILE:oversubscribe' - # ' --display-map --display-allocation {command} {args}' - - # This adds the given option to the srun command used by mpirun to - # launch its daemons. 
mpirun specifies --exclusive, which on SLURM <= - # 21-08 causes SLURM to wait for our agents to quit, as it considers - # them to be occupying the cores, causing a deadlock. Fortunately, it - # seems that adding --overlap overrides the --exclusive and it works. - ' -mca plm_slurm_args "--overlap"' - ' --bind-to core --display-map --display-allocation {command} {args}' - ) + fargs = [ + # Native name is orterun for older and prterun for newer OpenMPI. + # So we go with mpirun, which works for either. + 'mpirun -v -np $MUSCLE_MPI_PROCESSES', + '-d --debug-daemons', + '--rankfile $MUSCLE_RANKFILE --use-hwthread-cpus --oversubscribe' + ] + + if slurm.quirks.overlap: + # This adds the given option to the srun command used by mpirun to launch + # its daemons. mpirun specifies --exclusive, which on SLURM <= 21-08 causes + # SLURM to wait for our agents to quit, as it considers them to be occupying + # the cores, causing a deadlock. Fortunately, it seems that adding --overlap + # overrides the --exclusive and it works. + fargs.append('-mca plm_slurm_args "--overlap"') + + fargs.extend([ + '--bind-to core --display-map --display-allocation {command} {args}']) + + fstr = ' '.join(fargs) + elif implementation.execution_model == ExecutionModel.INTELMPI: fstr = ( 'mpirun -n $MUSCLE_MPI_PROCESSES -machinefile $MUSCLE_RANKFILE' ' {command} {args}') elif implementation.execution_model == ExecutionModel.SRUNMPI: # TODO: set SLURM_CPU_BIND_VERBOSE for verbose output - fstr = ( - 'srun -n $MUSCLE_MPI_PROCESSES -m arbitrary --overlap' - ' --cpu-bind=$SLURM_CPU_BIND {command} {args}') + fargs = ['srun -n $MUSCLE_MPI_PROCESSES -m arbitrary'] + + if slurm.quirks.overlap: + fargs.append('--overlap') + + fargs.append(f'{slurm.quirks.cpu_bind}=$SLURM_CPU_BIND {{command}} {{args}}') + + fstr = ' '.join(fargs) + # elif implementation.execution_model == ExecutionModel.MPICH # fstr = 'mpiexec -n $MUSCLE_MPI_PROCESSES -f $MUSCLE_RANKFILE {command} {args}' diff --git a/libmuscle/python/libmuscle/native_instantiator/slurm.py b/libmuscle/python/libmuscle/native_instantiator/slurm.py index f61a02e8..a6286ee0 100644 --- a/libmuscle/python/libmuscle/native_instantiator/slurm.py +++ b/libmuscle/python/libmuscle/native_instantiator/slurm.py @@ -3,6 +3,7 @@ import os from parsimonious import Grammar, NodeVisitor from parsimonious.nodes import Node +import subprocess from typing import Any, cast, List, Sequence, Tuple @@ -222,77 +223,130 @@ def parse_slurm_nodes_cores(s: str) -> List[int]: return cast(List[int], _nce_visitor.visit(ast)) -def in_slurm_allocation() -> bool: - """Check whether we're in a SLURM allocation. +class SlurmQuirks: + """Collects features of the present SLURM.""" + overlap: bool + """True iff --overlap must be specified for srun.""" + cpu_bind: str + """CPU binding argument, --cpu-bind or --cpu_bind.""" - Returns true iff SLURM was detected. - """ - return 'SLURM_JOB_ID' in os.environ +class SlurmInfo: + """Detects and holds information about the present SLURM scheduler.""" + def __init__(self) -> None: + if self.in_slurm_allocation(): + self.version = self._slurm_version() + self.quirks = SlurmQuirks() -def get_nodes() -> List[str]: - """Get a list of node names from SLURM_JOB_NODELIST. + self.quirks.overlap = self.version > (20, 2) + self.quirks.cpu_bind = ( + '--cpu-bind' if self.version > (17, 2) else '--cpu_bind') - This inspects SLURM_JOB_NODELIST or SLURM_NODELIST and returns an - expanded list of node names. + def in_slurm_allocation(self) -> bool: + """Check whether we're in a SLURM allocation. 
- If SLURM_JOB_NODELIST is "node[020-023]" then this returns - ["node020", "node021", "node022", "node023"]. - """ - nodelist = os.environ.get('SLURM_JOB_NODELIST') - if not nodelist: - nodelist = os.environ.get('SLURM_NODELIST') - if not nodelist: - raise RuntimeError('SLURM_(JOB_)NODELIST not set, are we running locally?') + Returns true iff SLURM was detected. + """ + return 'SLURM_JOB_ID' in os.environ - _logger.debug(f'SLURM node list: {nodelist}') + def get_nodes(self) -> List[str]: + """Get a list of node names from SLURM_JOB_NODELIST. - return parse_slurm_nodelist(nodelist) + This inspects SLURM_JOB_NODELIST or SLURM_NODELIST and returns an + expanded list of node names. + If SLURM_JOB_NODELIST is "node[020-023]" then this returns + ["node020", "node021", "node022", "node023"]. + """ + nodelist = os.environ.get('SLURM_JOB_NODELIST') + if not nodelist: + nodelist = os.environ.get('SLURM_NODELIST') + if not nodelist: + raise RuntimeError('SLURM_(JOB_)NODELIST not set, are we running locally?') -def get_logical_cpus_per_node() -> List[int]: - """Return the number of logical CPU cores per node. + _logger.debug(f'SLURM node list: {nodelist}') - This returns a list with the number of cores of each node in the result of - get_nodes(), which gets read from SLURM_JOB_CPUS_PER_NODE. - """ - sjcpn = os.environ.get('SLURM_JOB_CPUS_PER_NODE') - _logger.debug(f'SLURM_JOB_CPUS_PER_NODE: {sjcpn}') + return parse_slurm_nodelist(nodelist) - if sjcpn: - return parse_slurm_nodes_cores(sjcpn) - else: - scon = os.environ.get('SLURM_CPUS_ON_NODE') - _logger.debug(f'SLURM_CPUS_ON_NODE: {scon}') + def get_logical_cpus_per_node(self) -> List[int]: + """Return the number of logical CPU cores per node. - snn = os.environ.get('SLURM_JOB_NUM_NODES') - if not snn: - snn = os.environ.get('SLURM_NNODES') - _logger.debug(f'SLURM num nodes: {snn}') + This returns a list with the number of cores of each node in the result of + get_nodes(), which gets read from SLURM_JOB_CPUS_PER_NODE. + """ + sjcpn = os.environ.get('SLURM_JOB_CPUS_PER_NODE') + _logger.debug(f'SLURM_JOB_CPUS_PER_NODE: {sjcpn}') - if scon and snn: - return [int(scon)] * int(snn) + if sjcpn: + return parse_slurm_nodes_cores(sjcpn) + else: + scon = os.environ.get('SLURM_CPUS_ON_NODE') + _logger.debug(f'SLURM_CPUS_ON_NODE: {scon}') - raise RuntimeError( - 'SLURM_JOB_CPUS_PER_NODE is not set in the environment, and also' - ' SLURM_CPUS_ON_NODE is missing or neither SLURM_JOB_NUM_NODES nor' - ' SLURM_NNODES is set. Please create an issue on GitHub with the output' - ' of "sbatch --version" on this cluster.') + snn = os.environ.get('SLURM_JOB_NUM_NODES') + if not snn: + snn = os.environ.get('SLURM_NNODES') + _logger.debug(f'SLURM num nodes: {snn}') + if scon and snn: + return [int(scon)] * int(snn) -def agent_launch_command(agent_cmd: List[str], nnodes: int) -> List[str]: - """Return a command for launching one agent on each node. + raise RuntimeError( + 'SLURM_JOB_CPUS_PER_NODE is not set in the environment, and also' + ' SLURM_CPUS_ON_NODE is missing or neither SLURM_JOB_NUM_NODES nor' + ' SLURM_NNODES is set. Please create an issue on GitHub with the output' + ' of "sbatch --version" on this cluster.') - Args: - agent_cmd: A command that will start the agent. - """ - # TODO: On the latest Slurm, there's a special command for this that we should use - # if we have that, --external-launcher. Poorly documented though, so will require - # some experimentation. 
- - # On SLURM <= 23-02, the number of tasks is inherited by srun from sbatch rather - # than calculated anew from --nodes and --ntasks-per-node, so we specify it - # explicitly to avoid getting an agent per logical cpu rather than per node. - return [ - 'srun', f'--nodes={nnodes}', f'--ntasks={nnodes}', '--ntasks-per-node=1', - '--overlap'] + agent_cmd + def agent_launch_command(self, agent_cmd: List[str], nnodes: int) -> List[str]: + """Return a command for launching one agent on each node. + + Args: + agent_cmd: A command that will start the agent. + """ + # TODO: On the latest Slurm, there's a special command for this that we should use + # if we have that, --external-launcher. Poorly documented though, so will require + # some experimentation. + + # On SLURM <= 23-02, the number of tasks is inherited by srun from sbatch rather + # than calculated anew from --nodes and --ntasks-per-node, so we specify it + # explicitly to avoid getting an agent per logical cpu rather than per node. + srun_cmd = [ + 'srun', f'--nodes={nnodes}', f'--ntasks={nnodes}', + '--ntasks-per-node=1' + ] + + if self.quirks.overlap: + srun_cmd.append('--overlap') + + return srun_cmd + agent_cmd + + def _slurm_version(self) -> Tuple[int, int]: + """Obtains current version of SLURM from srun -v. + + This returns only the first two numbers, hopefully there won't be any changes in + behaviour within a release series. + """ + proc = subprocess.run( + ['srun', '--version'], check=True, capture_output=True, text=True, + encoding='utf-8' + ) + + output = proc.stdout.strip().split() + if len(output) < 2: + raise RuntimeError( + f'Unexpected srun version output "{output}". MUSCLE3 does not know' + ' how to run on this version of SLURM. Please file an issue on' + ' GitHub.') + + version_str = output[1] + version = version_str.split('.') + if len(version) < 2: + _logger.error(f'srun produced unexpected version {version_str}') + raise RuntimeError( + f'Unexpected srun version output "{output}". MUSCLE3 does not know' + ' how to run on this version of SLURM. Please file an issue on' + ' GitHub.') + return int(version[0]), int(version[1]) + + +slurm = SlurmInfo() From f27757661927d3a922042ee18b15cc590f68b0ba Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 1 Dec 2024 17:23:49 +0100 Subject: [PATCH 32/49] Add older SLURM versions to test setup --- integration_test/cluster_test/conftest.py | 38 +++++++++++++-- integration_test/fake_cluster/old.Dockerfile | 50 ++++++++++++++++++++ 2 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 integration_test/fake_cluster/old.Dockerfile diff --git a/integration_test/cluster_test/conftest.py b/integration_test/cluster_test/conftest.py index 19eb7ebe..721d12dd 100644 --- a/integration_test/cluster_test/conftest.py +++ b/integration_test/cluster_test/conftest.py @@ -15,7 +15,10 @@ REMOTE_SHARED = '/home/cerulean/shared' -IDX_SLURM_VERSIONS = list(enumerate(['23-11'])) +IDX_SLURM_VERSIONS = list(enumerate([ + '17-02', '17-11', '18-08', '19-05', '20-02', '20-11', '21-08', '22-05', '23-02', + '23-11' + ])) # Shut down the containers after running the tests. Set to False to debug. 
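(Illustrative cross-check, not part of this patch.) The version list above feeds the SlurmQuirks thresholds introduced in the previous commit; since SlurmInfo._slurm_version() returns a (major, minor) tuple, the checks are plain element-wise tuple comparisons:

    # Versions as (major, minor) tuples, matching SlurmInfo._slurm_version().
    for version in [(17, 2), (17, 11), (20, 2), (20, 11), (21, 8), (23, 11)]:
        overlap = version > (20, 2)                     # srun gets --overlap
        spelling = '--cpu-bind' if version > (17, 2) else '--cpu_bind'
        print(version, overlap, spelling)
    # (17, 2) False --cpu_bind
    # (17, 11) False --cpu-bind
    # (20, 2) False --cpu-bind
    # (20, 11) True --cpu-bind
    # (21, 8) True --cpu-bind
    # (23, 11) True --cpu-bind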
CLEAN_UP_CONTAINERS = True @@ -57,6 +60,25 @@ def fake_cluster_image(local_term): ' -f integration_test/fake_cluster/Dockerfile .')) +@pytest.fixture(scope='session') +def fake_cluster_image_old(local_term): + run_cmd(local_term, 5400, ( + f'docker buildx build -t {IMAGE_NAME}_old' + ' -f integration_test/fake_cluster/old.Dockerfile .')) + + +def _image_name(slurm_version): + if slurm_version <= '20-02': + return IMAGE_NAME + '_old' + return IMAGE_NAME + + +def _gcc_version(slurm_version): + if slurm_version <= '20-02': + return '7.5.0' + return '11.4.0' + + def ssh_term(port, timeout_msg): cred = cerulean.PasswordCredential('cerulean', 'kingfisher') ready = False @@ -99,21 +121,25 @@ def _start_nodes(local_term, slurm_version, net_name, shared_dir): for i in range(5): node_name = f'node-{i}' + image_name = _image_name(slurm_version) + run_cmd(local_term, 60, ( f'docker run -d --name={node_name}-{slurm_version} --hostname={node_name}' f' --network={net_name} --cap-add=CAP_SYS_NICE' f' --env SLURM_VERSION={slurm_version}' f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}' - f' {IMAGE_NAME}')) + f' {image_name}')) def _start_headnode(local_term, slurm_version, net_name, shared_dir, headnode_port): + image_name = _image_name(slurm_version) + run_cmd(local_term, 60, ( f'docker run -d --name=headnode-{slurm_version} --hostname=headnode' f' --network={net_name} -p {headnode_port}:22' f' --env SLURM_VERSION={slurm_version}' f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}' - f' {IMAGE_NAME}')) + f' {image_name}')) ssh_term(headnode_port, 'Virtual cluster container start timed out') @@ -179,7 +205,9 @@ def _install_muscle3_native_openmpi( f'/opt/spack/bin/spack find --format \\"{{version}}\\" /{openmpi_hash}' '"')).strip() - module_name = f'openmpi/{openmpi_version}-gcc-11.4.0-{openmpi_hash[:7]}' + gcc_version = _gcc_version(slurm_version) + + module_name = f'openmpi/{openmpi_version}-gcc-{gcc_version}-{openmpi_hash[:7]}' logger_.info(f'Slurm {slurm_version} and module {module_name}') @@ -241,7 +269,7 @@ def _clean_up_base_cluster(local_term, slurm_version): @pytest.fixture(scope='session', params=IDX_SLURM_VERSIONS) def installed_cluster( - request, cleanup_docker, fake_cluster_image, shared_dir, + request, cleanup_docker, fake_cluster_image, fake_cluster_image_old, shared_dir, repo_root, local_term): slurm_version = request.param[1] diff --git a/integration_test/fake_cluster/old.Dockerfile b/integration_test/fake_cluster/old.Dockerfile new file mode 100644 index 00000000..700075c7 --- /dev/null +++ b/integration_test/fake_cluster/old.Dockerfile @@ -0,0 +1,50 @@ +FROM ghcr.io/naturalhpc/cerulean-fake-slurm-base-old:latest +# FROM naturalhpc/cerulean-fake-slurm-base-old:latest + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@2.0.0 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@17-02 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@17-02) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@2.0.0 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@17-11 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@17-11) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . 
$(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@2.0.0 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@18-08 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@18-08) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@2.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@19-05 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@19-05) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@2.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@20-02 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@20-02) + +# RUN . /opt/spack/share/spack/setup-env.sh && \ +# . $(spack location -i lmod)/lmod/lmod/init/bash && \ +# spack install mpich+slurm pmi=pmix ^pmix@3.2.3 + +# RUN . /opt/spack/share/spack/setup-env.sh && \ +# . $(spack location -i lmod)/lmod/lmod/init/bash && \ +# spack install intel-oneapi-mpi ^pmix@3.2.3 + +# Disable ssh debug output +RUN sed -i -e 's/^LogLevel DEBUG3$//' /etc/ssh/sshd_config +RUN sed -i -e 's^Subsystem sftp /usr/lib/openssh/sftp-server -l DEBUG3^Subsystem sftp /usr/lib/openssh/sftp-server^' /etc/ssh/sshd_config + + +RUN apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /home/cerulean + From ceae033537aab2f49d19b69815ab058a7c33c211 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 1 Dec 2024 18:22:12 +0100 Subject: [PATCH 33/49] Fix planner tests after SMT updates (oops!) --- .../planner/test/test_planner_scenarios.py | 369 +++++++++--------- 1 file changed, 194 insertions(+), 175 deletions(-) diff --git a/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py b/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py index cf6067d4..6a4a2a95 100644 --- a/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py +++ b/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py @@ -1,7 +1,7 @@ from copy import deepcopy from libmuscle.planner.planner import ModelGraph, Planner, Resources -from typing import Dict, Tuple +from typing import Dict, FrozenSet, Tuple import pytest from ymmsl import ( @@ -9,6 +9,11 @@ MPICoresResReq, Ports, Reference, ResourceRequirements, ThreadedResReq) +def c(hwthread_id: int) -> FrozenSet[int]: + """Helper that defines a core with the given hwthread id.""" + return frozenset({hwthread_id}) + + _ResReqs = Dict[Reference, ResourceRequirements] @@ -38,12 +43,12 @@ s0_model, None, s0_implementations, s0_requirements) -s0_resources = Resources({'node001': {0, 1, 2, 3}}) +s0_resources = Resources({'node001': {c(0), c(1), c(2), c(3)}}) s0_solution = { - Reference('macro'): Resources({'node001': {0, 1}}), - Reference('micro'): Resources({'node001': {2, 3}})} + Reference('macro'): Resources({'node001': {c(0), c(1)}}), + Reference('micro'): Resources({'node001': {c(2), c(3)}})} s1_model = Model( @@ -83,14 +88,14 @@ s1_model, None, s1_implementations, s1_requirements) -s1_resources = Resources({'node001': {0, 1, 2, 3}}) +s1_resources = Resources({'node001': {c(0), c(1), c(2), c(3)}}) s1_solution = { - Reference('macro'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro1'): Resources({'node001': {0, 1}}), - Reference('micro2'): Resources({'node001': {0, 1}}), - Reference('micro3'): Resources({'node001': {0}})} + Reference('macro'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + 
Reference('micro1'): Resources({'node001': {c(0), c(1)}}), + Reference('micro2'): Resources({'node001': {c(0), c(1)}}), + Reference('micro3'): Resources({'node001': {c(0)}})} s2_model = Model( @@ -125,13 +130,14 @@ s2_model, None, s2_implementations, s2_requirements) -s2_resources = Resources({'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}}) +s2_resources = Resources( + {'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}}) s2_solution = { - Reference('macro'): Resources({'node001': {0}}), - Reference('micro1'): Resources({'node001': {0, 1, 2}}), - Reference('micro2'): Resources({'node002': {0, 1}})} + Reference('macro'): Resources({'node001': {c(0)}}), + Reference('micro1'): Resources({'node001': {c(0), c(1), c(2)}}), + Reference('micro2'): Resources({'node002': {c(0), c(1)}})} s3_model = Model( @@ -170,14 +176,16 @@ s3_model, None, s3_implementations, s3_requirements) -s3_resources = Resources({'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}}) +s3_resources = Resources( + {'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}}) s3_solution = { - Reference('a'): Resources({'node001': {0}}), - Reference('b1'): Resources({'node001': {2, 3}, 'node002': {0, 1, 2, 3}}), - Reference('b2'): Resources({'node001': {0, 1}}), - Reference('c'): Resources({'node001': {0, 1, 2, 3}})} + Reference('a'): Resources({'node001': {c(0)}}), + Reference('b1'): Resources( + {'node001': {c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}}), + Reference('b2'): Resources({'node001': {c(0), c(1)}}), + Reference('c'): Resources({'node001': {c(0), c(1), c(2), c(3)}})} s4_model = Model( @@ -213,13 +221,14 @@ s4_model, None, s4_implementations, s4_requirements) -s4_resources = Resources({'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}}) +s4_resources = Resources( + {'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}}) s4_solution = { - Reference('macro1'): Resources({'node002': {0, 1}}), - Reference('macro2'): Resources({'node001': {0, 1, 2}}), - Reference('micro'): Resources({'node001': {0, 1, 2}})} + Reference('macro1'): Resources({'node002': {c(0), c(1)}}), + Reference('macro2'): Resources({'node001': {c(0), c(1), c(2)}}), + Reference('micro'): Resources({'node001': {c(0), c(1), c(2)}})} s5_model = Model( @@ -262,17 +271,18 @@ s5_resources = Resources({ - 'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}, 'node003': {0, 1}}) + 'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}, + 'node003': {c(0), c(1)}}) # This is inefficient, as the models can all share resources. But repeater # is funny, and the algorithm cannot deal with it yet. It does give a valid # result with no overlap, so we'll accept that for the time being. 
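(Illustration, not part of this patch.) After the SMT changes earlier in this series, a core in these expected solutions is a frozenset of hardware thread ids, and the c() helper at the top of this file wraps a single hwthread. Using the imports at the top of this test module, and assuming a hypothetical 2-way SMT core for the second entry:

    # Two cores on node001: a plain core with one hwthread, and an assumed
    # SMT-2 core carrying two hwthreads. The generated fixtures below only use
    # single-hwthread cores, which is why c() is all they need.
    example = Resources({'node001': {c(0), frozenset({2, 3})}})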
s5_solution = { - Reference('init'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro'): Resources({'node002': {0, 1, 2, 3}}), - Reference('repeater'): Resources({'node003': {0}})} + Reference('init'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('macro'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('micro'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('repeater'): Resources({'node003': {c(0)}})} s6_model = Model( @@ -309,21 +319,21 @@ s6_resources = Resources({ - 'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}, - 'node003': {0, 1, 2, 3}, 'node004': {0, 1, 2, 3}, - 'node005': {0, 1, 2, 3}, 'node006': {0, 1, 2, 3} + 'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}, + 'node003': {c(0), c(1), c(2), c(3)}, 'node004': {c(0), c(1), c(2), c(3)}, + 'node005': {c(0), c(1), c(2), c(3)}, 'node006': {c(0), c(1), c(2), c(3)} }) s6_solution = { - Reference('a'): Resources({'node001': {0, 1, 2, 3}}), - Reference('tcf'): Resources({'node002': {0}}), + Reference('a'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('tcf'): Resources({'node002': {c(0)}}), Reference('b'): Resources({ - 'node002': {1, 2, 3}, - 'node003': {0, 1, 2, 3}, - 'node004': {0, 1, 2, 3}, - 'node005': {0, 1, 2, 3}, - 'node006': {0}})} + 'node002': {c(1), c(2), c(3)}, + 'node003': {c(0), c(1), c(2), c(3)}, + 'node004': {c(0), c(1), c(2), c(3)}, + 'node005': {c(0), c(1), c(2), c(3)}, + 'node006': {c(0)}})} s7_model = Model( @@ -365,46 +375,46 @@ s7_resources = Resources({ - 'node001': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node002': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node003': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node004': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node005': {0, 1, 2, 3, 4, 5, 6, 7}, + 'node001': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, + 'node002': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, + 'node003': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, + 'node004': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, + 'node005': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, }) s7_solution = { - Reference('mc'): Resources({'node001': {0}}), - Reference('init[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('init[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('init[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('init[3]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('init[4]'): Resources({'node003': {0, 1, 2, 3}}), - Reference('init[5]'): Resources({'node003': {4, 5, 6, 7}}), - Reference('init[6]'): Resources({'node004': {0, 1, 2, 3}}), - Reference('init[7]'): Resources({'node004': {4, 5, 6, 7}}), - Reference('init[8]'): Resources({'node005': {0, 1, 2, 3}}), - Reference('init[9]'): Resources({'node005': {4, 5, 6, 7}}), - Reference('macro[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('macro[3]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('macro[4]'): Resources({'node003': {0, 1, 2, 3}}), - Reference('macro[5]'): Resources({'node003': {4, 5, 6, 7}}), - Reference('macro[6]'): Resources({'node004': {0, 1, 2, 3}}), - Reference('macro[7]'): Resources({'node004': {4, 5, 6, 7}}), - Reference('macro[8]'): Resources({'node005': {0, 1, 2, 3}}), - Reference('macro[9]'): Resources({'node005': {4, 5, 6, 7}}), - Reference('micro[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro[1]'): Resources({'node001': {4, 5, 6, 
7}}), - Reference('micro[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('micro[3]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('micro[4]'): Resources({'node003': {0, 1, 2, 3}}), - Reference('micro[5]'): Resources({'node003': {4, 5, 6, 7}}), - Reference('micro[6]'): Resources({'node004': {0, 1, 2, 3}}), - Reference('micro[7]'): Resources({'node004': {4, 5, 6, 7}}), - Reference('micro[8]'): Resources({'node005': {0, 1, 2, 3}}), - Reference('micro[9]'): Resources({'node005': {4, 5, 6, 7}})} + Reference('mc'): Resources({'node001': {c(0)}}), + Reference('init[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('init[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('init[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('init[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), + Reference('init[4]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), + Reference('init[5]'): Resources({'node003': {c(4), c(5), c(6), c(7)}}), + Reference('init[6]'): Resources({'node004': {c(0), c(1), c(2), c(3)}}), + Reference('init[7]'): Resources({'node004': {c(4), c(5), c(6), c(7)}}), + Reference('init[8]'): Resources({'node005': {c(0), c(1), c(2), c(3)}}), + Reference('init[9]'): Resources({'node005': {c(4), c(5), c(6), c(7)}}), + Reference('macro[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('macro[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('macro[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('macro[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), + Reference('macro[4]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), + Reference('macro[5]'): Resources({'node003': {c(4), c(5), c(6), c(7)}}), + Reference('macro[6]'): Resources({'node004': {c(0), c(1), c(2), c(3)}}), + Reference('macro[7]'): Resources({'node004': {c(4), c(5), c(6), c(7)}}), + Reference('macro[8]'): Resources({'node005': {c(0), c(1), c(2), c(3)}}), + Reference('macro[9]'): Resources({'node005': {c(4), c(5), c(6), c(7)}}), + Reference('micro[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('micro[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('micro[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('micro[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), + Reference('micro[4]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), + Reference('micro[5]'): Resources({'node003': {c(4), c(5), c(6), c(7)}}), + Reference('micro[6]'): Resources({'node004': {c(0), c(1), c(2), c(3)}}), + Reference('micro[7]'): Resources({'node004': {c(4), c(5), c(6), c(7)}}), + Reference('micro[8]'): Resources({'node005': {c(0), c(1), c(2), c(3)}}), + Reference('micro[9]'): Resources({'node005': {c(4), c(5), c(6), c(7)}})} s8_model = Model( @@ -441,13 +451,14 @@ s8_model, None, s8_implementations, s8_requirements) -s8_resources = Resources({'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}}) +s8_resources = Resources( + {'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}}) s8_solution = { - Reference('macro'): Resources({'node001': {3}}), - Reference('micro1'): Resources({'node001': {0, 1, 2}}), - Reference('micro2'): Resources({'node001': {0, 1}})} + Reference('macro'): Resources({'node001': {c(3)}}), + Reference('micro1'): Resources({'node001': {c(0), c(1), c(2)}}), + Reference('micro2'): Resources({'node001': {c(0), c(1)}})} s9_model = Model( @@ -489,15 +500,15 @@ s9_model, None, s9_implementations, s9_requirements) -s9_resources = 
Resources({'node001': {0, 1, 2, 3}}) +s9_resources = Resources({'node001': {c(0), c(1), c(2), c(3)}}) s9_solution = { - Reference('a'): Resources({'node001': {1}}), - Reference('b'): Resources({'node001': {0}}), - Reference('c'): Resources({'node001': {0}}), - Reference('d'): Resources({'node001': {1}}), - Reference('e'): Resources({'node001': {0}})} + Reference('a'): Resources({'node001': {c(1)}}), + Reference('b'): Resources({'node001': {c(0)}}), + Reference('c'): Resources({'node001': {c(0)}}), + Reference('d'): Resources({'node001': {c(1)}}), + Reference('e'): Resources({'node001': {c(0)}})} s10_model = Model( @@ -542,31 +553,37 @@ s10_resources = Resources({ - 'node001': {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - 'node002': {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - 'node003': {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + 'node001': { + c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7), + c(8),c(9), c(10), c(11), c(12), c(13), c(14), c(15)}, + 'node002': { + c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7), + c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)}, + 'node003': { + c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7), + c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)}, }) s10_solution = { - Reference('mc'): Resources({'node001': {0}}), - Reference('rr'): Resources({'node001': {0}}), - Reference('macro[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro[2]'): Resources({'node001': {8, 9, 10, 11}}), - Reference('macro[3]'): Resources({'node001': {12, 13, 14, 15}}), - Reference('macro[4]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('macro[5]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('macro[6]'): Resources({'node002': {8, 9, 10, 11}}), - Reference('macro[7]'): Resources({'node002': {12, 13, 14, 15}}), - Reference('micro[0]'): Resources({'node001': {0, 1}}), - Reference('micro[1]'): Resources({'node001': {4, 5}}), - Reference('micro[2]'): Resources({'node001': {8, 9}}), - Reference('micro[3]'): Resources({'node001': {12, 13}}), - Reference('micro[4]'): Resources({'node002': {0, 1}}), - Reference('micro[5]'): Resources({'node002': {4, 5}}), - Reference('micro[6]'): Resources({'node002': {8, 9}}), - Reference('micro[7]'): Resources({'node002': {12, 13}})} + Reference('mc'): Resources({'node001': {c(0)}}), + Reference('rr'): Resources({'node001': {c(0)}}), + Reference('macro[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('macro[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('macro[2]'): Resources({'node001': {c(8), c(9), c(10), c(11)}}), + Reference('macro[3]'): Resources({'node001': {c(12), c(13), c(14), c(15)}}), + Reference('macro[4]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('macro[5]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), + Reference('macro[6]'): Resources({'node002': {c(8), c(9), c(10), c(11)}}), + Reference('macro[7]'): Resources({'node002': {c(12), c(13), c(14), c(15)}}), + Reference('micro[0]'): Resources({'node001': {c(0), c(1)}}), + Reference('micro[1]'): Resources({'node001': {c(4), c(5)}}), + Reference('micro[2]'): Resources({'node001': {c(8), c(9)}}), + Reference('micro[3]'): Resources({'node001': {c(12), c(13)}}), + Reference('micro[4]'): Resources({'node002': {c(0), c(1)}}), + Reference('micro[5]'): Resources({'node002': {c(4), c(5)}}), + Reference('micro[6]'): Resources({'node002': {c(8), c(9)}}), + Reference('micro[7]'): Resources({'node002': {c(12), 
c(13)}})} s11_model = Model( @@ -606,24 +623,24 @@ s11_resources = Resources({ - 'node001': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node002': {0, 1, 2, 3, 4, 5, 6, 7}, + 'node001': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, + 'node002': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, }) s11_solution = { - Reference('macro1[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro1[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro1[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('micro1[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro1[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('micro1[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('macro2[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro2[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro2[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('micro2[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro2[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('micro2[2]'): Resources({'node002': {0, 1, 2, 3}}), + Reference('macro1[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('macro1[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('macro1[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('micro1[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('micro1[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('micro1[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('macro2[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('macro2[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('macro2[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('micro2[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('micro2[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('micro2[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), } @@ -646,14 +663,16 @@ s12_solution = { - Reference('macro1'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro1[0]'): Resources({'node001': {0, 1, 2, 3, 4, 5, 6, 7}}), - Reference('micro1[1]'): Resources({'node002': {0, 1, 2, 3, 4, 5, 6, 7}}), - Reference('macro2'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro2[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro2[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('micro2[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('micro2[3]'): Resources({'node002': {4, 5, 6, 7}}), + Reference('macro1'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('micro1[0]'): Resources({'node001': { + c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}}), + Reference('micro1[1]'): Resources({'node002': { + c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}}), + Reference('macro2'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('micro2[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('micro2[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('micro2[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('micro2[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), } @@ -676,58 +695,58 @@ s13_resources = Resources({ - 'node001': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node002': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node003': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node004': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node005': {0, 1, 2, 3, 4, 5, 6, 7}, + 'node001': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, + 'node002': {c(0), c(1), c(2), 
c(3), c(4), c(5), c(6), c(7)}, + 'node003': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, + 'node004': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, + 'node005': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, }) s13_solution = { - Reference('macro1[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro1[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro1[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('macro1[3]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('macro1[4]'): Resources({'node003': {0, 1, 2, 3}}), - - Reference('micro1[0][0]'): Resources({'node001': {0, 1}}), - Reference('micro1[0][1]'): Resources({'node001': {2, 3}}), - Reference('micro1[0][2]'): Resources({'node003': {4, 5}}), - Reference('micro1[0][3]'): Resources({'node003': {6, 7}}), - Reference('micro1[1][0]'): Resources({'node001': {4, 5}}), - Reference('micro1[1][1]'): Resources({'node001': {6, 7}}), - Reference('micro1[1][2]'): Resources({'node004': {0, 1}}), - Reference('micro1[1][3]'): Resources({'node004': {2, 3}}), - Reference('micro1[2][0]'): Resources({'node002': {0, 1}}), - Reference('micro1[2][1]'): Resources({'node002': {2, 3}}), - Reference('micro1[2][2]'): Resources({'node004': {4, 5}}), - Reference('micro1[2][3]'): Resources({'node004': {6, 7}}), - Reference('micro1[3][0]'): Resources({'node002': {4, 5}}), - Reference('micro1[3][1]'): Resources({'node002': {6, 7}}), - Reference('micro1[3][2]'): Resources({'node005': {0, 1}}), - Reference('micro1[3][3]'): Resources({'node005': {2, 3}}), - Reference('micro1[4][0]'): Resources({'node003': {0, 1}}), - Reference('micro1[4][1]'): Resources({'node003': {2, 3}}), - Reference('micro1[4][2]'): Resources({'node005': {4, 5}}), - Reference('micro1[4][3]'): Resources({'node005': {6, 7}}), - - Reference('macro2[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro2[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro2[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('macro2[3]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('macro2[4]'): Resources({'node003': {0, 1, 2, 3}}), - - Reference('micro2[0][0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro2[0][1]'): Resources({'node003': {4, 5, 6, 7}}), - Reference('micro2[1][0]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('micro2[1][1]'): Resources({'node004': {0, 1, 2, 3}}), - Reference('micro2[2][0]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('micro2[2][1]'): Resources({'node004': {4, 5, 6, 7}}), - Reference('micro2[3][0]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('micro2[3][1]'): Resources({'node005': {0, 1, 2, 3}}), - Reference('micro2[4][0]'): Resources({'node003': {0, 1, 2, 3}}), - Reference('micro2[4][1]'): Resources({'node005': {4, 5, 6, 7}}), + Reference('macro1[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('macro1[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('macro1[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('macro1[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), + Reference('macro1[4]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), + + Reference('micro1[0][0]'): Resources({'node001': {c(0), c(1)}}), + Reference('micro1[0][1]'): Resources({'node001': {c(2), c(3)}}), + Reference('micro1[0][2]'): Resources({'node003': {c(4), c(5)}}), + Reference('micro1[0][3]'): Resources({'node003': {c(6), c(7)}}), + Reference('micro1[1][0]'): Resources({'node001': {c(4), c(5)}}), + Reference('micro1[1][1]'): Resources({'node001': {c(6), 
c(7)}}), + Reference('micro1[1][2]'): Resources({'node004': {c(0), c(1)}}), + Reference('micro1[1][3]'): Resources({'node004': {c(2), c(3)}}), + Reference('micro1[2][0]'): Resources({'node002': {c(0), c(1)}}), + Reference('micro1[2][1]'): Resources({'node002': {c(2), c(3)}}), + Reference('micro1[2][2]'): Resources({'node004': {c(4), c(5)}}), + Reference('micro1[2][3]'): Resources({'node004': {c(6), c(7)}}), + Reference('micro1[3][0]'): Resources({'node002': {c(4), c(5)}}), + Reference('micro1[3][1]'): Resources({'node002': {c(6), c(7)}}), + Reference('micro1[3][2]'): Resources({'node005': {c(0), c(1)}}), + Reference('micro1[3][3]'): Resources({'node005': {c(2), c(3)}}), + Reference('micro1[4][0]'): Resources({'node003': {c(0), c(1)}}), + Reference('micro1[4][1]'): Resources({'node003': {c(2), c(3)}}), + Reference('micro1[4][2]'): Resources({'node005': {c(4), c(5)}}), + Reference('micro1[4][3]'): Resources({'node005': {c(6), c(7)}}), + + Reference('macro2[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('macro2[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('macro2[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('macro2[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), + Reference('macro2[4]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), + + Reference('micro2[0][0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), + Reference('micro2[0][1]'): Resources({'node003': {c(4), c(5), c(6), c(7)}}), + Reference('micro2[1][0]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), + Reference('micro2[1][1]'): Resources({'node004': {c(0), c(1), c(2), c(3)}}), + Reference('micro2[2][0]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), + Reference('micro2[2][1]'): Resources({'node004': {c(4), c(5), c(6), c(7)}}), + Reference('micro2[3][0]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), + Reference('micro2[3][1]'): Resources({'node005': {c(0), c(1), c(2), c(3)}}), + Reference('micro2[4][0]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), + Reference('micro2[4][1]'): Resources({'node005': {c(4), c(5), c(6), c(7)}}), } @@ -763,7 +782,7 @@ s14_model, None, s14_implementations, s14_requirements) -s14_resources = Resources({'node001': {0, 1, 2, 3, 4, 5}}) +s14_resources = Resources({'node001': {c(0), c(1), c(2), c(3), c(4), c(5)}}) s14_solution = RuntimeError From 00a1ccef3fa07df3f0bf0dc65a401c8e96c3f83e Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 1 Dec 2024 18:22:38 +0100 Subject: [PATCH 34/49] Fix type typo --- .../python/libmuscle/native_instantiator/native_instantiator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py index bccaabbc..a876a682 100644 --- a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py +++ b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py @@ -253,7 +253,7 @@ def run(self) -> None: for line in traceback.format_exception(*sys.exc_info()): _logger.error(line) - result = CrashResult(sys.exc_info()[1]) + result = CrashedResult(sys.exc_info()[1]) self._resources_out.put(result) self._results_out.put(result) From dfd6b2a01add897add48de7c0d5a7b5a673739e0 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 1 Dec 2024 18:23:22 +0100 Subject: [PATCH 35/49] Fix mypy errors --- libmuscle/python/libmuscle/manager/instantiator.py | 2 +- .../native_instantiator/agent/__main__.py | 14 ++++++++++++++ 2 files 
changed, 15 insertions(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/manager/instantiator.py b/libmuscle/python/libmuscle/manager/instantiator.py index 798482e0..b86f7cbf 100644 --- a/libmuscle/python/libmuscle/manager/instantiator.py +++ b/libmuscle/python/libmuscle/manager/instantiator.py @@ -113,7 +113,7 @@ class CancelAllRequest(InstantiatorRequest): class CrashedResult: """Signals that the instantiator process crashed.""" - def __init__(self, exception: Optional[Exception] = None) -> None: + def __init__(self, exception: Optional[BaseException] = None) -> None: self.exception = exception diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py b/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py index 712da253..35af9c8a 100644 --- a/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py +++ b/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py @@ -108,6 +108,20 @@ def _inspect_resources(self) -> Dict[str, Any]: nhwthreads = psutil.cpu_count(logical=True) ncores = psutil.cpu_count(logical=False) + if nhwthreads is None and ncores is not None: + _logger.warning( + 'Could not determine number of hwthreads, assuming no SMT') + nhwthreads = ncores + elif ncores is None and nhwthreads is not None: + _logger.warning( + 'Could not determine number of cores, assuming no SMT') + ncores = nhwthreads + elif ncores is None and nhwthreads is None: + _logger.warning( + 'Could not determine CPU configuration, assuming a single core') + ncores = 1 + nhwthreads = 1 + hwthreads_per_core = nhwthreads // ncores if ncores * hwthreads_per_core != nhwthreads: From 65f20d8cb9ed836566a19bbea72b18669e55d1bb Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 1 Dec 2024 18:44:48 +0100 Subject: [PATCH 36/49] Fix linter warnings --- integration_test/cluster_test/conftest.py | 3 +-- integration_test/cluster_test/test_cluster.py | 2 +- libmuscle/python/libmuscle/native_instantiator/slurm.py | 6 +++--- .../python/libmuscle/planner/test/test_planner_scenarios.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/integration_test/cluster_test/conftest.py b/integration_test/cluster_test/conftest.py index 721d12dd..51f934b7 100644 --- a/integration_test/cluster_test/conftest.py +++ b/integration_test/cluster_test/conftest.py @@ -181,7 +181,6 @@ def _create_muscle3_venv(remote_term, remote_source): f'/bin/bash -c "{in_venv} python3 -m pip install pip wheel setuptools"')) run_cmd(remote_term, 60, f'/bin/bash -c "{in_venv} pip install {remote_source}"') - return in_venv def _install_muscle3_native_openmpi( @@ -223,7 +222,7 @@ def _install_muscle3_native_openmpi( def _install_muscle3(repo_root, remote_term, remote_fs, slurm_version): remote_source = _install_remote_source(repo_root, remote_term, remote_fs) - in_venv = _create_muscle3_venv(remote_term, remote_source) + _create_muscle3_venv(remote_term, remote_source) return _install_muscle3_native_openmpi( remote_source, remote_term, remote_fs, slurm_version) diff --git a/integration_test/cluster_test/test_cluster.py b/integration_test/cluster_test/test_cluster.py index 57ef408c..51655584 100644 --- a/integration_test/cluster_test/test_cluster.py +++ b/integration_test/cluster_test/test_cluster.py @@ -3,7 +3,7 @@ import pytest from integration_test.cluster_test.conftest import ( - REMOTE_SHARED, run_cmd, ssh_term, skip_unless_cluster) + REMOTE_SHARED, ssh_term, skip_unless_cluster) logger_ = logging.getLogger(__name__) diff --git a/libmuscle/python/libmuscle/native_instantiator/slurm.py 
b/libmuscle/python/libmuscle/native_instantiator/slurm.py index a6286ee0..dc22d23d 100644 --- a/libmuscle/python/libmuscle/native_instantiator/slurm.py +++ b/libmuscle/python/libmuscle/native_instantiator/slurm.py @@ -303,9 +303,9 @@ def agent_launch_command(self, agent_cmd: List[str], nnodes: int) -> List[str]: Args: agent_cmd: A command that will start the agent. """ - # TODO: On the latest Slurm, there's a special command for this that we should use - # if we have that, --external-launcher. Poorly documented though, so will require - # some experimentation. + # TODO: On the latest Slurm, there's a special command for this that we should + # use if we have that, --external-launcher. Poorly documented though, so will + # require some experimentation. # On SLURM <= 23-02, the number of tasks is inherited by srun from sbatch rather # than calculated anew from --nodes and --ntasks-per-node, so we specify it diff --git a/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py b/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py index 6a4a2a95..f1f5b02a 100644 --- a/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py +++ b/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py @@ -555,7 +555,7 @@ def c(hwthread_id: int) -> FrozenSet[int]: s10_resources = Resources({ 'node001': { c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7), - c(8),c(9), c(10), c(11), c(12), c(13), c(14), c(15)}, + c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)}, 'node002': { c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7), c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)}, From 447acd530144697c7eed48ac480a26bde92a6473 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 1 Dec 2024 20:54:05 +0100 Subject: [PATCH 37/49] Use Docker cp to upload to the fake cluster for better speed --- integration_test/cluster_test/conftest.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/integration_test/cluster_test/conftest.py b/integration_test/cluster_test/conftest.py index 51f934b7..3d1df34a 100644 --- a/integration_test/cluster_test/conftest.py +++ b/integration_test/cluster_test/conftest.py @@ -159,16 +159,22 @@ def _start_base_cluster(local_term, idx_slurm_version, shared_dir): return term, fs, headnode_port -def _install_remote_source(repo_root, remote_term, remote_fs): +def _install_remote_source(local_term, repo_root, remote_fs, slurm_version): muscle3_tgt = remote_fs / 'home' / 'cerulean' / 'muscle3' muscle3_tgt.mkdir() - (muscle3_tgt / 'libmuscle').mkdir() + + container = f'headnode-{slurm_version}' for f in ( 'muscle3', 'libmuscle', 'scripts', 'docs', 'setup.py', 'Makefile', 'MANIFEST.in', 'LICENSE', 'NOTICE', 'VERSION', 'README.rst'): - cerulean.copy( - repo_root / f, muscle3_tgt / f, overwrite='always', copy_into=False) + run_cmd(local_term, 60, ( + f'docker cp {repo_root / f} {container}:{muscle3_tgt / f}')) + + # needs to run as root, so not run through remote_term + run_cmd(local_term, 60, ( + f'docker exec {container} /bin/bash -c' + f' "chown -R cerulean:cerulean {muscle3_tgt}"')) return muscle3_tgt @@ -220,8 +226,9 @@ def _install_muscle3_native_openmpi( return prefix, module_name -def _install_muscle3(repo_root, remote_term, remote_fs, slurm_version): - remote_source = _install_remote_source(repo_root, remote_term, remote_fs) +def _install_muscle3(local_term, repo_root, remote_term, remote_fs, slurm_version): + remote_source = _install_remote_source( + local_term, repo_root, remote_fs, slurm_version) 
_create_muscle3_venv(remote_term, remote_source) return _install_muscle3_native_openmpi( remote_source, remote_term, remote_fs, slurm_version) @@ -279,7 +286,7 @@ def installed_cluster( remote_term, remote_fs, headnode_port = _start_base_cluster( local_term, request.param, local_shared_dir) remote_m3_openmpi = _install_muscle3( - repo_root, remote_term, remote_fs, slurm_version) + local_term, repo_root, remote_term, remote_fs, slurm_version) _install_tests(repo_root, remote_term, remote_fs, remote_m3_openmpi) yield headnode_port From 5fd98a77a66339f45253cb4324267eaea3cc667e Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 1 Dec 2024 21:02:24 +0100 Subject: [PATCH 38/49] Fix issues raised by latest mypy --- .../native_instantiator/agent/__main__.py | 22 ++++++++++--------- .../native_instantiator/global_resources.py | 2 +- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py b/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py index 35af9c8a..a47dfca6 100644 --- a/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py +++ b/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py @@ -108,19 +108,21 @@ def _inspect_resources(self) -> Dict[str, Any]: nhwthreads = psutil.cpu_count(logical=True) ncores = psutil.cpu_count(logical=False) - if nhwthreads is None and ncores is not None: - _logger.warning( - 'Could not determine number of hwthreads, assuming no SMT') - nhwthreads = ncores - elif ncores is None and nhwthreads is not None: + if nhwthreads is None: + if ncores is not None: + _logger.warning( + 'Could not determine number of hwthreads, assuming no SMT') + nhwthreads = ncores + else: + _logger.warning( + 'Could not determine CPU configuration, assuming a single' + ' core') + ncores = 1 + nhwthreads = 1 + elif ncores is None: _logger.warning( 'Could not determine number of cores, assuming no SMT') ncores = nhwthreads - elif ncores is None and nhwthreads is None: - _logger.warning( - 'Could not determine CPU configuration, assuming a single core') - ncores = 1 - nhwthreads = 1 hwthreads_per_core = nhwthreads // ncores diff --git a/libmuscle/python/libmuscle/native_instantiator/global_resources.py b/libmuscle/python/libmuscle/native_instantiator/global_resources.py index 4b1e28c7..ce5ab82c 100644 --- a/libmuscle/python/libmuscle/native_instantiator/global_resources.py +++ b/libmuscle/python/libmuscle/native_instantiator/global_resources.py @@ -47,7 +47,7 @@ def __init__(self) -> None: _logger.info('Running locally without a cluster scheduler') self.scheduler = Scheduler.NONE self.nodes = [gethostname()] - self.logical_cpus_per_node = [psutil.cpu_count(logical=True)] + self.logical_cpus_per_node = [psutil.cpu_count(logical=True) or 0] _logger.info( f'We have {self.logical_cpus_per_node[0]} logical CPUS available') From 414a835ec056fd81fe61d145cf7860db83c311e9 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 4 Dec 2024 13:43:09 +0100 Subject: [PATCH 39/49] Add test with two instances on the same cores --- integration_test/cluster_test/conftest.py | 2 +- .../cluster_test/macro_micro.ymmsl | 25 ++++++++++++++ .../cluster_test/macro_micro_openmpi.sh | 12 +++++++ .../cluster_test/macro_micro_srunmpi.sh | 12 +++++++ integration_test/cluster_test/test_cluster.py | 34 +++++++++++++++++++ 5 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 integration_test/cluster_test/macro_micro.ymmsl create mode 100755 integration_test/cluster_test/macro_micro_openmpi.sh create 
mode 100755 integration_test/cluster_test/macro_micro_srunmpi.sh diff --git a/integration_test/cluster_test/conftest.py b/integration_test/cluster_test/conftest.py index 3d1df34a..2350f38a 100644 --- a/integration_test/cluster_test/conftest.py +++ b/integration_test/cluster_test/conftest.py @@ -21,7 +21,7 @@ ])) # Shut down the containers after running the tests. Set to False to debug. -CLEAN_UP_CONTAINERS = True +CLEAN_UP_CONTAINERS = False skip_unless_cluster = pytest.mark.skipif( diff --git a/integration_test/cluster_test/macro_micro.ymmsl b/integration_test/cluster_test/macro_micro.ymmsl new file mode 100644 index 00000000..22cbf8a5 --- /dev/null +++ b/integration_test/cluster_test/macro_micro.ymmsl @@ -0,0 +1,25 @@ +ymmsl_version: v0.1 + +model: + name: macro_micro + components: + c1: + ports: + o_i: inter_out + s: inter_in + implementation: component_cpp + c2: + ports: + f_init: init_in + o_f: final_out + implementation: component_cpp + + conduits: + c1.inter_out: c2.init_in + c2.final_out: c1.inter_in + +resources: + c1: + mpi_processes: 2 + c2: + mpi_processes: 2 diff --git a/integration_test/cluster_test/macro_micro_openmpi.sh b/integration_test/cluster_test/macro_micro_openmpi.sh new file mode 100755 index 00000000..6b7fccb3 --- /dev/null +++ b/integration_test/cluster_test/macro_micro_openmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/macro_micro.ymmsl $CT/settings.ymmsl $CT/implementations_openmpi.ymmsl + diff --git a/integration_test/cluster_test/macro_micro_srunmpi.sh b/integration_test/cluster_test/macro_micro_srunmpi.sh new file mode 100755 index 00000000..a98aca57 --- /dev/null +++ b/integration_test/cluster_test/macro_micro_srunmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/macro_micro.ymmsl $CT/settings.ymmsl $CT/implementations_srunmpi.ymmsl + diff --git a/integration_test/cluster_test/test_cluster.py b/integration_test/cluster_test/test_cluster.py index 51655584..9f0a7156 100644 --- a/integration_test/cluster_test/test_cluster.py +++ b/integration_test/cluster_test/test_cluster.py @@ -215,3 +215,37 @@ def test_double( node, hwthreads, _ = out.split('\n') assert node == f'node-{i + 2}' assert hwthread_to_core(hwthreads) == [rank] + + +@skip_unless_cluster +@pytest.mark.parametrize('mode', ['local', 'slurm']) +@pytest.mark.parametrize('execution_model', ['openmpi', 'srunmpi']) +def test_macro_micro( + fake_cluster, remote_test_files, remote_out_dir, hwthread_to_core, + mode, execution_model): + + if mode == 'local' and execution_model == 'srunmpi': + pytest.skip('srun does not work without slurm') + + sched = _sched(fake_cluster, mode) + + job = _make_mpi_job( + 'macro_micro', mode, execution_model, remote_test_files, remote_out_dir) + if mode == 'slurm': + job.num_nodes = 1 + job.extra_scheduler_options += ' --nodelist=node-4' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + for i in range(1, 3): + for rank in range(2): + out = _get_outfile( + remote_out_dir, 'macro_micro', mode, execution_model, f'c{i}', rank) + if mode == 'local': + assert out.split('\n')[0] == 'headnode' + else: + node, hwthreads, _ = out.split('\n') + assert node == f'node-4' + assert 
hwthread_to_core(hwthreads) == [rank] From 88ceb42ba796b9811b6a1966cf0a00a0b8376fac Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 4 Dec 2024 20:31:44 +0100 Subject: [PATCH 40/49] Enable MPI debug output only if manager log level is debug --- .../native_instantiator/run_script.py | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py index 8be23d3d..a2c3d9b7 100644 --- a/libmuscle/python/libmuscle/native_instantiator/run_script.py +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -1,3 +1,4 @@ +import logging from pathlib import Path from typing import Dict, FrozenSet, List, Optional, Tuple @@ -159,7 +160,7 @@ def num_mpi_tasks(res_req: ResourceRequirements) -> int: raise RuntimeError('Invalid ResourceRequirements') -def local_command(implementation: Implementation) -> str: +def local_command(implementation: Implementation, enable_debug: bool) -> str: """Make a format string for the command to run. This interprets the execution_model and produces an appropriate shell command to @@ -168,6 +169,7 @@ def local_command(implementation: Implementation) -> str: Args: implementation: The implementation to start. + enable_debug: Whether to produce extra debug output. Return: A format string with embedded {ntasks} and {rankfile}. @@ -177,7 +179,18 @@ def local_command(implementation: Implementation) -> str: elif implementation.execution_model == ExecutionModel.OPENMPI: # Native name is orterun for older and prterun for newer OpenMPI. # So we go with mpirun, which works for either. - fstr = 'mpirun -np $MUSCLE_MPI_PROCESSES --oversubscribe {command} {args}' + fargs = [ + 'mpirun -np $MUSCLE_MPI_PROCESSES', + '--oversubscribe' + ] + + if enable_debug: + fargs.append('-v --debug-daemons --display-map --display-allocation') + + fargs.append('{command} {args}') + + fstr = ' '.join(fargs) + elif implementation.execution_model == ExecutionModel.INTELMPI: fstr = 'mpirun -n $MUSCLE_MPI_PROCESSES {command} {args}' elif implementation.execution_model == ExecutionModel.SRUNMPI: @@ -204,7 +217,7 @@ def local_command(implementation: Implementation) -> str: ) -def cluster_command(implementation: Implementation) -> str: +def cluster_command(implementation: Implementation, enable_debug: bool) -> str: """Make a format string for the command to run. This interprets the execution_model and produces an appropriate shell command to @@ -213,11 +226,11 @@ def cluster_command(implementation: Implementation) -> str: Args: implementation: The implementation to start. + enable_debug: Whether to produce extra debug output. Return: A string with the command to use to start the implementation. """ - # TODO: enable debug options iff the manager log level is set to DEBUG # TODO: don't use taskset if it's not available if implementation.execution_model == ExecutionModel.DIRECT: fstr = 'taskset $MUSCLE_BIND_MASK {command} {args}' @@ -225,11 +238,14 @@ def cluster_command(implementation: Implementation) -> str: fargs = [ # Native name is orterun for older and prterun for newer OpenMPI. # So we go with mpirun, which works for either. 
- 'mpirun -v -np $MUSCLE_MPI_PROCESSES', - '-d --debug-daemons', - '--rankfile $MUSCLE_RANKFILE --use-hwthread-cpus --oversubscribe' + 'mpirun -np $MUSCLE_MPI_PROCESSES', + '--rankfile $MUSCLE_RANKFILE --use-hwthread-cpus --bind-to core', + '--oversubscribe' ] + if enable_debug: + fargs.append('-v --debug-daemons --display-map --display-allocation') + if slurm.quirks.overlap: # This adds the given option to the srun command used by mpirun to launch # its daemons. mpirun specifies --exclusive, which on SLURM <= 21-08 causes @@ -238,8 +254,7 @@ def cluster_command(implementation: Implementation) -> str: # overrides the --exclusive and it works. fargs.append('-mca plm_slurm_args "--overlap"') - fargs.extend([ - '--bind-to core --display-map --display-allocation {command} {args}']) + fargs.append('{command} {args}') fstr = ' '.join(fargs) @@ -248,13 +263,15 @@ def cluster_command(implementation: Implementation) -> str: 'mpirun -n $MUSCLE_MPI_PROCESSES -machinefile $MUSCLE_RANKFILE' ' {command} {args}') elif implementation.execution_model == ExecutionModel.SRUNMPI: - # TODO: set SLURM_CPU_BIND_VERBOSE for verbose output fargs = ['srun -n $MUSCLE_MPI_PROCESSES -m arbitrary'] if slurm.quirks.overlap: fargs.append('--overlap') - fargs.append(f'{slurm.quirks.cpu_bind}=$SLURM_CPU_BIND {{command}} {{args}}') + verbose = 'verbose,' if enable_debug else '' + + fargs.append(f'{slurm.quirks.cpu_bind}={verbose}$SLURM_CPU_BIND') + fargs.append('{command} {args}') fstr = ' '.join(fargs) @@ -288,6 +305,8 @@ def make_script( Return: A string with embedded newlines containing the shell script. """ + enable_debug = logging.getLogger('libmuscle').getEffectiveLevel() <= logging.DEBUG + lines: List[str] = list() lines.append('#!/bin/bash') @@ -309,9 +328,9 @@ def make_script( lines.append('') if local: - lines.append(local_command(implementation)) + lines.append(local_command(implementation, enable_debug)) else: - lines.append(cluster_command(implementation)) + lines.append(cluster_command(implementation, enable_debug)) lines.append('') From e19e10e2f1910c3645a2015bd071e2e6d2345bcd Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 4 Dec 2024 20:36:27 +0100 Subject: [PATCH 41/49] Fix linter warning --- integration_test/cluster_test/test_cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_test/cluster_test/test_cluster.py b/integration_test/cluster_test/test_cluster.py index 9f0a7156..d8a52c67 100644 --- a/integration_test/cluster_test/test_cluster.py +++ b/integration_test/cluster_test/test_cluster.py @@ -247,5 +247,5 @@ def test_macro_micro( assert out.split('\n')[0] == 'headnode' else: node, hwthreads, _ = out.split('\n') - assert node == f'node-4' + assert node == 'node-4' assert hwthread_to_core(hwthreads) == [rank] From 7d363b02239806aaafa0933c1536380e09a3b2d1 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 4 Dec 2024 22:18:16 +0100 Subject: [PATCH 42/49] Don't bind (but also don't crash) if taskset doesn't exist --- .../libmuscle/native_instantiator/run_script.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py index a2c3d9b7..124b0897 100644 --- a/libmuscle/python/libmuscle/native_instantiator/run_script.py +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -231,9 +231,16 @@ def cluster_command(implementation: Implementation, enable_debug: bool) -> str: Return: A string with the command to 
use to start the implementation. """ - # TODO: don't use taskset if it's not available if implementation.execution_model == ExecutionModel.DIRECT: - fstr = 'taskset $MUSCLE_BIND_MASK {command} {args}' + fargs = [ + 'if ! taskset -V >/dev/null 2>&1 ; then', + ' {command} {args}', + 'else', + ' taskset $MUSCLE_BIND_MASK {command} {args}', + 'fi' + ] + fstr = '\n'.join(fargs) + elif implementation.execution_model == ExecutionModel.OPENMPI: fargs = [ # Native name is orterun for older and prterun for newer OpenMPI. From 17c51d14e8eb236c70c441f767295fe193004730 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 8 Dec 2024 23:23:29 +0100 Subject: [PATCH 43/49] Add SLURM 24-05 and 24-11 --- integration_test/cluster_test/conftest.py | 4 ++-- integration_test/fake_cluster/Dockerfile | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/integration_test/cluster_test/conftest.py b/integration_test/cluster_test/conftest.py index 2350f38a..97b7b255 100644 --- a/integration_test/cluster_test/conftest.py +++ b/integration_test/cluster_test/conftest.py @@ -17,11 +17,11 @@ IDX_SLURM_VERSIONS = list(enumerate([ '17-02', '17-11', '18-08', '19-05', '20-02', '20-11', '21-08', '22-05', '23-02', - '23-11' + '23-11', '24-05', '24-11' ])) # Shut down the containers after running the tests. Set to False to debug. -CLEAN_UP_CONTAINERS = False +CLEAN_UP_CONTAINERS = True skip_unless_cluster = pytest.mark.skipif( diff --git a/integration_test/fake_cluster/Dockerfile b/integration_test/fake_cluster/Dockerfile index 16561062..25a85ebe 100644 --- a/integration_test/fake_cluster/Dockerfile +++ b/integration_test/fake_cluster/Dockerfile @@ -31,6 +31,18 @@ RUN . /opt/spack/share/spack/setup-env.sh && \ ^$(spack find --deps slurm@23-11 | grep pmix | tr -d ' ') \ ^$(spack find --format "slurm/{hash}" slurm@23-11) +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@4.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@24-05 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@24-05) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@4.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@24-11 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@24-11) + # RUN . /opt/spack/share/spack/setup-env.sh && \ # . $(spack location -i lmod)/lmod/lmod/init/bash && \ # spack install mpich+slurm pmi=pmix ^pmix@3.2.3 From c27239eb28371ae96d1bfbbafaf30099b96e58fc Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 8 Dec 2024 23:23:49 +0100 Subject: [PATCH 44/49] Improve mpirun command --- libmuscle/python/libmuscle/native_instantiator/run_script.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py index 124b0897..c3aa2bfc 100644 --- a/libmuscle/python/libmuscle/native_instantiator/run_script.py +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -246,12 +246,12 @@ def cluster_command(implementation: Implementation, enable_debug: bool) -> str: # Native name is orterun for older and prterun for newer OpenMPI. # So we go with mpirun, which works for either. 
'mpirun -np $MUSCLE_MPI_PROCESSES', - '--rankfile $MUSCLE_RANKFILE --use-hwthread-cpus --bind-to core', + '--rankfile $MUSCLE_RANKFILE --use-hwthread-cpus --bind-to hwthread', '--oversubscribe' ] if enable_debug: - fargs.append('-v --debug-daemons --display-map --display-allocation') + fargs.append('-v --display-allocation --display-map --report-bindings') if slurm.quirks.overlap: # This adds the given option to the srun command used by mpirun to launch From 0a67cc614fe66e9059412e6f7f047ab92cfcd424 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 29 Dec 2024 20:48:41 +0100 Subject: [PATCH 45/49] Switch to class-based resource representation --- .../libmuscle/manager/instance_manager.py | 7 +- .../python/libmuscle/manager/instantiator.py | 8 +- .../python/libmuscle/manager/profile_store.py | 10 +- .../libmuscle/manager/qcgpj_instantiator.py | 38 +- .../manager/test/test_profile_database.py | 16 +- .../native_instantiator/agent/__main__.py | 57 +- .../native_instantiator/agent/map_client.py | 23 +- .../native_instantiator/agent_manager.py | 34 +- .../native_instantiator/iagent_manager.py | 8 +- .../native_instantiator/map_server.py | 28 +- .../native_instantiator.py | 252 +------ .../native_instantiator/run_script.py | 59 +- libmuscle/python/libmuscle/planner/planner.py | 271 ++++---- .../python/libmuscle/planner/resources.py | 647 ++++++++++++++++++ .../libmuscle/planner/test/test_planner.py | 238 +++---- .../planner/test/test_planner_scenarios.py | 437 ++++++------ .../libmuscle/planner/test/test_resources.py | 435 ++++++++++++ libmuscle/python/libmuscle/test/conftest.py | 24 +- muscle3/muscle3.py | 9 +- 19 files changed, 1759 insertions(+), 842 deletions(-) create mode 100644 libmuscle/python/libmuscle/planner/resources.py create mode 100644 libmuscle/python/libmuscle/planner/test/test_resources.py diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py index 9d7cf90d..23980903 100644 --- a/libmuscle/python/libmuscle/manager/instance_manager.py +++ b/libmuscle/python/libmuscle/manager/instance_manager.py @@ -17,7 +17,8 @@ # from libmuscle.manager.qcgpj_instantiator import QCGPJInstantiator from libmuscle.manager.run_dir import RunDir from libmuscle.native_instantiator.native_instantiator import NativeInstantiator -from libmuscle.planner.planner import Planner, Resources +from libmuscle.planner.planner import Planner, ResourceAssignment +from libmuscle.planner.resources import Resources _logger = logging.getLogger(__name__) @@ -94,7 +95,7 @@ def __init__( self._log_handler = LogHandlingThread(self._log_records_in) self._log_handler.start() - self._allocations: Optional[Dict[Reference, Resources]] = None + self._allocations: Optional[Dict[Reference, ResourceAssignment]] = None resources = self._resources_in.get() _logger.debug(f'Got resources {resources}') @@ -150,7 +151,7 @@ def start_all(self) -> None: self._requests_out.put(request) self._num_running += 1 - def get_resources(self) -> Dict[Reference, Resources]: + def get_resources(self) -> Dict[Reference, ResourceAssignment]: """Returns the resources allocated to each instance. 
Only call this after start_all() has been called, or it will raise diff --git a/libmuscle/python/libmuscle/manager/instantiator.py b/libmuscle/python/libmuscle/manager/instantiator.py index b86f7cbf..e29e48c2 100644 --- a/libmuscle/python/libmuscle/manager/instantiator.py +++ b/libmuscle/python/libmuscle/manager/instantiator.py @@ -8,7 +8,7 @@ from ymmsl import Implementation, Reference, ResourceRequirements -from libmuscle.planner.planner import Resources +from libmuscle.planner.planner import ResourceAssignment class ProcessStatus(enum.Enum): @@ -40,7 +40,7 @@ class Process: exit_code: Exit code, if status is ERROR error_msg: Error message, if status is ERROR """ - def __init__(self, instance: Reference, resources: Resources) -> None: + def __init__(self, instance: Reference, resources: ResourceAssignment) -> None: """Create a Process object. Args: @@ -81,8 +81,8 @@ class InstantiationRequest(InstantiatorRequest): """ def __init__( self, instance: Reference, implementation: Implementation, - res_req: ResourceRequirements, resources: Resources, instance_dir: - Path, work_dir: Path, stdout_path: Path, stderr_path: Path + res_req: ResourceRequirements, resources: ResourceAssignment, + instance_dir: Path, work_dir: Path, stdout_path: Path, stderr_path: Path ) -> None: """Create an InstantiationRequest. diff --git a/libmuscle/python/libmuscle/manager/profile_store.py b/libmuscle/python/libmuscle/manager/profile_store.py index 0fba694e..3ee262b8 100644 --- a/libmuscle/python/libmuscle/manager/profile_store.py +++ b/libmuscle/python/libmuscle/manager/profile_store.py @@ -5,7 +5,7 @@ from threading import Thread from typing import cast, Dict, Iterable, List, Optional, Tuple -from libmuscle.planner.planner import Resources +from libmuscle.planner.planner import ResourceAssignment from libmuscle.profiling import ProfileEvent, ProfileEventType from libmuscle.manager.profile_database import ProfileDatabase from ymmsl import Operator, Reference @@ -77,7 +77,7 @@ def store_instances( cur.execute("COMMIT") cur.close() - def store_resources(self, resources: Dict[Reference, Resources]) -> None: + def store_resources(self, resources: Dict[Reference, ResourceAssignment]) -> None: """Store resource assignments into the database. 
Args: @@ -90,9 +90,9 @@ def store_resources(self, resources: Dict[Reference, Resources]) -> None: instance_oid = self._get_instance_oid(cur, instance_id) tuples = [ - (instance_oid, node, hwthread) - for node, cores in res.cores.items() - for core in cores for hwthread in core] + (instance_oid, node.node_name, core.cid) + for node in res.as_resources() + for core in node.cpu_cores] cur.executemany( "INSERT INTO assigned_cores (instance_oid, node, core)" diff --git a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py index 9130779f..f54e96e2 100644 --- a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py +++ b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py @@ -28,7 +28,7 @@ from libmuscle.manager.instantiator import ( CancelAllRequest, CrashedResult, create_instance_env, InstantiationRequest, Process, ProcessStatus, reconfigure_logging, ShutdownRequest) -from libmuscle.planner.planner import Resources +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources, Resources _logger = logging.getLogger(__name__) @@ -198,10 +198,13 @@ async def _main(self) -> None: def _send_resources(self) -> None: """Converts and sends QCG available resources.""" - resources = Resources() + resources = Resources([]) for node in self._qcg_resources.nodes: - resources.cores[node.name] = { - frozenset(n.split(',')) for n in node.free_ids} + cs = CoreSet([ + Core(cid, set(map(int, hwthreads_str.split(',')))) + for cid, hwthreads_str in enumerate(node.free_ids)]) + nr = OnNodeResources(node.name, cs) + resources.add_node(nr) self._resources_out.put(resources) @@ -237,7 +240,8 @@ def _create_job( qcg_resources_type: qcg_ResourcesType ) -> Tuple[qcg_Allocation, qcg_SchedulingIteration]: """Creates a QCG allocation and job for a request.""" - total_cores = sum(map(len, request.resources.cores.values())) + total_cores = sum([ + nres.total_cores() for nres in request.resources.by_rank]) env = create_instance_env(request.instance, request.implementation.env) @@ -255,10 +259,13 @@ def _create_job( resources=resources) qcg_allocation = qcg_Allocation() - for node_name, cores in request.resources.cores.items(): - qcg_cores = [str(i) for i in cores] + res = request.resources.as_resources() + for node in res: + qcg_cores = [ + ','.join(map(str, core.hwthreads)) + for core in node.cpu_cores] qcg_allocation.add_node( - qcg_NodeAllocation(qcg_Node(node_name), qcg_cores, {})) + qcg_NodeAllocation(qcg_Node(node.node_name), qcg_cores, {})) sjob = qcg_SchedulingJob(self._state_tracker, qcg_job) qcg_iteration = qcg_SchedulingIteration(sjob, None, None, resources, []) @@ -284,16 +291,19 @@ def _qcg_job_execution_with_script( rank_file = request.instance_dir / 'rankfile' with rank_file.open('w') as f: i = 0 - for node, cores in request.resources.cores.items(): - for c in sorted(cores): - f.write(f'rank {i}={node} slot={c}\n') + res = request.resources.as_resources() + for node in res: + for cid in sorted([c.cid for c in node.cpu_cores]): + f.write(f'rank {i}={node.node_name} slot={cid}\n') i += 1 env['MUSCLE_OPENMPI_RANK_FILE'] = str(rank_file) # IntelMPI support mpi_res_args = list() - for node, cores in request.resources.cores.items(): - mpi_res_args.extend(['-host', node, '-n', str(len(cores))]) + res = request.resources.as_resources() + for node in res: + mpi_res_args.extend([ + '-host', node.node_name, '-n', str(node.total_cores())]) env['MUSCLE_INTELMPI_RESOURCES'] = ' '.join(mpi_res_args) # General environment @@ -315,7 +325,7 @@ def 
_qcg_job_execution_normal( qcg_resources_type: qcg_ResourcesType) -> qcg_JobExecution: """Create a JobExecution for a normal description.""" impl = request.implementation - total_cores = sum(map(len, request.resources.cores.values())) + total_cores = request.resources.as_resources().total_cores() if impl.execution_model == ExecutionModel.DIRECT: env['OMP_NUM_THREADS'] = str(total_cores) diff --git a/libmuscle/python/libmuscle/manager/test/test_profile_database.py b/libmuscle/python/libmuscle/manager/test/test_profile_database.py index 33bbb9dd..b72c964a 100644 --- a/libmuscle/python/libmuscle/manager/test/test_profile_database.py +++ b/libmuscle/python/libmuscle/manager/test/test_profile_database.py @@ -2,13 +2,13 @@ from libmuscle.manager.profile_database import ProfileDatabase from libmuscle.manager.profile_store import ProfileStore -from libmuscle.planner.planner import Resources +from libmuscle.planner.planner import ResourceAssignment from libmuscle.profiling import ( ProfileEvent, ProfileEventType, ProfileTimestamp) -from ymmsl import Operator, Port, Reference +from libmuscle.test.conftest import on_node_resources as onr -from libmuscle.test.conftest import frozenset_of as s +from ymmsl import Operator, Port, Reference import pytest @@ -23,13 +23,11 @@ def db_file(tmp_path) -> Path: store.store_instances([Reference('instance1'), Reference('instance2')]) - resources1 = Resources({ - 'node001': {s(0), s(1)}, - 'node002': {s(0), s(1)}}) + resources1 = ResourceAssignment([ + onr('node001', {0, 1}), onr('node002', {0, 1})]) - resources2 = Resources({ - 'node001': {s(0)}, - 'node002': {s(0), s(1), s(2)}}) + resources2 = ResourceAssignment([ + onr('node001', {0}), onr('node002', {0, 1, 2})]) store.store_resources({ Reference('instance1'): resources1, diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py b/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py index a47dfca6..a85f2096 100644 --- a/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py +++ b/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py @@ -1,16 +1,16 @@ -from itertools import groupby import logging import os import psutil from socket import gethostname import sys from time import sleep -from typing import Any, Dict, Set +from typing import Dict, Set from libmuscle.native_instantiator.process_manager import ProcessManager from libmuscle.native_instantiator.agent.map_client import MAPClient from libmuscle.native_instantiator.agent.agent_commands import ( CancelAllCommand, ShutdownCommand, StartCommand) +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources _logger = logging.getLogger(__name__) @@ -18,21 +18,21 @@ class Agent: """Runs on a compute node and starts processes there.""" - def __init__(self, node_id: str, server_location: str) -> None: + def __init__(self, node_name: str, server_location: str) -> None: """Create an Agent. 
Args: - node_id: Id (hostname) of this node + node_name: Name (hostname) of this node server_location: MAP server of the manager to connect to """ - _logger.info(f'Agent at {node_id} starting') + _logger.info(f'Agent at {node_name} starting') self._process_manager = ProcessManager() - self._node_id = node_id + self._node_name = node_name _logger.info(f'Connecting to manager at {server_location}') - self._server = MAPClient(self._node_id, server_location) + self._server = MAPClient(self._node_name, server_location) _logger.info('Connected to manager') def run(self) -> None: @@ -68,17 +68,13 @@ def run(self) -> None: sleep(0.1) - def _inspect_resources(self) -> Dict[str, Any]: + def _inspect_resources(self) -> OnNodeResources: """Inspect the node to find resources and report on them. - The only resource type for now is 'cpu'. The returned dict will have that key - mapping to a list of sets of logical hwthread ids, with each set designating - a set of hwthreads that share a core. - The terminology for identifying processors gets very convoluted, with Linux, Slurm, OpenMPI and IntelMPI all using different terms, or sometimes the same - terms for different things. See the comment in native_instantiator.py for what - is what and how we use it. + terms for different things. See the comment in planner/resources.py for what is + what and how we use it. Returns: A dict mapping resource types to resource descriptions. @@ -95,8 +91,9 @@ def _inspect_resources(self) -> Dict[str, Any]: core_id = int(f.read()) hwthreads_by_core.setdefault(core_id, set()).add(i) - cpu_resources = sorted( - map(frozenset, hwthreads_by_core.values()), key=sorted) + cores = CoreSet(( + Core(core_id, hwthreads) + for core_id, hwthreads in hwthreads_by_core.items())) else: # MacOS doesn't support thread affinity, but older Macs with Intel @@ -138,22 +135,26 @@ def _inspect_resources(self) -> Dict[str, Any]: ' still appreciate an issue, because it is unexpected for sure.' 
) - hwthread_ids = list(range(nhwthreads)) - cpu_resources = [ - frozenset(g) - for _, g in groupby( - hwthread_ids, lambda i: i // hwthreads_per_core)] + cores = CoreSet(( + Core( + cid, + set(range( + cid * hwthreads_per_core, (cid + 1) * hwthreads_per_core)) + ) + for cid in range(ncores) + )) - _logger.info(f'Found CPU resources: {cpu_resources}') - return {'cpu': cpu_resources} + resources = OnNodeResources(self._node_name, cores) + _logger.info(f'Found resources: {resources}') + return resources -def configure_logging(node_id: str, log_level: int) -> None: +def configure_logging(node_name: str, log_level: int) -> None: """Make us output logs to a custom log file.""" fmt = '%(asctime)s %(levelname)s %(message)s' formatter = logging.Formatter(fmt) - handler = logging.FileHandler(f'muscle3_agent_{node_id}.log', mode='w') + handler = logging.FileHandler(f'muscle3_agent_{node_name}.log', mode='w') handler.setFormatter(formatter) # Find and remove default handler to disable automatic console output @@ -170,11 +171,11 @@ def configure_logging(node_id: str, log_level: int) -> None: if __name__ == '__main__': - node_id = gethostname() + node_name = gethostname() server_location = sys.argv[1] log_level = int(sys.argv[2]) - configure_logging(node_id, log_level) + configure_logging(node_name, log_level) - agent = Agent(node_id, server_location) + agent = Agent(node_name, server_location) agent.run() diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/map_client.py b/libmuscle/python/libmuscle/native_instantiator/agent/map_client.py index d360b0a5..e402b29f 100644 --- a/libmuscle/python/libmuscle/native_instantiator/agent/map_client.py +++ b/libmuscle/python/libmuscle/native_instantiator/agent/map_client.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, List, Optional, Tuple import msgpack @@ -7,6 +7,7 @@ from libmuscle.mcp.tcp_transport_client import TcpTransportClient from libmuscle.native_instantiator.agent.agent_commands import ( AgentCommand, StartCommand, CancelAllCommand, ShutdownCommand) +from libmuscle.planner.resources import OnNodeResources class MAPClient: @@ -14,14 +15,14 @@ class MAPClient: This class connects to the AgentManager and communicates with it. """ - def __init__(self, node_id: str, location: str) -> None: + def __init__(self, node_name: str, location: str) -> None: """Create a MAPClient Args: - node_id: Id of the local node + node_name: Name (hostname) of the local node location: A connection string of the form hostname:port """ - self._node_id = node_id + self._node_name = node_name self._transport_client = TcpTransportClient(location) def close(self) -> None: @@ -31,20 +32,16 @@ def close(self) -> None: """ self._transport_client.close() - def report_resources(self, resources: Dict[str, Any]) -> None: + def report_resources(self, resources: OnNodeResources) -> None: """Report local resources - The only key in the dict is currently 'cpu', and it maps to a list of frozensets - of hwthread ids that we can bind to with taskset or in a rankfile. 
- Args: - resources: Available resource ids by type + resources: Description of the resources on this node """ - enc_cpu_resources = [ - list(hwthreads) for hwthreads in resources['cpu']] + enc_cpu_resources = [[c.cid] + list(c.hwthreads) for c in resources.cpu_cores] request = [ RequestType.REPORT_RESOURCES.value, - self._node_id, {'cpu': enc_cpu_resources}] + resources.node_name, {'cpu': enc_cpu_resources}] self._call_agent_manager(request) def get_command(self) -> Optional[AgentCommand]: @@ -53,7 +50,7 @@ def get_command(self) -> Optional[AgentCommand]: Returns: A command, or None if there are no commands pending. """ - request = [RequestType.GET_COMMAND.value, self._node_id] + request = [RequestType.GET_COMMAND.value, self._node_name] response = self._call_agent_manager(request) if response[0] == ResponseType.PENDING.value: diff --git a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py index 39d9a648..37883749 100644 --- a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py +++ b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py @@ -4,13 +4,14 @@ import sys from threading import Lock from time import sleep -from typing import Any, Dict, FrozenSet, List, Tuple +from typing import Dict, List, Tuple from libmuscle.native_instantiator.agent.agent_commands import ( CancelAllCommand, StartCommand, ShutdownCommand) from libmuscle.native_instantiator.iagent_manager import IAgentManager from libmuscle.native_instantiator.map_server import MAPServer from libmuscle.native_instantiator.global_resources import global_resources +from libmuscle.planner.resources import OnNodeResources, Resources _logger = logging.getLogger(__name__) @@ -38,7 +39,7 @@ def __init__(self, agent_dir: Path) -> None: agent_dir: Directory in which agents can write log files. """ self._nodes: List[str] = list() - self._resources: Dict[str, Dict[str, Any]] = dict() + self._resources: Resources = Resources([]) self._resources_lock = Lock() # protects _nodes and _resources self._finished_processes: List[Tuple[str, int]] = list() @@ -47,7 +48,7 @@ def __init__(self, agent_dir: Path) -> None: self._server = MAPServer(self) self._launch_agents(agent_dir, self._server.get_location()) - def get_resources(self) -> Dict[str, List[FrozenSet[int]]]: + def get_resources(self) -> Resources: """Return detected resources. This returns a list of sets of logical hwthread ids per core, per node. @@ -55,10 +56,10 @@ def get_resources(self) -> Dict[str, List[FrozenSet[int]]]: Called by NativeInstantiator. """ # no need to lock, _resources is already in its final state - return {node_id: res['cpu'] for node_id, res in self._resources.items()} + return self._resources def start( - self, node_id: str, name: str, work_dir: Path, args: List[str], + self, node_name: str, name: str, work_dir: Path, args: List[str], env: Dict[str, str], stdout: Path, stderr: Path) -> None: """Start a process on a node. @@ -66,7 +67,7 @@ def start( exist. 
Args: - node_id: Id of the node to run the process on + node_name: Name of the node to run the process on name: Name under which this process will be known work_dir: Working directory in which to start args: Executable and arguments to run @@ -75,7 +76,7 @@ def start( stderr: File to redirect stderr to """ command = StartCommand(name, work_dir, args, env, stdout, stderr) - self._server.deposit_command(node_id, command) + self._server.deposit_command(node_name, command) def cancel_all(self) -> None: """Cancel all processes. @@ -84,8 +85,8 @@ def cancel_all(self) -> None: Called by NativeInstantiator. """ - for node_id in self._nodes: - self._server.deposit_command(node_id, CancelAllCommand()) + for node_name in self._nodes: + self._server.deposit_command(node_name, CancelAllCommand()) def get_finished(self) -> List[Tuple[str, int]]: """Returns names and exit codes of finished processes. @@ -105,8 +106,8 @@ def get_finished(self) -> List[Tuple[str, int]]: def shutdown(self) -> None: """Shut down the manager and its agents.""" command = ShutdownCommand() - for node_id in self._nodes: - self._server.deposit_command(node_id, command) + for node_name in self._nodes: + self._server.deposit_command(node_name, command) try: self._agents_process.wait(60) @@ -124,19 +125,18 @@ def shutdown(self) -> None: self._server.stop() - def report_resources(self, node_id: str, resources: Dict[str, Any]) -> None: + def report_resources(self, resources: OnNodeResources) -> None: """Report resources found on a node. Called by MAPServer from a server thread. Args: - node_id: Id of the node these resources are on - resources: Dict mapping resource type to resource ids + resources: Description of a node's resources """ - _logger.debug(f'Agent on {node_id} reported {resources}') + _logger.debug(f'Agent reported {resources}') with self._resources_lock: - self._nodes.append(node_id) - self._resources[node_id] = resources + self._nodes.append(resources.node_name) + self._resources.add_node(resources) def report_result(self, names_exit_codes: List[Tuple[str, int]]) -> None: """Report results of finished processes. diff --git a/libmuscle/python/libmuscle/native_instantiator/iagent_manager.py b/libmuscle/python/libmuscle/native_instantiator/iagent_manager.py index 93d063f8..badf6a46 100644 --- a/libmuscle/python/libmuscle/native_instantiator/iagent_manager.py +++ b/libmuscle/python/libmuscle/native_instantiator/iagent_manager.py @@ -1,4 +1,6 @@ -from typing import Any, Dict, List, Tuple +from typing import List, Tuple + +from libmuscle.planner.resources import OnNodeResources class IAgentManager: @@ -7,13 +9,13 @@ class IAgentManager: Only implemented by AgentManager, and only exists to avoid a circular dependency between AgentManager, MAPServer, and MAPRequestHandler. Ugh. """ - def report_resources(self, node_id: str, resources: Dict[str, Any]) -> None: + def report_resources(self, resources: OnNodeResources) -> None: """Report resources found on a node. Called by MAPServer from a server thread. 
Args: - node_id: Id of the node these resources are on + node_name: Id of the node these resources are on resources: Dict mapping resource type to resource ids """ raise NotImplementedError() diff --git a/libmuscle/python/libmuscle/native_instantiator/map_server.py b/libmuscle/python/libmuscle/native_instantiator/map_server.py index 6ab847c0..87c3f5ca 100644 --- a/libmuscle/python/libmuscle/native_instantiator/map_server.py +++ b/libmuscle/python/libmuscle/native_instantiator/map_server.py @@ -10,6 +10,7 @@ from libmuscle.native_instantiator.agent.agent_commands import ( AgentCommand, CancelAllCommand, ShutdownCommand, StartCommand) from libmuscle.native_instantiator.iagent_manager import IAgentManager +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources from libmuscle.post_office import PostOffice from ymmsl import Reference @@ -52,22 +53,25 @@ def handle_request(self, request: bytes) -> bytes: return cast(bytes, msgpack.packb(response, use_bin_type=True)) def _report_resources( - self, node_id: str, resources: Dict[str, Any]) -> Any: + self, node_name: str, data: Dict[str, Any]) -> Any: """Handle a report resources request. This is used by the agent to report available resources on its node when it starts up. Args: - node_id: Hostname (id) of the node - resources: Resource dictionary, containing a single key 'cpu' which - maps to a list of lists of hwthread ids representing cores. + node_name: Name (hostname) of the node + data: Resource dictionary, containing a single key 'cpu' which maps to a + list of cores, where each core is a list of ints, starting with the core + id at index [0] followed by the hwthread ids of all hwthreads in this + core. """ - dec_cpu_resources = [frozenset(hwthreads) for hwthreads in resources['cpu']] - self._agent_manager.report_resources(node_id, {'cpu': dec_cpu_resources}) + cores = CoreSet((Core(ids[0], set(ids[1:])) for ids in data['cpu'])) + node_resources = OnNodeResources(node_name, cores) + self._agent_manager.report_resources(node_resources) return [ResponseType.SUCCESS.value] - def _get_command(self, node_id: str) -> Any: + def _get_command(self, node_name: str) -> Any: """Handle a get command request. This is used by the agent to ask if there's anything we would like it to do. @@ -78,9 +82,9 @@ def _get_command(self, node_id: str) -> Any: do). Args: - node_id: Hostname (id) of the agent's node + node_name: Hostname (name) of the agent's node """ - node_ref = Reference(node_id.replace('-', '_')) + node_ref = Reference(node_name.replace('-', '_')) next_request: Optional[bytes] = None if self._post_office.have_message(node_ref): next_request = self._post_office.get_message(node_ref) @@ -145,17 +149,17 @@ def stop(self) -> None: """ self._server.close() - def deposit_command(self, node_id: str, command: AgentCommand) -> None: + def deposit_command(self, node_name: str, command: AgentCommand) -> None: """Deposit a command for the given agent. This takes the given command and queues it for the given agent to pick up next time it asks us for one. 
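The 'cpu' entry of the report-resources message encodes each core as a flat list starting with the core id and followed by its hwthread ids. A minimal round-trip sketch of that encoding (illustrative only, with made-up ids), mirroring the agent-side encoding and the MAPServer._report_resources decoding shown above:

```
from libmuscle.planner.resources import Core, CoreSet, OnNodeResources

node_res = OnNodeResources('node001', CoreSet([Core(0, {0, 4}), Core(1, {1, 5})]))

# Agent-side encoding: [core id, hwthread id, hwthread id, ...] per core
enc_cpu = [[c.cid] + list(c.hwthreads) for c in node_res.cpu_cores]
# e.g. [[0, 0, 4], [1, 1, 5]]

# Manager-side decoding, as in MAPServer._report_resources
decoded = OnNodeResources('node001', CoreSet(
        Core(ids[0], set(ids[1:])) for ids in enc_cpu))

assert decoded == node_res
```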
Args: - node_id: Id of the node whose agent should execute the command + node_name: Name of the node whose agent should execute the command command: The command to send """ - agent = Reference(node_id.replace('-', '_')) + agent = Reference(node_name.replace('-', '_')) if isinstance(command, StartCommand): command_obj = [ diff --git a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py index a876a682..5c3ecd95 100644 --- a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py +++ b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py @@ -1,191 +1,3 @@ -"""Module for examining resources and instantiating instances on them - -There's a huge comment here because there's a big mess here that took me forever to -figure out, so now I'm going to document it for the future. - - -Identifying hardware resources - -Today's computers all contain multi-core CPUs, often with symmetric multithreading -(SMT), also known as hyperthreading. This means that we have hardware threads -(hwthreads) and also cores, and then there's caches and memory as well but we're not -going into NUMA here. - -Cores and hwthreads are identified by number, but they have multiple different numbers -that are referred to by different names in different contexts, making everything very -confusing. So here are some definitions to disambiguate things. Note that this is still -a rather simplified representation, but it's enough for what we're doing here in -MUSCLE3. - - -Hardware threads - -A *hardware thread (hwthread)* is, at the hardware level, an instruction decoder. It -points to wherever in the code we are currently executing, and it can read the next -couple of instructions and figure out how to execute them. It can't actually execute -anything however, because it doesn't have the hardware that does that. - -Intel refers to hwthreads as "logical processors" and so does Linux, hwloc calls them -"processing units" or PUs and so does OpenMPI unless it uses the term hwthread just to -confuse things a bit more. - -Cores - -A *core* contains at least one hwthread, and at least one functional unit, which is a -hardware component that actually does calculations and other data processing. Within a -core, the hwthread(s) read instructions and pass them to the functional units to be -executed. If a core has more than one hwthread, then the CPU supports SMT. - -Intel refers to cores as "physical processors", hwloc calls them cores and so do most -other sources. We'll use cores here. - -Since a hwthread cannot do anything on its own, it's always part of a core. - -CPUs - -The term CPU is used in many ways by various bits of documentation, sometimes referring -to a hwthread or a core, but here we'll take it to mean a collection of cores in a -plastic box. Similar terms are *package* (referring to that plastic box with very many -metal pins) and *socket* (the thing the package mounts into), or *processor*, which was -originally used to refer to all of the above when CPUs still had only one core with only -one hwthread, and has now become ambiguous. - -Weird things can happen here, I've seen CPUs that as far as I can tell are a single -package, but nevertheless claim to have two sockets. I suspect that that's two physical -chips in a single plastic box, but I don't know for sure. - -Here, we're concerned with hwthreads and cores and how to identify them and assign -instances to them. 
- - -Linux - -On modern operating systems, hardware access is mediated by the operating system, and -we're mainly concerned with Linux here because that is what all the clusters are running -(see the note on macOS below). Information about the CPU(s) can be obtained on Linux -from the /proc/cpuinfo file, or equivalently but more modernly, from the files in -/sys/devices/system/cpu/cpu/topology/. - -Linux collects information about processors because it needs to run processes (programs, -software threads) on them on behalf of the user. Processes are assigned to hwthreads, so -that is what Linux considers a *processor*. /proc/cpuinfo lists all these processors, -and they each have their own directory /sys/devices/system/cpu/cpu. - -On Linux, processors have an id, which is that number in the directory, and is -listed under "processor" in /proc/cpuinfo. Since this number identifies a hwthread and -is assigned by Linux rather than being baked into the hardware, I'm calling it a -"logical hwthread id", this being a logical id of a hwthread, not an id of a logical -hwthread. It's also the id of a logical processor in Intel-speak. - -Hwthreads actually have a second number associated with them, which does come from the -hardware. In /proc/cpuinfo, that's listed under "apicid"; it doesn't seem to be -available from sysfs. Hwloc call this the "physical PU (its name for a hwthread) id", -and OpenMPI's mpirun manpage also refers to it as a "physical processor location". - -There's great potential for confusion here: the "physical PU id" and "physical processor -location" both identify a hardware-specified number (a physical id or a physical -location) for a hwthread. This is something completely different than what Intel calls a -"physical processor", which they use to refer to a core. - -MUSCLE3 uses logical hwthread ids everywhere, it does not use physical ids. - -Linux knows about how hwthreads are grouped into bigger things of course. Cores are -identified in Linux using the "core id", which is listed in /proc/cpuinfo and in -/sys/devices/system/cpu/cpu/topology/core_id. So for each hwthread, identified by its -logical id, we can look up which core it is a part of. The core id is a logical id, -assigned by Linux, not by the hardware. While logical hwthread ids seem to always be -consecutive at least on the hardware I've seen so far, core ids may have gaps. - -MUSCLE3 does not use core ids, although it uses groups of hwthread ids that contain all -the hwthreads for a given core. - - -Resource binding - -Running processes need something to run on, a hwthread. The assignment of process to -hwthread is done by the operating system's scheduler: when a process is ready to run, -the scheduler will try to find it a free hwthread to run on. - -The scheduler can be constrained in which hwthreads it considers for a given process, -which is known as binding the process. This may have performance benefits, because -moving a process from one hwthread to another takes time. In MUSCLE3, when running on a -cluster, each process is assigned its own specific set of hwthreads to run on, and we -try to bind the instance to the assigned hwthreads. - -Taskset - -How this is done depends on how the instance is started. For non-MPI instances, we use a -Linux utility named 'taskset' that starts another program with a giving binding. The -binding is expressed as an *affinity mask*, a string of bits that say whether a given -processor (hwthread) can be used by the process or not. 
Each position in the string of -bits corresponds to the hwthread with that logical id. - -OpenMPI - -OpenMPI can bind cores in various ways, we use a rankfile and the --use-hwthread-cpus -option to specify the logical hwthread ids we want to bind each MPI process (rank) to. -Note that OpenMPI by default binds to cores, and can also bind to various other things -including sockets. - -MPICH - -MPICH doesn't support binding, as far as I can see. - -Intel MPI - -Intel MPI uses logical hwthread ids-based masks, specified in an environment variable, -to go with a machinefile that lists the nodes to put each process on. - -Slurm srun - -Slurm's srun has a CPU_BIND environment variable that likewise contains logical hwthread -ids-based masks, and a hostfile that lists the nodes to put each process on. - -Here are some disambiguation tables to help with the confusion: - - -``` -MUSCLE3 hwthread logical hwthread id physical hwthread id - -Linux processor processor apicid - (/proc/cpuinfo only) - -cgroups always uses these - -taskset always uses these - -hwloc PU PU L# PU P# - -OpenMPI hwthread used in rankfile if used in rankfile if - --use-hwthread-cpus rmaps_rank_file_physical - is specified MCA param set - -Intel logical logical processor - processor number - -srun used by --bind-to - -psutil logical returned by Process.cpu_affinity() - core counted by psutil.cpu_count(logical=True) -``` - - -``` -MUSCLE3 core (uses list of hwthread ids) - -Linux core core id - -Hwloc core core L# - -OpenMPI core used in rankfile if - --use-hwthread-cpus not - specified - -psutil physical counted by psutil.cpu_count(logical=False) - core -``` - -""" import logging import multiprocessing as mp from os import chdir @@ -203,7 +15,7 @@ from libmuscle.native_instantiator.agent_manager import AgentManager from libmuscle.native_instantiator.global_resources import global_resources from libmuscle.native_instantiator.run_script import make_script, prep_resources -from libmuscle.planner.planner import Resources +from libmuscle.planner.resources import OnNodeResources, Resources from ymmsl import MPICoresResReq, MPINodesResReq, ResourceRequirements, ThreadedResReq @@ -309,24 +121,22 @@ def _send_resources(self) -> None: already_logged_smt = False resources = Resources() - agent_cores = self._agent_manager.get_resources() + agent_res = self._agent_manager.get_resources() env_ncpus = dict( zip(global_resources().nodes, global_resources().logical_cpus_per_node) ) - for node in env_ncpus: - if node not in agent_cores: + for node_name in env_ncpus: + if node_name not in agent_res.nodes(): _logger.warning( - f'The environment suggests we should have node {node},' + f'The environment suggests we should have node {node_name},' ' but no agent reported running on it. 
We won''t be able' ' to use this node.') else: - resources.cores[node] = set(agent_cores[node]) - - env_nncpus = env_ncpus[node] - ag_nncores = len(agent_cores[node]) - ag_nnthreads = sum((len(ts) for ts in agent_cores[node])) + env_nncpus = env_ncpus[node_name] + ag_nncores = len(agent_res[node_name].cpu_cores) + ag_nnthreads = len(list(agent_res[node_name].hwthreads())) if ag_nncores != ag_nnthreads and ag_nnthreads == env_nncpus: if not already_logged_smt: @@ -336,29 +146,41 @@ def _send_resources(self) -> None: ' each thread or MPI process.') already_logged_smt = True + resources.add_node(agent_res[node_name]) + elif ag_nncores < env_nncpus: _logger.warning( - f'Node {node} should have {env_nncpus} cores available,' - f' but the agent reports only {ag_nncores} available to it.' - f' We\'ll use the {ag_nncores} we seem to have.') + f'Node {node_name} should have {env_nncpus} cores' + f' available, but the agent reports only {ag_nncores}' + f' available to it. We\'ll use the {ag_nncores} we seem to' + ' have.') - resources.cores[node] = set(agent_cores[node]) + resources.add_node(agent_res[node_name]) elif env_nncpus < ag_nncores: _logger.warning( - f'Node {node} should have {env_nncpus} cores available,' - f' but the agent reports {ag_nncores} available to it.' - ' Maybe the cluster does not constrain resources? We\'ll' - f' use the {env_nncpus} that we should have got.') - resources.cores[node] = set(agent_cores[node][:env_nncpus]) - - for node in agent_cores: - if node not in env_ncpus: + f'Node {node_name} should have {env_nncpus} cores' + f' available, but the agent reports {ag_nncores} available' + ' to it. Maybe the cluster does not constrain resources?' + f' We\'ll use the {env_nncpus} that we should have got.') + resources.add_node( + OnNodeResources( + node_name, + agent_res[node_name].cpu_cores.get_first_cores( + env_nncpus))) + + else: + # no SMT, agent matches environment + resources.add_node(agent_res[node_name]) + + for node in agent_res: + if node.node_name not in env_ncpus: _logger.warning( - f'An agent is running on node {node} but the environment' - ' does not list it as ours. It seems that the node\'s' - ' hostname does not match what SLURM calls it. We will not use' - ' this node, because we\'re not sure it\'s really ours.') + f'An agent is running on node {node.node_name} but the' + ' environment does not list it as ours. It seems that the' + ' node\'s hostname does not match what SLURM calls it. We will' + ' not use this node, because we\'re not sure it\'s really ours.' 
+ ) self._resources_out.put(resources) @@ -391,7 +213,7 @@ def _instantiate(self, request: InstantiationRequest) -> None: _logger.debug(f'Instantiating {name} on {request.resources}') try: self._agent_manager.start( - next(iter(request.resources.cores.keys())), + request.resources.by_rank[0].node_name, name, request.work_dir, args, env, request.stdout_path, request.stderr_path) self._processes[name].status = ProcessStatus.RUNNING diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py index c3aa2bfc..faa14a68 100644 --- a/libmuscle/python/libmuscle/native_instantiator/run_script.py +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -1,32 +1,30 @@ import logging from pathlib import Path -from typing import Dict, FrozenSet, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Tuple from libmuscle.errors import ConfigurationError from libmuscle.native_instantiator.slurm import slurm -from libmuscle.planner.planner import Resources +from libmuscle.planner.planner import ResourceAssignment from ymmsl import ( ExecutionModel, Implementation, MPICoresResReq, MPINodesResReq, ResourceRequirements, ThreadedResReq) -def direct_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: +def direct_prep_resources(resources: ResourceAssignment) -> Tuple[str, Dict[str, str]]: """Create resources for a non-MPI program with taskset. - Taskset expects a set of cores on the command line, which we put into a - MUSCLE_CORES environment variable here. + Taskset expects a set of hwthreads on the command line, either as a comma-separated + list or as a hexadecimal mask. We generate both here and set two environment + variables. Args: - resources: The resources to describe + resources: The resource assignment to describe Return: No rank file, and a set of environment variables. 
""" env: Dict[str, str] = dict() - only_node_hwthreads_list = [ - hwthread - for core in next(iter(resources.cores.values())) - for hwthread in core] + only_node_hwthreads_list = list(resources.by_rank[0].hwthreads()) env['MUSCLE_BIND_LIST'] = ','.join(map(str, only_node_hwthreads_list)) @@ -36,34 +34,33 @@ def direct_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: return '', env -def openmpi_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: +def openmpi_prep_resources(resources: ResourceAssignment) -> Tuple[str, Dict[str, str]]: """Create resource description for OpenMPI mpirun Args: - resources: The resources to describe + resources: The resource assignment to describe Return: The contents of the rankfile, and a set of environment variables """ ranklines: List[str] = list() all_cores = ( - (node, ','.join(sorted(map(str, hwthreads)))) - for node, cores in resources.cores.items() - for hwthreads in cores) + (node_res, ','.join(map(str, sorted(node_res.hwthreads())))) + for node_res in resources.by_rank) - for i, (node, hwthreads) in enumerate(all_cores): - ranklines.append(f'rank {i}={node} slot={hwthreads}') + for i, (node_res, hwthreads) in enumerate(all_cores): + ranklines.append(f'rank {i}={node_res.node_name} slot={hwthreads}') rankfile = '\n'.join(ranklines) + '\n' return rankfile, dict() -def impi_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: +def impi_prep_resources(resources: ResourceAssignment) -> Tuple[str, Dict[str, str]]: """Create resource description for Intel MPI mpirun Args: - resources: The resources to describe + resources: The resource assignment to describe Return: The contents of the machinefile, and a set of environment variables @@ -73,11 +70,11 @@ def impi_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: raise NotImplementedError() -def mpich_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: +def mpich_prep_resources(resources: ResourceAssignment) -> Tuple[str, Dict[str, str]]: """Create resource description for MPICH mpirun Args: - resources: The resources to describe + resources: The resource assignment to describe Return: The contents of the machinefile, and a set of environment variables @@ -87,7 +84,8 @@ def mpich_prep_resources(resources: Resources) -> Tuple[str, Dict[str, str]]: def srun_prep_resources( - resources: Resources, rankfile_location: Path) -> Tuple[str, Dict[str, str]]: + resources: ResourceAssignment, rankfile_location: Path + ) -> Tuple[str, Dict[str, str]]: """Create resource description for srun Args: @@ -98,18 +96,17 @@ def srun_prep_resources( The contents of the hostfile, and a set of environment variables """ hostfile = '\n'.join(( - node for node, cores in resources.cores.items() for _ in cores)) + node_res.node_name for node_res in resources.by_rank + for _ in node_res.hwthreads())) env = {'SLURM_HOSTFILE': str(rankfile_location)} - bind_list = [ - core for _, cores in resources.cores.items() for core in cores] - - def core_mask(core: FrozenSet[int]) -> str: - mask = sum((1 << hwthread) for hwthread in core) + def core_mask(hwthreads: Iterable[int]) -> str: + mask = sum((1 << hwthread) for hwthread in hwthreads) return format(mask, '#x') - bind_str = ','.join(map(core_mask, bind_list)) + bind_str = ','.join([ + core_mask(node_res.hwthreads()) for node_res in resources.by_rank]) env['SLURM_CPU_BIND'] = f'verbose,mask_cpu:{bind_str}' @@ -117,13 +114,13 @@ def core_mask(core: FrozenSet[int]) -> str: def prep_resources( - model: ExecutionModel, 
resources: Resources, rankfile_location: Path + model: ExecutionModel, resources: ResourceAssignment, rankfile_location: Path ) -> Tuple[str, Dict[str, str]]: """Create resource description for the given execution model. Args: model: The execution model to generate a description for - resources: The resources to describe + resources: The resource assignment to describe rankfile_location: Path to where the rankfile will be written Return: diff --git a/libmuscle/python/libmuscle/planner/planner.py b/libmuscle/python/libmuscle/planner/planner.py index 612a89a2..5a443a68 100644 --- a/libmuscle/python/libmuscle/planner/planner.py +++ b/libmuscle/python/libmuscle/planner/planner.py @@ -1,11 +1,12 @@ -from copy import copy, deepcopy +from copy import copy import logging -from typing import Dict, Iterable, FrozenSet, List, Mapping, Optional, Set, Tuple +from typing import Dict, Iterable, List, Mapping, Set, Tuple from ymmsl import ( Component, Configuration, Model, MPICoresResReq, MPINodesResReq, Operator, Reference, ResourceRequirements, ThreadedResReq) +from libmuscle.planner.resources import OnNodeResources, Resources from libmuscle.util import instance_indices @@ -383,125 +384,54 @@ def _calc_direct_succs_preds(self) -> None: self._direct_supersuccs[sender].add((receiver, shared_dims)) -class Resources: - """Designates a (sub)set of resources. +class ResourceAssignment: + """Assigned resources for each process of an instance. - Whether these resources are free or allocated in general or by - something specific depends on the context, this just says which - resources we're talking about. + Note that we use the classes from libmuscle.planner.resources to generically refer + to collections of resources, either to describe the available hardware or to + designate a subset of it that is occupied by a particular instance, or a subset that + isn't currently occupied. + + This class has more detailed information, because it knows for each process (MPI + rank) in the instance which subset of the overall resources for the instance it + should be on, which we need to launch it in the right place. Attributes: - cores: A dictionary mapping designated nodes to designated cores on them. Cores - are represented by sets of hwthreads they have. + by_rank: List of OnNodeResources objects containing assigned resources, + indexed by rank. """ - def __init__(self, cores: Optional[Dict[str, Set[FrozenSet[int]]]] = None) -> None: - """Create a Resources object with the given cores. + def __init__(self, by_rank: List[OnNodeResources]) -> None: + """Create a ResourceAssignment. Args: - cores: Cores to be designated by this object. + by_rank: List of OnNodeResources objects containing assigned resources, + indexed by rank. 
""" - if cores is None: - self.cores: Dict[str, Set[FrozenSet[int]]] = {} - else: - self.cores = cores - - def __copy__(self) -> 'Resources': - """Copy the object.""" - return Resources(deepcopy(self.cores)) + self.by_rank = by_rank def __eq__(self, other: object) -> bool: - """Check for equality.""" - if not isinstance(other, Resources): + if not isinstance(other, ResourceAssignment): return NotImplemented - if len(self.cores) != len(other.cores): - return False - - for node, cores in self.cores.items(): - if node not in other.cores: - return False - if other.cores[node] != cores: - return False - return True - - def __iadd__(self, other: 'Resources') -> 'Resources': - """Add the resources in the argument to this object.""" - for node in other.cores: - if node in self.cores: - self.cores[node] |= other.cores[node] - else: - self.cores[node] = set(other.cores[node]) - return self - - def __isub__(self, other: 'Resources') -> 'Resources': - """Remove the resources in the argument from this object.""" - for node in other.cores: - if node in self.cores: - self.cores[node] -= other.cores[node] - if not self.cores[node]: - del self.cores[node] - return self + return ( + len(self.by_rank) == len(other.by_rank) and + all([ + snr == onr + for snr, onr in zip(self.by_rank, other.by_rank)])) def __str__(self) -> str: - """Return a human-readable string representation.""" - def collapse_ranges(cores: Set[FrozenSet[int]]) -> str: - if len(cores) == 0: - return '' - - result = list() - hwthreads = sorted((hwthread for core in cores for hwthread in core)) - start = 0 - i = 1 - while i <= len(hwthreads): - if (i == len(hwthreads)) or (hwthreads[i-1] != hwthreads[i] - 1): - if start == i - 1: - # run of one - result.append(str(hwthreads[i-1])) - else: - # run of at least two - result.append(f'{hwthreads[start]}-{hwthreads[i-1]}') - start = i - i += 1 - return ','.join(result) - - return 'Resources(' + '; '.join([ - n + ': ' + collapse_ranges(cs) - for n, cs in self.cores.items()]) + ')' + # str(list()) uses repr() on the elements, we want str() + str_rbr = ', '.join([str(nr) for nr in self.by_rank]) + return f'[{str_rbr}]' def __repr__(self) -> str: - """Return a string representation.""" - return f'Resources({self.cores})' - - def nodes(self) -> Iterable[str]: - """Returns the nodes on which we designate resources.""" - return self.cores.keys() - - def total_cores(self) -> int: - """Returns the total number of cores (not hwthreads) designated.""" - return sum([len(cs) for cs in self.cores.values()]) - - def isdisjoint(self, other: 'Resources') -> bool: - """Returns whether we share resources with other.""" - for node, cores in self.cores.items(): - if node in other.cores: - if not cores.isdisjoint(other.cores[node]): - return False - return True - - @staticmethod - def union(resources: Iterable['Resources']) -> 'Resources': - """Combines the resources into one. + return f'ResourceAssignment({repr(self.by_rank)})' - Args: - resources: A collection of resources to merge. - - Return: - A Resources object referring to all the resources in the - input. 
- """ + def as_resources(self) -> Resources: + """Return a Resources representing the combined assigned resources.""" result = Resources() - for cur_resources in resources: - result += cur_resources + for node_res in self.by_rank: + result.merge_node(node_res) return result @@ -511,12 +441,12 @@ class InsufficientResourcesAvailable(RuntimeError): class Planner: """Allocates resources and keeps track of allocations.""" - def __init__(self, all_resources: Resources): - """Create a ResourceManager. + def __init__(self, all_resources: Resources) -> None: + """Create a Planner. Args: all_resources: An object describing the available resources - to be managed by this ResourceManager. + for the planner to use. """ self._all_resources = all_resources self._allocations: Dict[Reference, Resources] = {} @@ -525,7 +455,7 @@ def __init__(self, all_resources: Resources): def allocate_all( self, configuration: Configuration, virtual: bool = False - ) -> Dict[Reference, Resources]: + ) -> Dict[Reference, ResourceAssignment]: """Allocates resources for the given components. Allocation can occur either on a fixed set of available @@ -546,9 +476,9 @@ def allocate_all( virtual: Allocate on virtual resources or not, see above Returns: - Resources for each instance required by the model. + Assigned resources for each instance required by the model. """ - result: Dict[Reference, Resources] = {} + result: Dict[Reference, ResourceAssignment] = {} _logger.debug(f'Planning on resources {self._all_resources}') @@ -580,7 +510,7 @@ def allocate_all( done = False while not done: try: - result[instance] = self._allocate_instance( + result[instance] = self._assign_instance( instance, component, requirements[component.name], conflicting_names, virtual) @@ -686,11 +616,14 @@ def _expand_resources( """Adds an extra virtual node to the available resources.""" taken = True while taken: - new_node = 'node{:06d}'.format(self._next_virtual_node) - taken = new_node in self._all_resources.cores + new_node_name = 'node{:06d}'.format(self._next_virtual_node) + taken = new_node_name in self._all_resources.nodes() self._next_virtual_node += 1 - num_cores = len(next(iter(self._all_resources.cores.values()))) + new_node = copy(next(iter(self._all_resources))) + new_node.node_name = new_node_name + + num_cores = len(new_node.cpu_cores) if isinstance(req, ThreadedResReq): if req.threads > num_cores: raise InsufficientResourcesAvailable( @@ -704,14 +637,14 @@ def _expand_resources( f' {req.threads_per_mpi_process} threads per process,' f' which is impossible with {num_cores} cores per' ' node.') - self._all_resources.cores[new_node] = { - frozenset([i]) for i in range(num_cores)} - def _allocate_instance( + self._all_resources.add_node(new_node) + + def _assign_instance( self, instance: Reference, component: Component, requirements: ResourceRequirements, simultaneous_instances: Set[Reference], virtual: bool - ) -> Resources: + ) -> ResourceAssignment: """Allocates resources for the given instance. If we are on real resources, and the instance requires more @@ -720,7 +653,7 @@ def _allocate_instance( resources, this will raise InsufficientResourcesAvailable. 
Args: - instance: The instance to allocate for + instance: The instance to assign resources to component: The component it is an instance of requirements: Its resource requirements simultaneous_instances: Instances which may execute @@ -729,9 +662,9 @@ def _allocate_instance( virtual: Whether we are on virtual resources Returns: - A Resources object describing the resources allocated + The resources assigned to each process in the instance """ - allocation = Resources({}) + assignment = ResourceAssignment([]) free_resources = copy(self._all_resources) for other in self._allocations: @@ -741,8 +674,8 @@ def _allocate_instance( _logger.debug(f'Free resources: {free_resources}') try: if isinstance(requirements, ThreadedResReq): - allocation = self._allocate_thread_block( - free_resources, requirements.threads) + assignment.by_rank.append(self._assign_thread_block( + free_resources, requirements.threads)) elif isinstance(requirements, MPICoresResReq): if requirements.threads_per_mpi_process != 1: @@ -750,10 +683,10 @@ def _allocate_instance( 'Multiple threads per MPI process is not supported' ' yet. Please make an issue on GitHub.') for proc in range(requirements.mpi_processes): - allocation += self._allocate_thread_block( - free_resources, - requirements.threads_per_mpi_process) - free_resources -= allocation + block = self._assign_thread_block( + free_resources, requirements.threads_per_mpi_process) + assignment.by_rank.append(block) + free_resources -= Resources([block]) elif isinstance(requirements, MPINodesResReq): raise RuntimeError( @@ -764,37 +697,81 @@ def _allocate_instance( if not self._allocations and not virtual: # There are no other allocations and it's still not # enough. Just give it all and hope for the best. - _logger.warning(( - 'Instance {} requires more resources than are' - ' available in total. Oversubscribing this' - ' instance.').format(instance)) - allocation = copy(self._all_resources) + assignment = self._oversubscribe_instance(instance, requirements) else: raise - self._allocations[instance] = allocation - return allocation + self._allocations[instance] = assignment.as_resources() + return assignment - def _allocate_thread_block( - self, free_resources: Resources, threads: int) -> Resources: - """Allocate resources for a group of threads. + def _assign_thread_block( + self, free_resources: Resources, num_threads: int) -> OnNodeResources: + """Assign resources for a group of threads. - This chooses a set of cores on the same node. It - returns the allocated resources; it doesn't update - self._allocations or free_resources. + This chooses a set of cores on the same node. It returns the + assigned resources; it doesn't update self._allocations or free_resources. 
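A small sketch of what assigning a thread block to a single node amounts to (invented node name and core ids; get_first_cores is the CoreSet method added further down in this patch):

```
from libmuscle.planner.resources import Core, CoreSet, OnNodeResources

# A free node with four single-hwthread cores...
free_node = OnNodeResources('node001', CoreSet(
        [Core(1, {1}), Core(2, {2}), Core(3, {3}), Core(4, {4})]))

# ...from which a two-thread block takes the first two free cores.
block = OnNodeResources('node001', free_node.cpu_cores.get_first_cores(2))
print(block)    # OnNodeResources(node001, c: 1-2(1-2))
```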
Args: - threads: Number of cores + num_threads: Number of threads to allocate for free_resources: Available resources to allocate from Returns: - The allocated resources + The assigned resources """ - for node in free_resources.nodes(): - if len(free_resources.cores[node]) >= threads: - available_cores = sorted(free_resources.cores[node], key=sorted) + for node in free_resources: + if len(node.cpu_cores) >= num_threads: + available_cores = node.cpu_cores _logger.debug(f'available cores: {available_cores}') - to_reserve = set(available_cores[:threads]) + to_reserve = available_cores.get_first_cores(num_threads) _logger.debug(f'assigned {to_reserve}') - return Resources({node: to_reserve}) + return OnNodeResources(node.node_name, to_reserve) raise InsufficientResourcesAvailable() + + def _oversubscribe_instance( + self, instance: Reference, requirements: ResourceRequirements + ) -> ResourceAssignment: + """Oversubscribe an instance. + + This is called when all resources are available and we still cannot fit an + instance, i.e. that single instance requires more resources than we have + available in total. In that case, we're just going to map it onto the resources + we have and hope for the best, which is what this function does. + + There's a lot of repetition between this and the code above. There's probably a + cleaner way to do this, but it'll do for now. Eventually we'll have an optimiser + and all this goes away anyway. + + Args: + instance: The instance we're oversubscribing + requirements: The required resources + + Returns: + An oversubscribed resource assignment + """ + _logger.warning( + f'Instance {instance} requires more resources than are available in' + ' total. Oversubscribing this instance.') + + res_by_rank: List[OnNodeResources] = list() + + if isinstance(requirements, ThreadedResReq): + res_by_rank.append(copy(next(iter(self._all_resources)))) + + elif isinstance(requirements, MPICoresResReq): + if requirements.threads_per_mpi_process != 1: + raise RuntimeError( + 'Multiple threads per MPI process is not supported yet. Please' + ' make an issue on GitHub.') + + free_resources = copy(self._all_resources) + for proc in range(requirements.mpi_processes): + if free_resources.total_cores() < requirements.threads_per_mpi_process: + free_resources = copy(self._all_resources) + + block = self._assign_thread_block( + free_resources, requirements.threads_per_mpi_process) + + res_by_rank.append(block) + free_resources -= Resources([block]) + + return ResourceAssignment(res_by_rank) diff --git a/libmuscle/python/libmuscle/planner/resources.py b/libmuscle/python/libmuscle/planner/resources.py new file mode 100644 index 00000000..0e1dd41a --- /dev/null +++ b/libmuscle/python/libmuscle/planner/resources.py @@ -0,0 +1,647 @@ +"""Module for describing compute resources + +There's a huge comment here because there's a big mess here that took me forever to +figure out, so now I'm going to document it for the future. + + +Identifying hardware resources + +Today's computers all contain multi-core CPUs, often with symmetric multithreading +(SMT), also known as hyperthreading. This means that we have hardware threads +(hwthreads) and also cores, and then there's caches and memory as well but we're not +going into NUMA here. + +Cores and hwthreads are identified by number, but they have multiple different numbers +that are referred to by different names in different contexts, making everything very +confusing. So here are some definitions to disambiguate things. 
Note that this is still +a rather simplified representation, but it's enough for what we're doing here in +MUSCLE3. + + +Hardware threads + +A *hardware thread (hwthread)* is, at the hardware level, an instruction decoder. It +points to wherever in the code we are currently executing, and it can read the next +couple of instructions and figure out how to execute them. It can't actually execute +anything however, because it doesn't have the hardware that does that. + +Intel refers to hwthreads as "logical processors" and so does Linux, hwloc calls them +"processing units" or PUs and so does OpenMPI unless it uses the term hwthread just to +confuse things a bit more. + +Cores + +A *core* contains at least one hwthread, and at least one functional unit, which is a +hardware component that actually does calculations and other data processing. Within a +core, the hwthread(s) read instructions and pass them to the functional units to be +executed. If a core has more than one hwthread, then the CPU supports SMT. + +Intel refers to cores as "physical processors", hwloc calls them cores and so do most +other sources. We'll use cores here. + +Since a hwthread cannot do anything on its own, it's always part of a core. + +CPUs + +The term CPU is used in many ways by various bits of documentation, sometimes referring +to a hwthread or a core, but here we'll take it to mean a collection of cores in a +plastic box. Similar terms are *package* (referring to that plastic box with very many +metal pins) and *socket* (the thing the package mounts into), or *processor*, which was +originally used to refer to all of the above when CPUs still had only one core with only +one hwthread, and has now become ambiguous. + +Weird things can happen here, I've seen CPUs that as far as I can tell are a single +package, but nevertheless claim to have two sockets. I suspect that that's two physical +chips in a single plastic box, but I don't know for sure. + +Here, we're concerned with hwthreads and cores and how to identify them and assign +instances to them. + + +Linux + +On modern operating systems, hardware access is mediated by the operating system, and +we're mainly concerned with Linux here because that is what all the clusters are running +(see the note on macOS below). Information about the CPU(s) can be obtained on Linux +from the /proc/cpuinfo file, or equivalently but more modernly, from the files in +/sys/devices/system/cpu/cpu/topology/. + +Linux collects information about processors because it needs to run processes (programs, +software threads) on them on behalf of the user. Processes are assigned to hwthreads, so +that is what Linux considers a *processor*. /proc/cpuinfo lists all these processors, +and they each have their own directory /sys/devices/system/cpu/cpu. + +On Linux, processors have an id, which is that number in the directory, and is +listed under "processor" in /proc/cpuinfo. Since this number identifies a hwthread and +is assigned by Linux rather than being baked into the hardware, I'm calling it a +"logical hwthread id", this being a logical id of a hwthread, not an id of a logical +hwthread. It's also the id of a logical processor in Intel-speak. + +Hwthreads actually have a second number associated with them, which does come from the +hardware. In /proc/cpuinfo, that's listed under "apicid"; it doesn't seem to be +available from sysfs. Hwloc call this the "physical PU (its name for a hwthread) id", +and OpenMPI's mpirun manpage also refers to it as a "physical processor location". 
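For concreteness, here is a small sketch (illustrative only, and not necessarily what the MUSCLE3 agent does) that builds the logical-hwthread-id to core-id mapping described above from sysfs on a Linux machine with a single CPU package:

```
from collections import defaultdict
from glob import glob
import re

# core id -> set of logical hwthread ids (assumes a single CPU package,
# since core ids are only unique within a package)
cores = defaultdict(set)
for path in glob('/sys/devices/system/cpu/cpu[0-9]*/topology/core_id'):
    hwthread_id = int(re.search(r'cpu(\d+)', path).group(1))
    with open(path) as f:
        core_id = int(f.read())
    cores[core_id].add(hwthread_id)

print(dict(cores))    # e.g. {0: {0, 4}, 1: {1, 5}, ...} with SMT enabled
```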
+ +There's great potential for confusion here: the "physical PU id" and "physical processor +location" both identify a hardware-specified number (a physical id or a physical +location) for a hwthread. This is something completely different than what Intel calls a +"physical processor", which they use to refer to a core. + +MUSCLE3 uses logical hwthread ids everywhere, it does not use physical ids. + +Linux knows about how hwthreads are grouped into bigger things of course. Cores are +identified in Linux using the "core id", which is listed in /proc/cpuinfo and in +/sys/devices/system/cpu/cpu/topology/core_id. So for each hwthread, identified by its +logical id, we can look up which core it is a part of. The core id is a logical id, +assigned by Linux, not by the hardware. While logical hwthread ids seem to always be +consecutive at least on the hardware I've seen so far, core ids may have gaps. + +MUSCLE3 does not use core ids, although it uses groups of hwthread ids that contain all +the hwthreads for a given core. + + +Resource binding + +Running processes need something to run on, a hwthread. The assignment of process to +hwthread is done by the operating system's scheduler: when a process is ready to run, +the scheduler will try to find it a free hwthread to run on. + +The scheduler can be constrained in which hwthreads it considers for a given process, +which is known as binding the process. This may have performance benefits, because +moving a process from one hwthread to another takes time. In MUSCLE3, when running on a +cluster, each process is assigned its own specific set of hwthreads to run on, and we +try to bind the instance to the assigned hwthreads. + +Taskset + +How this is done depends on how the instance is started. For non-MPI instances, we use a +Linux utility named 'taskset' that starts another program with a giving binding. The +binding is expressed as an *affinity mask*, a string of bits that say whether a given +processor (hwthread) can be used by the process or not. Each position in the string of +bits corresponds to the hwthread with that logical id. + +OpenMPI + +OpenMPI can bind cores in various ways, we use a rankfile and the --use-hwthread-cpus +option to specify the logical hwthread ids we want to bind each MPI process (rank) to. +Note that OpenMPI by default binds to cores, and can also bind to various other things +including sockets. + +MPICH + +MPICH doesn't support binding, as far as I can see. + +Intel MPI + +Intel MPI uses logical hwthread ids-based masks, specified in an environment variable, +to go with a machinefile that lists the nodes to put each process on. + +Slurm srun + +Slurm's srun has a CPU_BIND environment variable that likewise contains logical hwthread +ids-based masks, and a hostfile that lists the nodes to put each process on. 
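A tiny sketch of the hwthread-id based masks mentioned for taskset and srun above, matching the mask construction used for SLURM_CPU_BIND elsewhere in this patch (the hwthread ids are made up):

```
def core_mask(hwthreads):
    # bit i of the mask is set iff hwthread i may be used
    mask = sum(1 << hwthread for hwthread in hwthreads)
    return format(mask, '#x')

print(core_mask({0, 1}))        # 0x3
print(core_mask({4, 5}))        # 0x30
print(core_mask({0, 1, 8, 9}))  # 0x303
```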
+ +Here are some disambiguation tables to help with the confusion: + + +``` +MUSCLE3 hwthread logical hwthread id physical hwthread id + +Linux processor processor apicid + (/proc/cpuinfo only) + +cgroups always uses these + +taskset always uses these + +hwloc PU PU L# PU P# + +OpenMPI hwthread used in rankfile if used in rankfile if + --use-hwthread-cpus rmaps_rank_file_physical + is specified MCA param set + +Intel logical logical processor + processor number + +srun used by --bind-to + +psutil logical returned by Process.cpu_affinity() + core counted by psutil.cpu_count(logical=True) +``` + + +``` +MUSCLE3 core core id + +Linux core core id + +Hwloc core core L# + +OpenMPI core used in rankfile if + --use-hwthread-cpus not + specified + +psutil physical counted by psutil.cpu_count(logical=False) + core +``` + +""" +from copy import copy, deepcopy +from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple + + +class Core: + """Describes a CPU core or designates a core or one or more hwthreads. + + A core is a group of functional units with one or more instruction decoders. If the + core supports symmetric multithreading (SMT, aka hyperthreading) then there will be + more than one instruction decoder or hardware thread in the core. + + Note that the term "logical CPU" refers to an instruction decoder/hwthread. If the + processor does not support SMT, then each core has a single decoder and so a logical + CPU is also a core. + + This class can be used in different ways with slighly different interpretations. + When describing hardware resources, it describes a core and all of its hwthreads. In + this case, cid is the core id, and hwthreads contains the hwthread ids of all + hwthreads on this core. If no SMT is supported, then there will be only one + hwthread id. + + When designating a whole core (e.g. for use by a process), cid is set to the id of + the core, and hwthreads contains all of the hwthreads on that core. When designating + a hwthread on a particular core, cid is set to the id of the core and hwthreads + contains the designated (single) hwthread. + + MUSCLE3 never assigns swthreads to subsets of hwthreads on a core, it assigns them + to either a single hwthread or a single whole core. So if more than one hwthread is + given, then we can assume that those are all the hwthreads on that core. + + Objects of this class automatically deepcopy when copied. This means that you can + make a copy using copy.copy() and modify that copy anywhere without changing the + original. 
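As a quick illustration of the two ways of using Core described above (a sketch with made-up ids, not part of the patch):

```
from copy import copy
from libmuscle.planner.resources import Core

# Describing core 2 of a CPU with SMT: it has hwthreads 2 and 10.
whole_core = Core(2, {2, 10})

# Designating only hwthread 10 on that core for some process:
one_hwthread = Core(2, {10})

# Subtracting the designation leaves hwthread 2 available on this core.
remaining = copy(whole_core)
remaining -= one_hwthread
print(remaining)    # 2(2)
```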
+ + Args: + cid: ID of this core, to be used to refer to it + hwthreads: Ids of hwthreads (logical CPUs) belonging to this core + """ + def __init__(self, cid: int, hwthreads: Set[int]) -> None: + """Create a Core""" + self.cid = cid + self.hwthreads = copy(hwthreads) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, Core): + return NotImplemented + + return self.cid == other.cid and self.hwthreads == other.hwthreads + + def __len__(self) -> int: + return len(self.hwthreads) + + def __copy__(self) -> 'Core': + return Core(self.cid, self.hwthreads) + + def __or__(self, other: object) -> 'Core': + if not isinstance(other, Core): + return NotImplemented + + if other.cid != self.cid: + raise ValueError('Cannot merge hwthreads on different cores') + + return Core(self.cid, self.hwthreads | other.hwthreads) + + def __ior__(self, other: object) -> 'Core': + if not isinstance(other, Core): + return NotImplemented + + if other.cid != self.cid: + raise ValueError('Cannot merge hwthreads on different cores') + + self.hwthreads |= other.hwthreads + return self + + def __isub__(self, other: object) -> 'Core': + if not isinstance(other, Core): + return NotImplemented + + if other.cid != self.cid: + raise ValueError('Cannot merge hwthreads on different cores') + + self.hwthreads -= other.hwthreads + return self + + def __str__(self) -> str: + hwthreads = ','.join(map(str, sorted(self.hwthreads))) + return f'{self.cid}({hwthreads})' + + def __repr__(self) -> str: + hwthreads = ','.join(map(str, sorted(self.hwthreads))) + return f'Core({self.cid}, {{{hwthreads}}})' + + def isdisjoint(self, other: 'Core') -> bool: + """Returns whether we share resources with other.""" + if self.cid != other.cid: + raise ValueError('Cannot compare hwthreads on different cores') + + return self.hwthreads.isdisjoint(other.hwthreads) + + +class CoreSet: + """A set of cores on a single node. + + This exists to make it a bit easier to operate on sets of cores, merging and + subtracting them. + + Objects of this class automatically deepcopy when copied. This means that you can + make a copy using copy.copy() and modify that copy anywhere without changing the + original. + """ + def __init__(self, cores: Iterable[Core]) -> None: + """Create a CoreSet + + Args: + cores: A set of cores to contain. 
+ """ + self._cores = {c.cid: c for c in cores} + + def __eq__(self, other: object) -> bool: + if not isinstance(other, CoreSet): + return NotImplemented + + if len(self._cores) != len(other._cores): + return False + + for cid, core in self._cores.items(): + if cid not in other._cores: + return False + if core.hwthreads != other._cores[cid].hwthreads: + return False + + return True + + def __len__(self) -> int: + return len(self._cores) + + def __iter__(self) -> Iterator[Core]: + return iter(self._cores.values()) + + def __copy__(self) -> 'CoreSet': + return CoreSet(deepcopy(list(self._cores.values()))) + + def __ior__(self, other: object) -> 'CoreSet': + if not isinstance(other, CoreSet): + return NotImplemented + + for cid, core in other._cores.items(): + if cid in self._cores: + self._cores[cid] |= core + else: + self._cores[cid] = copy(core) + + return self + + def __isub__(self, other: object) -> 'CoreSet': + if not isinstance(other, CoreSet): + return NotImplemented + + for cid, core in other._cores.items(): + if cid in self._cores: + self._cores[cid] -= core + if not self._cores[cid].hwthreads: + del self._cores[cid] + + return self + + def __str__(self) -> str: + def collapse_ranges(ids: List[int]) -> str: + if len(ids) == 0: + return '' + + result = list() + start = 0 + i = 1 + while i <= len(ids): + if (i == len(ids)) or (ids[i-1] != ids[i] - 1): + if start == i - 1: + # run of one + result.append(str(ids[i-1])) + else: + # run of at least two + result.append(f'{ids[start]}-{ids[i-1]}') + start = i + i += 1 + return ','.join(result) + + cores = sorted((c.cid for c in self._cores.values())) + hwthreads = sorted((t for c in self._cores.values() for t in c.hwthreads)) + + return f'{collapse_ranges(cores)}({collapse_ranges(hwthreads)})' + + def __repr__(self) -> str: + cores = ', '.join(map(repr, sorted(self._cores.values(), key=lambda c: c.cid))) + return f'CoreSet({{{cores}}})' + + def isdisjoint(self, other: 'CoreSet') -> bool: + """Returns whether we share resources with other.""" + for cid, core in self._cores.items(): + if cid in other._cores: + if not core.isdisjoint(other._cores[cid]): + return False + return True + + def get_first_cores(self, num_cores: int) -> 'CoreSet': + """Returns the first num_cores cores in this set. + + Args: + The number of cores to select. + """ + result = copy(self) + cids = list(self._cores.keys()) + selected = cids[:num_cores] + if len(selected) < num_cores: + raise RuntimeError('Tried to get more cores than available') + + result._cores = {c.cid: c for c in result._cores.values() if c.cid in selected} + return result + + +class OnNodeResources: + """Resources on a single node, currently only CPU cores. + + This represents a set of resources on a single node, either all of the resources + available or some subset of interest. + + Objects of this class automatically deepcopy when copied. This means that you can + make a copy using copy.copy() and modify that copy anywhere without changing the + original. + """ + def __init__(self, node_name: str, cpu_cores: CoreSet) -> None: + """Create an OnNodeResources. + + Args: + name: (Host)name of the node. + cpu_cores: A set of cores for this node. 
+ """ + self.node_name = node_name + self.cpu_cores = cpu_cores + + def __eq__(self, other: object) -> bool: + if not isinstance(other, OnNodeResources): + return NotImplemented + + return ( + isinstance(other, OnNodeResources) and + self.node_name == other.node_name and + self.cpu_cores == other.cpu_cores) + + def __copy__(self) -> 'OnNodeResources': + return OnNodeResources(self.node_name, copy(self.cpu_cores)) + + def __ior__(self, other: object) -> 'OnNodeResources': + if not isinstance(other, OnNodeResources): + return NotImplemented + + if self.node_name != other.node_name: + raise ValueError('Cannot merge resources on different nodes') + + self.cpu_cores |= other.cpu_cores + return self + + def __isub__(self, other: object) -> 'OnNodeResources': + if not isinstance(other, OnNodeResources): + return NotImplemented + + if self.node_name != other.node_name: + raise ValueError('Cannot remove resources on different nodes') + + self.cpu_cores -= other.cpu_cores + return self + + def __str__(self) -> str: + return f'OnNodeResources({self.node_name}, c: {str(self.cpu_cores)})' + + def __repr__(self) -> str: + return f'OnNodeResources("{self.node_name}", {repr(self.cpu_cores)})' + + def hwthreads(self) -> Iterable[int]: + """Return the hwthreads in this node.""" + return (thread for core in self.cpu_cores for thread in core.hwthreads) + + def total_cores(self) -> int: + """Return the number of CPU cores in this node.""" + return len(self.cpu_cores) + + def isdisjoint(self, other: 'OnNodeResources') -> bool: + """Returns whether we share resources with other.""" + return ( + self.node_name != other.node_name or + self.cpu_cores.isdisjoint(other.cpu_cores)) + + +class Resources: + """Designates a (sub)set of resources. + + Whether these resources are free or allocated in general or by something specific + depends on the context, this just says which resources we're talking about. + + Objects of this class automatically deepcopy when copied. This means that you can + make a copy using copy.copy() and modify that copy anywhere without changing the + original. + + Attributes: + nodes: A collection of nodes to include in this resource set + """ + def __init__(self, nodes: Optional[Iterable[OnNodeResources]] = None) -> None: + """Create a Resources object with the given nodes. + + Args: + nodes: OnNodeResourcess to be designated by this object. 
+ """ + if nodes is None: + self._nodes: Dict[str, OnNodeResources] = {} + else: + self._nodes = {n.node_name: n for n in nodes} + + def __len__(self) -> int: + return len(self._nodes) + + def __iter__(self) -> Iterator[OnNodeResources]: + return iter(self._nodes.values()) + + def __getitem__(self, node_name: str) -> OnNodeResources: + return self._nodes[node_name] + + def __eq__(self, other: object) -> bool: + """Check for equality.""" + if not isinstance(other, Resources): + return NotImplemented + + if len(self._nodes) != len(other._nodes): + return False + + for node_name, node in self._nodes.items(): + if node_name not in other._nodes: + return False + if other._nodes[node_name] != node: + return False + + return True + + def __copy__(self) -> 'Resources': + """Copy the object.""" + return Resources((copy(n) for n in self._nodes.values())) + + def __ior__(self, other: object) -> 'Resources': + """Add the resources in the argument to this object.""" + if not isinstance(other, Resources): + return NotImplemented + + for node_name, other_node in other._nodes.items(): + if node_name in self._nodes: + self._nodes[node_name] |= other_node + else: + self._nodes[node_name] = copy(other_node) + + return self + + def __isub__(self, other: object) -> 'Resources': + """Remove the resources in the argument from this object.""" + if not isinstance(other, Resources): + return NotImplemented + + for node_name, other_node in other._nodes.items(): + if node_name in self._nodes: + self._nodes[node_name] -= other_node + if not self._nodes[node_name]: + del self._nodes[node_name] + + return self + + def __str__(self) -> str: + """Return a human-readable string representation.""" + nodes = ','.join( + map(str, sorted(self._nodes.values(), key=lambda n: n.node_name))) + return f'Resources({nodes})' + + def __repr__(self) -> str: + """Return a string representation.""" + nodes = sorted(self._nodes.values(), key=lambda n: n.node_name) + return f'Resources({nodes})' + + def nodes(self) -> Iterable[str]: + """Return the names of the nodes on which we designate resources.""" + return self._nodes.keys() + + def total_cores(self) -> int: + """Return the total number of cores (not hwthreads) designated.""" + return sum((len(n.cpu_cores) for n in self._nodes.values())) + + def cores(self) -> Iterable[Tuple[str, int]]: + """Return this resources as a list of node, core.""" + return ( + (node.node_name, core.cid) + for node in self._nodes.values() for core in node.cpu_cores) + + def hwthreads(self) -> Iterable[Tuple[str, int]]: + """Return this resources as a list of node, hwthread.""" + return ( + (node.node_name, hwthread) + for node in self._nodes.values() for hwthread in node.hwthreads()) + + def isdisjoint(self, other: 'Resources') -> bool: + """Return whether we share resources with other.""" + for node_name, node in self._nodes.items(): + if node_name in other._nodes: + if not node.isdisjoint(other._nodes[node_name]): + return False + return True + + def add_node(self, node_res: OnNodeResources) -> None: + """Add a node's resources. + + This absorbs node_res into this Resources object, so if you change node_res + after adding it, the changes will be reflected in this Resources. + + Args: + node_res: Resources on a node not yet included in this Resources. + + Raises: + RuntimeError: if we already have a node with this node name. + """ + if node_res.node_name in self._nodes: + raise RuntimeError( + 'Tried to add a OnNodeResources to a Resources for a node that is' + ' already present. 
This is a bug in MUSCLE3, please report it on' + ' GitHub.') + + self._nodes[node_res.node_name] = node_res + + def merge_node(self, node_res: OnNodeResources) -> None: + """Merges a node's resources + + This always copies the object. + + Args: + node_res: Resources on a node that may already be included in this + Resources. + """ + if node_res.node_name in self._nodes: + self._nodes[node_res.node_name] |= node_res + else: + self._nodes[node_res.node_name] = copy(node_res) + + @staticmethod + def union(resources: Iterable['Resources']) -> 'Resources': + """Combines the resources into one. + + Args: + resources: A collection of resources to merge. + + Return: + A Resources object referring to all the resources in the + input. + """ + result = Resources() + for cur_resources in resources: + result |= cur_resources + return result diff --git a/libmuscle/python/libmuscle/planner/test/test_planner.py b/libmuscle/python/libmuscle/planner/test/test_planner.py index 25883aab..273b0c7f 100644 --- a/libmuscle/python/libmuscle/planner/test/test_planner.py +++ b/libmuscle/python/libmuscle/planner/test/test_planner.py @@ -1,7 +1,3 @@ -from libmuscle.planner.planner import ( - InsufficientResourcesAvailable, ModelGraph, Planner, Resources) - -from copy import copy import pytest from typing import Dict, List @@ -9,15 +5,22 @@ Component, Conduit, Configuration, Implementation, Model, MPICoresResReq, Ports, Reference, ResourceRequirements, ThreadedResReq) -from libmuscle.test.conftest import frozenset_of as s +from libmuscle.planner.planner import ( + InsufficientResourcesAvailable, ModelGraph, Planner, ResourceAssignment) +from libmuscle.planner.resources import Resources + +from libmuscle.test.conftest import core as c, on_node_resources as onr, resources + + +Ref = Reference @pytest.fixture def all_resources() -> Resources: - return Resources({ - 'node001': {s(1), s(2), s(3), s(4)}, - 'node002': {s(1), s(2), s(3), s(4)}, - 'node003': {s(1), s(2), s(3), s(4)}}) + return resources({ + 'node001': [c(1), c(2), c(3), c(4)], + 'node002': [c(1), c(2), c(3), c(4)], + 'node003': [c(1), c(2), c(3), c(4)]}) @pytest.fixture @@ -51,17 +54,17 @@ def model(init: Component, macro: Component, micro: Component) -> Model: @pytest.fixture def implementations() -> List[Implementation]: return [ - Implementation(Reference('init'), script='init'), - Implementation(Reference('macro'), script='macro'), - Implementation(Reference('micro'), script='micro')] + Implementation(Ref('init'), script='init'), + Implementation(Ref('macro'), script='macro'), + Implementation(Ref('micro'), script='micro')] @pytest.fixture def requirements() -> Dict[Reference, ResourceRequirements]: res_list = [ - ThreadedResReq(Reference('init'), 4), - ThreadedResReq(Reference('macro'), 4), - ThreadedResReq(Reference('micro'), 4)] + ThreadedResReq(Ref('init'), 4), + ThreadedResReq(Ref('macro'), 4), + ThreadedResReq(Ref('micro'), 4)] return {r.name: r for r in res_list} @@ -72,6 +75,13 @@ def configuration( return Configuration(model, None, implementations, requirements) +@pytest.fixture +def assignment() -> ResourceAssignment: + return ResourceAssignment([ + onr('node001', {0, 1}), + onr('node002', {2, 3})]) + + def test_model_graph( init: Component, macro: Component, micro: Component, model: Model ) -> None: @@ -95,51 +105,51 @@ def test_model_graph( assert not graph.successors(micro) -def test_resources(all_resources: Resources) -> None: - res1 = all_resources - assert res1.cores == { - 'node001': {s(1), s(2), s(3), s(4)}, - 'node002': {s(1), s(2), s(3), 
s(4)}, - 'node003': {s(1), s(2), s(3), s(4)}} - assert set(res1.nodes()) == {'node001', 'node002', 'node003'} +def test_resource_assignment_eq() -> None: + asm1 = ResourceAssignment([]) + asm2 = ResourceAssignment([]) + + assert asm1 == asm2 - res2 = Resources({ - 'node004': {s(1), s(2), s(3), s(4), s(5), s(6)}, - 'node005': {s(1), s(2), s(3), s(4), s(5), s(6)}}) - res1 += res2 + asm1.by_rank.append(onr('node001', {0, 1})) + assert asm1 != asm2 - assert res1.cores == { - 'node001': {s(1), s(2), s(3), s(4)}, - 'node002': {s(1), s(2), s(3), s(4)}, - 'node003': {s(1), s(2), s(3), s(4)}, - 'node004': {s(1), s(2), s(3), s(4), s(5), s(6)}, - 'node005': {s(1), s(2), s(3), s(4), s(5), s(6)}} + asm2.by_rank.append(onr('node001', {0, 2})) + assert asm1 != asm2 - res3 = Resources({ - 'node003': {s(1), s(2), s(3), s(4)}, 'node005': {s(4), s(5), s(6)}}) - res1 -= res3 + asm2.by_rank[0] = onr('node001', {0, 1}) + assert asm1 == asm2 - assert res1.cores == { - 'node001': {s(1), s(2), s(3), s(4)}, - 'node002': {s(1), s(2), s(3), s(4)}, - 'node004': {s(1), s(2), s(3), s(4), s(5), s(6)}, - 'node005': {s(1), s(2), s(3)}} - assert res1.nodes() == { - 'node001', 'node002', 'node004', 'node005'} - res4 = copy(res3) - res4.cores['node003'] = {s(8)} +def test_resource_assignment_str(assignment: ResourceAssignment) -> None: + assert str(assignment) == ( + '[OnNodeResources(node001, c: 0-1(0-1)),' + ' OnNodeResources(node002, c: 2-3(2-3))]') - assert res3.cores['node003'] == {s(1), s(2), s(3), s(4)} - assert res4.cores['node003'] == {s(8)} - all_resources = Resources.union([res1, res2, res3, res4]) +def test_resource_assignment_repr(assignment: ResourceAssignment) -> None: + assert repr(assignment) == ( + 'ResourceAssignment([' + 'OnNodeResources("node001", CoreSet({Core(0, {0}), Core(1, {1})})),' + ' OnNodeResources("node002", CoreSet({Core(2, {2}), Core(3, {3})}))])') - assert all_resources.cores['node001'] == {s(1), s(2), s(3), s(4)} - assert all_resources.cores['node002'] == {s(1), s(2), s(3), s(4)} - assert all_resources.cores['node003'] == {s(1), s(2), s(3), s(4), s(8)} - assert all_resources.cores['node004'] == {s(1), s(2), s(3), s(4), s(5), s(6)} - assert all_resources.cores['node005'] == {s(1), s(2), s(3), s(4), s(5), s(6)} + +def test_resource_assignment_as_resources(assignment) -> None: + res = assignment.as_resources() + + assert res._nodes == { + 'node001': onr('node001', {0, 1}), + 'node002': onr('node002', {2, 3})} + + asm2 = ResourceAssignment([ + onr('node001', {0, 1}), onr('node001', {2, 3}), onr('node001', {2, 3}), + onr('node003', {4, 5})]) + + res = asm2.as_resources() + + assert res._nodes == { + 'node001': onr('node001', {0, 1, 2, 3}), + 'node003': onr('node003', {4, 5})} def test_planner( @@ -147,42 +157,31 @@ def test_planner( planner = Planner(all_resources) allocations = planner.allocate_all(configuration) - assert allocations[Reference('init')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('macro')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('micro')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} + assert allocations[Ref('init')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('macro')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('micro')].by_rank == [onr('node001', {1, 2, 3, 4})] def test_planner_exclusive_macro( all_resources: Resources, configuration: Configuration) -> None: planner = Planner(all_resources) - 
configuration.implementations[Reference('macro')].can_share_resources = ( - False) + configuration.implementations[Ref('macro')].can_share_resources = False allocations = planner.allocate_all(configuration) - assert allocations[Reference('init')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('macro')].cores == { - 'node002': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('micro')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} + assert allocations[Ref('init')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('macro')].by_rank == [onr('node002', {1, 2, 3, 4})] + assert allocations[Ref('micro')].by_rank == [onr('node001', {1, 2, 3, 4})] def test_planner_exclusive_predecessor( all_resources: Resources, configuration: Configuration) -> None: planner = Planner(all_resources) - configuration.implementations[Reference('init')].can_share_resources = ( - False) + configuration.implementations[Ref('init')].can_share_resources = False allocations = planner.allocate_all(configuration) - assert allocations[Reference('init')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('macro')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('micro')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} + assert allocations[Ref('init')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('macro')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('micro')].by_rank == [onr('node001', {1, 2, 3, 4})] def test_oversubscribe( @@ -194,97 +193,84 @@ def test_oversubscribe( planner = Planner(all_resources) allocations = planner.allocate_all(configuration) - assert allocations[Reference('init[0]')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('init[1]')].cores == { - 'node002': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('init[2]')].cores == { - 'node003': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('init[3]')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('init[4]')].cores == { - 'node002': {s(1), s(2), s(3), s(4)}} - - assert allocations[Reference('macro[0]')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('macro[1]')].cores == { - 'node002': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('macro[2]')].cores == { - 'node003': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('macro[3]')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('macro[4]')].cores == { - 'node002': {s(1), s(2), s(3), s(4)}} - - assert allocations[Reference('micro[0]')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('micro[1]')].cores == { - 'node002': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('micro[2]')].cores == { - 'node003': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('micro[3]')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} - assert allocations[Reference('micro[4]')].cores == { - 'node002': {s(1), s(2), s(3), s(4)}} + assert allocations[Ref('init[0]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('init[1]')].by_rank == [onr('node002', {1, 2, 3, 4})] + assert allocations[Ref('init[2]')].by_rank == [onr('node003', {1, 2, 3, 4})] + assert allocations[Ref('init[3]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('init[4]')].by_rank == [onr('node002', {1, 2, 3, 4})] + + assert 
allocations[Ref('macro[0]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('macro[1]')].by_rank == [onr('node002', {1, 2, 3, 4})] + assert allocations[Ref('macro[2]')].by_rank == [onr('node003', {1, 2, 3, 4})] + assert allocations[Ref('macro[3]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('macro[4]')].by_rank == [onr('node002', {1, 2, 3, 4})] + + assert allocations[Ref('micro[0]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('micro[1]')].by_rank == [onr('node002', {1, 2, 3, 4})] + assert allocations[Ref('micro[2]')].by_rank == [onr('node003', {1, 2, 3, 4})] + assert allocations[Ref('micro[3]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('micro[4]')].by_rank == [onr('node002', {1, 2, 3, 4})] def test_oversubscribe_single_instance_threaded() -> None: model = Model('single_instance', [Component('x', 'x', ports=Ports())]) - impl = [Implementation(Reference('x'), script='x')] + impl = [Implementation(Ref('x'), script='x')] reqs: Dict[Reference, ResourceRequirements] = { - Reference('x'): ThreadedResReq(Reference('x'), 24)} + Ref('x'): ThreadedResReq(Ref('x'), 24)} config = Configuration(model, None, impl, reqs) - res = Resources({'node001': {s(1), s(2), s(3), s(4)}}) + res = resources({'node001': [c(1), c(2), c(3), c(4)]}) planner = Planner(res) allocations = planner.allocate_all(config) - assert allocations[Reference('x')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} + assert allocations[Ref('x')].by_rank == [onr('node001', {1, 2, 3, 4})] def test_oversubscribe_single_instance_mpi() -> None: model = Model('single_instance', [Component('x', 'x', ports=Ports())]) - impl = [Implementation(Reference('x'), script='x')] + impl = [Implementation(Ref('x'), script='x')] reqs: Dict[Reference, ResourceRequirements] = { - Reference('x'): MPICoresResReq(Reference('x'), 24)} + Ref('x'): MPICoresResReq(Ref('x'), 24)} config = Configuration(model, None, impl, reqs) - res = Resources({'node001': {s(1), s(2), s(3), s(4)}}) + res = resources({'node001': [c(1), c(2), c(3), c(4)]}) planner = Planner(res) allocations = planner.allocate_all(config) - assert allocations[Reference('x')].cores == { - 'node001': {s(1), s(2), s(3), s(4)}} + assert len(allocations[Ref('x')].by_rank) == 24 + for r in range(24): + assert allocations[Ref('x')].by_rank[r] == onr('node001', {r % 4 + 1}) def test_virtual_allocation() -> None: model = Model('ensemble', [Component('x', 'x', 9, ports=Ports())]) - impl = [Implementation(Reference('x'), script='x')] - reqs: Dict[Reference, ResourceRequirements] = { - Reference('x'): MPICoresResReq(Reference('x'), 13)} + impl = [Implementation(Ref('x'), script='x')] + reqs: Dict[Ref, ResourceRequirements] = { + Ref('x'): MPICoresResReq(Ref('x'), 13)} config = Configuration(model, None, impl, reqs) - res = Resources({'node000001': {s(1), s(2), s(3), s(4)}}) + res = resources({'node000001': [c(1), c(2), c(3), c(4)]}) planner = Planner(res) allocations = planner.allocate_all(config, virtual=True) assert res.total_cores() == 120 - assert allocations[Reference('x[0]')].total_cores() == 13 - assert allocations[Reference('x[8]')].total_cores() == 13 + for i in range(9): + for r in range(13): + assert len(allocations[Ref(f'x[{i}]')].by_rank) == 13 + assert allocations[Ref(f'x[{i}]')].by_rank[r].total_cores() == 1 def test_impossible_virtual_allocation() -> None: model = Model('ensemble', [Component('x', 'x', 9, ports=Ports())]) - impl = [Implementation(Reference('x'), script='x')] - reqs: Dict[Reference, 
ResourceRequirements] = { - Reference('x'): ThreadedResReq(Reference('x'), 13)} + impl = [Implementation(Ref('x'), script='x')] + reqs: Dict[Ref, ResourceRequirements] = { + Ref('x'): ThreadedResReq(Ref('x'), 13)} config = Configuration(model, None, impl, reqs) - res = Resources({'node000001': {s(1), s(2), s(3), s(4)}}) + res = resources({'node000001': [c(1), c(2), c(3), c(4)]}) planner = Planner(res) with pytest.raises(InsufficientResourcesAvailable): diff --git a/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py b/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py index f1f5b02a..13ec5ce3 100644 --- a/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py +++ b/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py @@ -1,17 +1,15 @@ from copy import deepcopy -from libmuscle.planner.planner import ModelGraph, Planner, Resources - -from typing import Dict, FrozenSet, Tuple +from typing import Dict, Tuple import pytest from ymmsl import ( Component, Conduit, Configuration, Implementation, Model, MPICoresResReq, Ports, Reference, ResourceRequirements, ThreadedResReq) +from libmuscle.planner.planner import ModelGraph, Planner, ResourceAssignment +from libmuscle.planner.resources import Resources -def c(hwthread_id: int) -> FrozenSet[int]: - """Helper that defines a core with the given hwthread id.""" - return frozenset({hwthread_id}) +from libmuscle.test.conftest import core as c, on_node_resources as onr, resources _ResReqs = Dict[Reference, ResourceRequirements] @@ -43,12 +41,12 @@ def c(hwthread_id: int) -> FrozenSet[int]: s0_model, None, s0_implementations, s0_requirements) -s0_resources = Resources({'node001': {c(0), c(1), c(2), c(3)}}) +s0_resources = resources({'node001': [c(0), c(1), c(2), c(3)]}) s0_solution = { - Reference('macro'): Resources({'node001': {c(0), c(1)}}), - Reference('micro'): Resources({'node001': {c(2), c(3)}})} + Reference('macro'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('micro'): ResourceAssignment([onr('node001', {2, 3})])} s1_model = Model( @@ -88,14 +86,14 @@ def c(hwthread_id: int) -> FrozenSet[int]: s1_model, None, s1_implementations, s1_requirements) -s1_resources = Resources({'node001': {c(0), c(1), c(2), c(3)}}) +s1_resources = resources({'node001': [c(0), c(1), c(2), c(3)]}) s1_solution = { - Reference('macro'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('micro1'): Resources({'node001': {c(0), c(1)}}), - Reference('micro2'): Resources({'node001': {c(0), c(1)}}), - Reference('micro3'): Resources({'node001': {c(0)}})} + Reference('macro'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro1'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('micro2'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('micro3'): ResourceAssignment([onr('node001', 0)])} s2_model = Model( @@ -130,14 +128,14 @@ def c(hwthread_id: int) -> FrozenSet[int]: s2_model, None, s2_implementations, s2_requirements) -s2_resources = Resources( - {'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}}) +s2_resources = resources( + {'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)]}) s2_solution = { - Reference('macro'): Resources({'node001': {c(0)}}), - Reference('micro1'): Resources({'node001': {c(0), c(1), c(2)}}), - Reference('micro2'): Resources({'node002': {c(0), c(1)}})} + Reference('macro'): ResourceAssignment([onr('node001', 0)]), + Reference('micro1'): ResourceAssignment([onr('node001', {0, 1, 2})]), + 
Reference('micro2'): ResourceAssignment([onr('node002', {0, 1})])} s3_model = Model( @@ -176,16 +174,17 @@ def c(hwthread_id: int) -> FrozenSet[int]: s3_model, None, s3_implementations, s3_requirements) -s3_resources = Resources( - {'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}}) +s3_resources = resources( + {'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)]}) s3_solution = { - Reference('a'): Resources({'node001': {c(0)}}), - Reference('b1'): Resources( - {'node001': {c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}}), - Reference('b2'): Resources({'node001': {c(0), c(1)}}), - Reference('c'): Resources({'node001': {c(0), c(1), c(2), c(3)}})} + Reference('a'): ResourceAssignment([onr('node001', 0)]), + Reference('b1'): ResourceAssignment([ + onr('node001', 2), onr('node001', 3), onr('node002', 0), onr('node002', 1), + onr('node002', 2), onr('node002', 3)]), + Reference('b2'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('c'): ResourceAssignment([onr('node001', {0, 1, 2, 3})])} s4_model = Model( @@ -221,14 +220,14 @@ def c(hwthread_id: int) -> FrozenSet[int]: s4_model, None, s4_implementations, s4_requirements) -s4_resources = Resources( - {'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}}) +s4_resources = resources( + {'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)]}) s4_solution = { - Reference('macro1'): Resources({'node002': {c(0), c(1)}}), - Reference('macro2'): Resources({'node001': {c(0), c(1), c(2)}}), - Reference('micro'): Resources({'node001': {c(0), c(1), c(2)}})} + Reference('macro1'): ResourceAssignment([onr('node002', {0, 1})]), + Reference('macro2'): ResourceAssignment([onr('node001', {0, 1, 2})]), + Reference('micro'): ResourceAssignment([onr('node001', {0, 1, 2})])} s5_model = Model( @@ -270,19 +269,19 @@ def c(hwthread_id: int) -> FrozenSet[int]: s5_model, None, s5_implementations, s5_requirements) -s5_resources = Resources({ - 'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}, - 'node003': {c(0), c(1)}}) +s5_resources = resources({ + 'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)], + 'node003': [c(0), c(1)]}) # This is inefficient, as the models can all share resources. But repeater # is funny, and the algorithm cannot deal with it yet. It does give a valid # result with no overlap, so we'll accept that for the time being. 
s5_solution = { - Reference('init'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('macro'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('micro'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('repeater'): Resources({'node003': {c(0)}})} + Reference('init'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('repeater'): ResourceAssignment([onr('node003', 0)])} s6_model = Model( @@ -318,22 +317,22 @@ def c(hwthread_id: int) -> FrozenSet[int]: s6_model, None, s6_implementations, s6_requirements) -s6_resources = Resources({ - 'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}, - 'node003': {c(0), c(1), c(2), c(3)}, 'node004': {c(0), c(1), c(2), c(3)}, - 'node005': {c(0), c(1), c(2), c(3)}, 'node006': {c(0), c(1), c(2), c(3)} +s6_resources = resources({ + 'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)], + 'node003': [c(0), c(1), c(2), c(3)], 'node004': [c(0), c(1), c(2), c(3)], + 'node005': [c(0), c(1), c(2), c(3)], 'node006': [c(0), c(1), c(2), c(3)] }) s6_solution = { - Reference('a'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('tcf'): Resources({'node002': {c(0)}}), - Reference('b'): Resources({ - 'node002': {c(1), c(2), c(3)}, - 'node003': {c(0), c(1), c(2), c(3)}, - 'node004': {c(0), c(1), c(2), c(3)}, - 'node005': {c(0), c(1), c(2), c(3)}, - 'node006': {c(0)}})} + Reference('a'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('tcf'): ResourceAssignment([onr('node002', 0)]), + Reference('b'): ResourceAssignment([ + onr('node002', 1), onr('node002', 2), onr('node002', 3), onr('node003', 0), + onr('node003', 1), onr('node003', 2), onr('node003', 3), onr('node004', 0), + onr('node004', 1), onr('node004', 2), onr('node004', 3), onr('node005', 0), + onr('node005', 1), onr('node005', 2), onr('node005', 3), onr('node006', 0)]) + } s7_model = Model( @@ -374,47 +373,70 @@ def c(hwthread_id: int) -> FrozenSet[int]: s7_model, None, s7_implementations, s7_requirements) -s7_resources = Resources({ - 'node001': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, - 'node002': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, - 'node003': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, - 'node004': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, - 'node005': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, +s7_resources = resources({ + 'node001': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node002': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node003': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node004': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node005': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], }) s7_solution = { - Reference('mc'): Resources({'node001': {c(0)}}), - Reference('init[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('init[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), - Reference('init[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('init[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), - Reference('init[4]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), - Reference('init[5]'): Resources({'node003': {c(4), c(5), c(6), c(7)}}), - Reference('init[6]'): Resources({'node004': {c(0), c(1), c(2), c(3)}}), - Reference('init[7]'): Resources({'node004': {c(4), c(5), c(6), c(7)}}), - 
Reference('init[8]'): Resources({'node005': {c(0), c(1), c(2), c(3)}}), - Reference('init[9]'): Resources({'node005': {c(4), c(5), c(6), c(7)}}), - Reference('macro[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('macro[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), - Reference('macro[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('macro[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), - Reference('macro[4]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), - Reference('macro[5]'): Resources({'node003': {c(4), c(5), c(6), c(7)}}), - Reference('macro[6]'): Resources({'node004': {c(0), c(1), c(2), c(3)}}), - Reference('macro[7]'): Resources({'node004': {c(4), c(5), c(6), c(7)}}), - Reference('macro[8]'): Resources({'node005': {c(0), c(1), c(2), c(3)}}), - Reference('macro[9]'): Resources({'node005': {c(4), c(5), c(6), c(7)}}), - Reference('micro[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('micro[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), - Reference('micro[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('micro[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), - Reference('micro[4]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), - Reference('micro[5]'): Resources({'node003': {c(4), c(5), c(6), c(7)}}), - Reference('micro[6]'): Resources({'node004': {c(0), c(1), c(2), c(3)}}), - Reference('micro[7]'): Resources({'node004': {c(4), c(5), c(6), c(7)}}), - Reference('micro[8]'): Resources({'node005': {c(0), c(1), c(2), c(3)}}), - Reference('micro[9]'): Resources({'node005': {c(4), c(5), c(6), c(7)}})} + Reference('mc'): ResourceAssignment([onr('node001', 0)]), + + Reference('init[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('init[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('init[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('init[3]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('init[4]'): ResourceAssignment([onr('node003', {0, 1, 2, 3})]), + Reference('init[5]'): ResourceAssignment([onr('node003', {4, 5, 6, 7})]), + Reference('init[6]'): ResourceAssignment([onr('node004', {0, 1, 2, 3})]), + Reference('init[7]'): ResourceAssignment([onr('node004', {4, 5, 6, 7})]), + Reference('init[8]'): ResourceAssignment([onr('node005', {0, 1, 2, 3})]), + Reference('init[9]'): ResourceAssignment([onr('node005', {4, 5, 6, 7})]), + + Reference('macro[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('macro[3]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('macro[4]'): ResourceAssignment([onr('node003', {0, 1, 2, 3})]), + Reference('macro[5]'): ResourceAssignment([onr('node003', {4, 5, 6, 7})]), + Reference('macro[6]'): ResourceAssignment([onr('node004', {0, 1, 2, 3})]), + Reference('macro[7]'): ResourceAssignment([onr('node004', {4, 5, 6, 7})]), + Reference('macro[8]'): ResourceAssignment([onr('node005', {0, 1, 2, 3})]), + Reference('macro[9]'): ResourceAssignment([onr('node005', {4, 5, 6, 7})]), + + Reference('micro[0]'): ResourceAssignment([ + onr('node001', 0), onr('node001', 1), onr('node001', 2), + onr('node001', 3)]), + Reference('micro[1]'): ResourceAssignment([ + onr('node001', 4), onr('node001', 5), onr('node001', 6), + onr('node001', 7)]), + Reference('micro[2]'): ResourceAssignment([ + 
onr('node002', 0), onr('node002', 1), onr('node002', 2), + onr('node002', 3)]), + Reference('micro[3]'): ResourceAssignment([ + onr('node002', 4), onr('node002', 5), onr('node002', 6), + onr('node002', 7)]), + Reference('micro[4]'): ResourceAssignment([ + onr('node003', 0), onr('node003', 1), onr('node003', 2), + onr('node003', 3)]), + Reference('micro[5]'): ResourceAssignment([ + onr('node003', 4), onr('node003', 5), onr('node003', 6), + onr('node003', 7)]), + Reference('micro[6]'): ResourceAssignment([ + onr('node004', 0), onr('node004', 1), onr('node004', 2), + onr('node004', 3)]), + Reference('micro[7]'): ResourceAssignment([ + onr('node004', 4), onr('node004', 5), onr('node004', 6), + onr('node004', 7)]), + Reference('micro[8]'): ResourceAssignment([ + onr('node005', 0), onr('node005', 1), onr('node005', 2), + onr('node005', 3)]), + Reference('micro[9]'): ResourceAssignment([ + onr('node005', 4), onr('node005', 5), onr('node005', 6), + onr('node005', 7)])} s8_model = Model( @@ -451,14 +473,14 @@ def c(hwthread_id: int) -> FrozenSet[int]: s8_model, None, s8_implementations, s8_requirements) -s8_resources = Resources( - {'node001': {c(0), c(1), c(2), c(3)}, 'node002': {c(0), c(1), c(2), c(3)}}) +s8_resources = resources( + {'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)]}) s8_solution = { - Reference('macro'): Resources({'node001': {c(3)}}), - Reference('micro1'): Resources({'node001': {c(0), c(1), c(2)}}), - Reference('micro2'): Resources({'node001': {c(0), c(1)}})} + Reference('macro'): ResourceAssignment([onr('node001', 3)]), + Reference('micro1'): ResourceAssignment([onr('node001', {0, 1, 2})]), + Reference('micro2'): ResourceAssignment([onr('node001', {0, 1})])} s9_model = Model( @@ -500,15 +522,15 @@ def c(hwthread_id: int) -> FrozenSet[int]: s9_model, None, s9_implementations, s9_requirements) -s9_resources = Resources({'node001': {c(0), c(1), c(2), c(3)}}) +s9_resources = resources({'node001': [c(0), c(1), c(2), c(3)]}) s9_solution = { - Reference('a'): Resources({'node001': {c(1)}}), - Reference('b'): Resources({'node001': {c(0)}}), - Reference('c'): Resources({'node001': {c(0)}}), - Reference('d'): Resources({'node001': {c(1)}}), - Reference('e'): Resources({'node001': {c(0)}})} + Reference('a'): ResourceAssignment([onr('node001', 1)]), + Reference('b'): ResourceAssignment([onr('node001', 0)]), + Reference('c'): ResourceAssignment([onr('node001', 0)]), + Reference('d'): ResourceAssignment([onr('node001', 1)]), + Reference('e'): ResourceAssignment([onr('node001', 0)])} s10_model = Model( @@ -552,38 +574,40 @@ def c(hwthread_id: int) -> FrozenSet[int]: s10_model, None, s10_implementations, s10_requirements) -s10_resources = Resources({ - 'node001': { +s10_resources = resources({ + 'node001': [ c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7), - c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)}, - 'node002': { + c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)], + 'node002': [ c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7), - c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)}, - 'node003': { + c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)], + 'node003': [ c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7), - c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)}, + c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)], }) s10_solution = { - Reference('mc'): Resources({'node001': {c(0)}}), - Reference('rr'): Resources({'node001': {c(0)}}), - Reference('macro[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('macro[1]'): 
Resources({'node001': {c(4), c(5), c(6), c(7)}}), - Reference('macro[2]'): Resources({'node001': {c(8), c(9), c(10), c(11)}}), - Reference('macro[3]'): Resources({'node001': {c(12), c(13), c(14), c(15)}}), - Reference('macro[4]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('macro[5]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), - Reference('macro[6]'): Resources({'node002': {c(8), c(9), c(10), c(11)}}), - Reference('macro[7]'): Resources({'node002': {c(12), c(13), c(14), c(15)}}), - Reference('micro[0]'): Resources({'node001': {c(0), c(1)}}), - Reference('micro[1]'): Resources({'node001': {c(4), c(5)}}), - Reference('micro[2]'): Resources({'node001': {c(8), c(9)}}), - Reference('micro[3]'): Resources({'node001': {c(12), c(13)}}), - Reference('micro[4]'): Resources({'node002': {c(0), c(1)}}), - Reference('micro[5]'): Resources({'node002': {c(4), c(5)}}), - Reference('micro[6]'): Resources({'node002': {c(8), c(9)}}), - Reference('micro[7]'): Resources({'node002': {c(12), c(13)}})} + Reference('mc'): ResourceAssignment([onr('node001', 0)]), + Reference('rr'): ResourceAssignment([onr('node001', 0)]), + + Reference('macro[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro[2]'): ResourceAssignment([onr('node001', {8, 9, 10, 11})]), + Reference('macro[3]'): ResourceAssignment([onr('node001', {12, 13, 14, 15})]), + Reference('macro[4]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('macro[5]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('macro[6]'): ResourceAssignment([onr('node002', {8, 9, 10, 11})]), + Reference('macro[7]'): ResourceAssignment([onr('node002', {12, 13, 14, 15})]), + + Reference('micro[0]'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('micro[1]'): ResourceAssignment([onr('node001', {4, 5})]), + Reference('micro[2]'): ResourceAssignment([onr('node001', {8, 9})]), + Reference('micro[3]'): ResourceAssignment([onr('node001', {12, 13})]), + Reference('micro[4]'): ResourceAssignment([onr('node002', {0, 1})]), + Reference('micro[5]'): ResourceAssignment([onr('node002', {4, 5})]), + Reference('micro[6]'): ResourceAssignment([onr('node002', {8, 9})]), + Reference('micro[7]'): ResourceAssignment([onr('node002', {12, 13})])} s11_model = Model( @@ -622,26 +646,25 @@ def c(hwthread_id: int) -> FrozenSet[int]: s11_config = Configuration(s11_model, None, s11_implementations, s11_requirements) -s11_resources = Resources({ - 'node001': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, - 'node002': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, +s11_resources = resources({ + 'node001': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node002': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], }) s11_solution = { - Reference('macro1[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('macro1[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), - Reference('macro1[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('micro1[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('micro1[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), - Reference('micro1[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('macro2[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('macro2[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), - Reference('macro2[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('micro2[0]'): 
Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('micro2[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), - Reference('micro2[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - } + Reference('macro1[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro1[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro1[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('micro1[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro1[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('micro1[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('macro2[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro2[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro2[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('micro2[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro2[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('micro2[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})])} s12_model = deepcopy(s11_model) @@ -663,16 +686,16 @@ def c(hwthread_id: int) -> FrozenSet[int]: s12_solution = { - Reference('macro1'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('micro1[0]'): Resources({'node001': { - c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}}), - Reference('micro1[1]'): Resources({'node002': { - c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}}), - Reference('macro2'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('micro2[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('micro2[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), - Reference('micro2[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('micro2[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), + Reference('macro1'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro1[0]'): ResourceAssignment([ + onr('node001', {0, 1, 2, 3, 4, 5, 6, 7})]), + Reference('micro1[1]'): ResourceAssignment([ + onr('node002', {0, 1, 2, 3, 4, 5, 6, 7})]), + Reference('macro2'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro2[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro2[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('micro2[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('micro2[3]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), } @@ -694,59 +717,59 @@ def c(hwthread_id: int) -> FrozenSet[int]: s13_config = Configuration(s13_model, None, s11_implementations, s13_requirements) -s13_resources = Resources({ - 'node001': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, - 'node002': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, - 'node003': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, - 'node004': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, - 'node005': {c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)}, +s13_resources = resources({ + 'node001': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node002': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node003': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node004': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node005': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], }) s13_solution = { - Reference('macro1[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('macro1[1]'): Resources({'node001': {c(4), c(5), 
c(6), c(7)}}), - Reference('macro1[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('macro1[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), - Reference('macro1[4]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), - - Reference('micro1[0][0]'): Resources({'node001': {c(0), c(1)}}), - Reference('micro1[0][1]'): Resources({'node001': {c(2), c(3)}}), - Reference('micro1[0][2]'): Resources({'node003': {c(4), c(5)}}), - Reference('micro1[0][3]'): Resources({'node003': {c(6), c(7)}}), - Reference('micro1[1][0]'): Resources({'node001': {c(4), c(5)}}), - Reference('micro1[1][1]'): Resources({'node001': {c(6), c(7)}}), - Reference('micro1[1][2]'): Resources({'node004': {c(0), c(1)}}), - Reference('micro1[1][3]'): Resources({'node004': {c(2), c(3)}}), - Reference('micro1[2][0]'): Resources({'node002': {c(0), c(1)}}), - Reference('micro1[2][1]'): Resources({'node002': {c(2), c(3)}}), - Reference('micro1[2][2]'): Resources({'node004': {c(4), c(5)}}), - Reference('micro1[2][3]'): Resources({'node004': {c(6), c(7)}}), - Reference('micro1[3][0]'): Resources({'node002': {c(4), c(5)}}), - Reference('micro1[3][1]'): Resources({'node002': {c(6), c(7)}}), - Reference('micro1[3][2]'): Resources({'node005': {c(0), c(1)}}), - Reference('micro1[3][3]'): Resources({'node005': {c(2), c(3)}}), - Reference('micro1[4][0]'): Resources({'node003': {c(0), c(1)}}), - Reference('micro1[4][1]'): Resources({'node003': {c(2), c(3)}}), - Reference('micro1[4][2]'): Resources({'node005': {c(4), c(5)}}), - Reference('micro1[4][3]'): Resources({'node005': {c(6), c(7)}}), - - Reference('macro2[0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('macro2[1]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), - Reference('macro2[2]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('macro2[3]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), - Reference('macro2[4]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), - - Reference('micro2[0][0]'): Resources({'node001': {c(0), c(1), c(2), c(3)}}), - Reference('micro2[0][1]'): Resources({'node003': {c(4), c(5), c(6), c(7)}}), - Reference('micro2[1][0]'): Resources({'node001': {c(4), c(5), c(6), c(7)}}), - Reference('micro2[1][1]'): Resources({'node004': {c(0), c(1), c(2), c(3)}}), - Reference('micro2[2][0]'): Resources({'node002': {c(0), c(1), c(2), c(3)}}), - Reference('micro2[2][1]'): Resources({'node004': {c(4), c(5), c(6), c(7)}}), - Reference('micro2[3][0]'): Resources({'node002': {c(4), c(5), c(6), c(7)}}), - Reference('micro2[3][1]'): Resources({'node005': {c(0), c(1), c(2), c(3)}}), - Reference('micro2[4][0]'): Resources({'node003': {c(0), c(1), c(2), c(3)}}), - Reference('micro2[4][1]'): Resources({'node005': {c(4), c(5), c(6), c(7)}}), + Reference('macro1[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro1[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro1[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('macro1[3]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('macro1[4]'): ResourceAssignment([onr('node003', {0, 1, 2, 3})]), + + Reference('micro1[0][0]'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('micro1[0][1]'): ResourceAssignment([onr('node001', {2, 3})]), + Reference('micro1[0][2]'): ResourceAssignment([onr('node003', {4, 5})]), + Reference('micro1[0][3]'): ResourceAssignment([onr('node003', {6, 7})]), + Reference('micro1[1][0]'): ResourceAssignment([onr('node001', {4, 5})]), + Reference('micro1[1][1]'): 
ResourceAssignment([onr('node001', {6, 7})]), + Reference('micro1[1][2]'): ResourceAssignment([onr('node004', {0, 1})]), + Reference('micro1[1][3]'): ResourceAssignment([onr('node004', {2, 3})]), + Reference('micro1[2][0]'): ResourceAssignment([onr('node002', {0, 1})]), + Reference('micro1[2][1]'): ResourceAssignment([onr('node002', {2, 3})]), + Reference('micro1[2][2]'): ResourceAssignment([onr('node004', {4, 5})]), + Reference('micro1[2][3]'): ResourceAssignment([onr('node004', {6, 7})]), + Reference('micro1[3][0]'): ResourceAssignment([onr('node002', {4, 5})]), + Reference('micro1[3][1]'): ResourceAssignment([onr('node002', {6, 7})]), + Reference('micro1[3][2]'): ResourceAssignment([onr('node005', {0, 1})]), + Reference('micro1[3][3]'): ResourceAssignment([onr('node005', {2, 3})]), + Reference('micro1[4][0]'): ResourceAssignment([onr('node003', {0, 1})]), + Reference('micro1[4][1]'): ResourceAssignment([onr('node003', {2, 3})]), + Reference('micro1[4][2]'): ResourceAssignment([onr('node005', {4, 5})]), + Reference('micro1[4][3]'): ResourceAssignment([onr('node005', {6, 7})]), + + Reference('macro2[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro2[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro2[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('macro2[3]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('macro2[4]'): ResourceAssignment([onr('node003', {0, 1, 2, 3})]), + + Reference('micro2[0][0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro2[0][1]'): ResourceAssignment([onr('node003', {4, 5, 6, 7})]), + Reference('micro2[1][0]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('micro2[1][1]'): ResourceAssignment([onr('node004', {0, 1, 2, 3})]), + Reference('micro2[2][0]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('micro2[2][1]'): ResourceAssignment([onr('node004', {4, 5, 6, 7})]), + Reference('micro2[3][0]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('micro2[3][1]'): ResourceAssignment([onr('node005', {0, 1, 2, 3})]), + Reference('micro2[4][0]'): ResourceAssignment([onr('node003', {0, 1, 2, 3})]), + Reference('micro2[4][1]'): ResourceAssignment([onr('node005', {4, 5, 6, 7})]), } @@ -782,7 +805,7 @@ def c(hwthread_id: int) -> FrozenSet[int]: s14_model, None, s14_implementations, s14_requirements) -s14_resources = Resources({'node001': {c(0), c(1), c(2), c(3), c(4), c(5)}}) +s14_resources = resources({'node001': [c(0), c(1), c(2), c(3), c(4), c(5)]}) s14_solution = RuntimeError @@ -829,16 +852,20 @@ def test_scenarios(scenario: _Scenario) -> None: if isinstance(req, ThreadedResReq): for instance in component.instances(): - assert len(list(allocations[instance].nodes())) == 1 - assert allocations[instance].total_cores() == req.threads + assert len(allocations[instance].by_rank) == 1 + assert allocations[instance].by_rank[0].total_cores() == req.threads elif isinstance(req, MPICoresResReq): for instance in component.instances(): - tcores = allocations[instance].total_cores() - assert tcores == req.mpi_processes + nranks = len(allocations[instance].by_rank) + assert nranks == req.mpi_processes + for r in range(nranks): + assert allocations[instance].by_rank[r].total_cores() == 1 # check for any overlapping instances - for instance1, res1 in allocations.items(): - for instance2, res2 in allocations.items(): + for instance1, res_asm1 in allocations.items(): + for instance2, res_asm2 in allocations.items(): + res1 
= res_asm1.as_resources() + res2 = res_asm2.as_resources() cname1 = instance1.without_trailing_ints() cname2 = instance2.without_trailing_ints() if cname1 != cname2: diff --git a/libmuscle/python/libmuscle/planner/test/test_resources.py b/libmuscle/python/libmuscle/planner/test/test_resources.py new file mode 100644 index 00000000..f0158850 --- /dev/null +++ b/libmuscle/python/libmuscle/planner/test/test_resources.py @@ -0,0 +1,435 @@ +from copy import copy + +import pytest + +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources, Resources + + +@pytest.fixture +def c1(): + return Core(0, {0, 1}) + + +def test_core_equals(c1): + c2 = Core(0, {0, 1}) + c3 = Core(1, {0, 1}) + c4 = Core(0, {2, 3}) + + assert c1 == c2 + assert not c1 != c2 + assert c1 != c3 + assert c1 != c4 + assert c3 != c4 + + +def test_core_length(c1): + assert len(c1) == 2 + + c2 = Core(1, {4, 5, 6, 7}) + assert len(c2) == 4 + + +def test_core_copy(c1): + c2 = copy(c1) + assert c2.cid == 0 + assert c2.hwthreads == {0, 1} + + c2.hwthreads.add(2) + assert c1.hwthreads == {0, 1} + assert c2.hwthreads == {0, 1, 2} + + +def test_core_union(): + c1 = Core(3, {3}) + c2 = Core(3, {4}) + + assert c1 | c2 == Core(3, {3, 4}) + + c3 = Core(2, {2}) + with pytest.raises(ValueError): + c1 | c3 + + +def test_core_union_onto(c1): + c2 = Core(0, {2, 3}) + + c1 |= c2 + assert c1.hwthreads == {0, 1, 2, 3} + assert c2.hwthreads == {2, 3} + + c3 = Core(3, {6, 7}) + with pytest.raises(ValueError): + c1 |= c3 + + +def test_core_subtract(): + c1 = Core(0, {0, 1, 2, 3}) + c2 = Core(0, {0, 3}) + + c1 -= c2 + assert c1.cid == 0 + assert c1.hwthreads == {1, 2} + + c3 = Core(0, {2, 3}) + c1 -= c3 + assert c1.cid == 0 + assert c1.hwthreads == {1} + + c4 = Core(1, {1, 2}) + with pytest.raises(ValueError): + c1 -= c4 + + +def test_core_isdisjoint(c1): + c2 = Core(0, {0}) + c3 = Core(0, {2, 3}) + c4 = Core(1, {0, 1}) + + assert not c1.isdisjoint(c2) + assert not c2.isdisjoint(c1) + assert c1.isdisjoint(c3) + + with pytest.raises(ValueError): + c1.isdisjoint(c4) + + +def test_core_str(c1): + assert str(c1) == '0(0,1)' + + +def test_core_repr(c1): + assert repr(c1) == 'Core(0, {0,1})' + + +@pytest.fixture +def cs1(): + return CoreSet([Core(0, {0, 1}), Core(1, {2, 3})]) + + +def test_core_set_equals(cs1): + cs2 = CoreSet([Core(0, {0, 1}), Core(1, {2, 3})]) + cs3 = CoreSet([Core(1, {2, 3})]) + cs4 = CoreSet([]) + cs5 = CoreSet([Core(0, {0, 1}), Core(1, {2, 3}), Core(2, {4, 5})]) + cs6 = CoreSet([Core(3, {6, 7})]) + + assert cs1 == cs2 + assert not cs1 != cs2 + assert cs1 != cs3 + assert cs1 != cs4 + assert cs1 != cs5 + assert cs1 != cs6 + assert not cs3 == cs4 + assert cs4 != cs5 + + +def test_core_set_length(cs1): + cs2 = CoreSet([]) + cs3 = CoreSet([Core(3, {6, 7})]) + + assert len(cs1) == 2 + assert len(cs2) == 0 + assert len(cs3) == 1 + + +def test_core_set_iter(cs1): + for i, core in enumerate(cs1): + assert i == core.cid + assert core.hwthreads == {i * 2, i * 2 + 1} + + assert i == 1 + + +def test_core_set_copy(cs1): + cs2 = copy(cs1) + assert cs1 == cs2 + + cs2._cores[2] = Core(2, {4, 5}) + assert len(cs1._cores) == 2 + + cs2._cores[0].hwthreads.add(2) + assert 2 not in cs1._cores[0].hwthreads + + +def test_core_set_union_onto(cs1): + cs2 = CoreSet([Core(3, {6, 7})]) + cs1 |= cs2 + + assert len(cs1) == 3 + assert 0 in cs1._cores + assert cs1._cores[0].cid == 0 + assert cs1._cores[0].hwthreads == {0, 1} + assert 1 in cs1._cores + assert cs1._cores[1].cid == 1 + assert cs1._cores[1].hwthreads == {2, 3} + assert 3 in cs1._cores + assert 
cs1._cores[3].cid == 3 + assert cs1._cores[3].hwthreads == {6, 7} + + assert id(cs1._cores[3]) != id(cs2._cores[3]) + assert id(cs1._cores[3].hwthreads) != id(cs2._cores[3].hwthreads) + + +def test_core_set_subtract_disjunct(cs1): + cs2 = CoreSet([Core(3, {6, 7})]) + cs1 -= cs2 + + assert len(cs1) == 2 + assert 0 in cs1._cores + assert 1 in cs1._cores + + assert len(cs2) == 1 + assert 3 in cs2._cores + + +def test_core_set_subtract_whole_core(cs1): + cs2 = CoreSet([Core(0, {0, 1})]) + cs1 -= cs2 + + assert len(cs1) == 1 + assert 0 not in cs1._cores + assert 1 in cs1._cores + + assert len(cs2) == 1 + assert 0 in cs2._cores + + +def test_core_set_subtract_threads(cs1): + cs2 = CoreSet([Core(1, {2})]) + i1 = id(cs1._cores[1]) + + cs1 -= cs2 + + assert len(cs1) == 2 + assert 0 in cs1._cores + assert 1 in cs1._cores + assert id(cs1._cores[1]) == i1 + assert len(cs1._cores[1]) == 1 + assert cs1._cores[1].hwthreads == {3} + assert cs1._cores[0].hwthreads == {0, 1} + + +def test_core_set_str(cs1): + assert str(cs1) == '0-1(0-3)' + + +def test_core_set_repr(cs1): + assert repr(cs1) == 'CoreSet({Core(0, {0,1}), Core(1, {2,3})})' + + +def test_core_set_get_first_cores(cs1): + assert cs1.get_first_cores(0)._cores == {} + assert cs1.get_first_cores(1)._cores == {0: Core(0, {0, 1})} + assert cs1.get_first_cores(2)._cores == { + 0: Core(0, {0, 1}), + 1: Core(1, {2, 3})} + with pytest.raises(RuntimeError): + cs1.get_first_cores(3) + + +@pytest.fixture +def n1(cs1): + return OnNodeResources('node001', cs1) + + +def test_node_resources_equals(n1): + n2 = OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})])) + n3 = OnNodeResources('node002', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})])) + n4 = OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(1, {4, 3})])) + + assert n1 == n2 + assert n1 != n3 + assert n1 != n4 + + +def test_node_resources_copy(n1): + n2 = copy(n1) + + assert n1 == n2 + assert id(n1.cpu_cores) != id(n2.cpu_cores) + assert id(n1.cpu_cores._cores[0]) != id(n2.cpu_cores._cores[0]) + assert id(n1.cpu_cores._cores[1].hwthreads) != id(n2.cpu_cores._cores[1].hwthreads) + + +def test_node_resources_union_onto(n1): + n2 = OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(4, {8, 9, 10, 11})])) + n3 = OnNodeResources('node001', CoreSet([Core(0, {3})])) + n4 = OnNodeResources('node002', CoreSet([Core(3, {3})])) + + n1 |= n2 + + assert len(n1.cpu_cores) == 3 + assert id(n1.cpu_cores._cores[4]) != id(n2.cpu_cores._cores[4]) + + n1 |= n3 + + assert len(n1.cpu_cores) == 3 + assert n1.cpu_cores._cores[0].hwthreads == {0, 1, 3} + + with pytest.raises(ValueError): + n1 |= n4 + + +def test_node_resources_hwthreads(n1): + assert list(n1.hwthreads()) == [0, 1, 2, 3] + + +def test_node_resources_subtract(n1): + n2 = OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(4, {8, 9, 10, 11})])) + n3 = OnNodeResources('node001', CoreSet([Core(1, {3})])) + n4 = OnNodeResources('node002', CoreSet([Core(3, {3})])) + + n1 -= n2 + + assert len(n1.cpu_cores) == 1 + assert len(n1.cpu_cores._cores[1]) == 2 + + n1 -= n3 + + assert len(n1.cpu_cores) == 1 + assert len(n1.cpu_cores._cores[1]) == 1 + + with pytest.raises(ValueError): + n1 -= n4 + + +@pytest.fixture +def r1(n1): + return Resources([n1]) + + +def test_resources_length(r1, n1): + r2 = Resources([n1, OnNodeResources('node002', CoreSet([Core(0, {0, 1})]))]) + + assert len(r1) == 1 + assert len(r2) == 2 + + +def test_resources_iter(cs1, n1): + n2 = OnNodeResources('node004', cs1) + n3 = OnNodeResources('node002', CoreSet([Core(3, 
{3})])) + nodes = [n1, n2, n3] + res = Resources(nodes) + + for i, n in enumerate(res): + assert n == nodes[i] + + +def test_resources_equals(r1): + assert r1 == Resources( + [OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})]))]) + + r2 = Resources( + [OnNodeResources('node002', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})]))]) + assert r1 != r2 + + r3 = Resources( + [OnNodeResources( + 'node001', CoreSet([Core(0, {0, 1}), Core(1, {1, 2, 3})]))]) + assert r1 != r3 + + r4 = Resources([OnNodeResources('node001', CoreSet([Core(1, {1, 2})]))]) + assert r1 != r4 + + r5 = Resources([ + OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})])), + OnNodeResources('node002', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})])) + ]) + assert r1 != r5 + + +def test_resources_copy(r1): + r2 = copy(r1) + assert id(r2._nodes['node001']) != id(r1._nodes['node001']) + assert id(r2._nodes['node001'].cpu_cores) != id(r1._nodes['node001'].cpu_cores) + + +def test_resources_union_onto(r1): + r2 = Resources([]) + r2 |= r1 + assert r2 == r1 + + r3 = Resources([OnNodeResources('node002', CoreSet([Core(0, {0})]))]) + r3 |= r1 + assert len(r3._nodes) == 2 + assert id(r3._nodes['node001']) != id(r1._nodes['node001']) + assert sorted(r3._nodes.keys()) == ['node001', 'node002'] + + +def test_resources_subtract(r1): + r2 = Resources([]) + r2 -= r1 + assert len(r2._nodes) == 0 + + r1 -= r2 + assert len(r1._nodes) == 1 + + r3 = Resources([OnNodeResources('node001', CoreSet([Core(0, {0})]))]) + r1 -= r3 + assert len(r1._nodes) == 1 + assert r1._nodes['node001'].cpu_cores._cores[0].hwthreads == {1} + + +def test_resources_nodes(): + r1 = Resources([ + OnNodeResources('node001', CoreSet([Core(0, {0})])), + OnNodeResources('node003', CoreSet([Core(1, {1})])), + OnNodeResources('node004', CoreSet([Core(2, {2})]))]) + + assert sorted(r1.nodes()) == ['node001', 'node003', 'node004'] + + +def test_resources_total_cores(): + r1 = Resources([ + OnNodeResources('node001', CoreSet([Core(0, {0, 1})])), + OnNodeResources('node003', CoreSet([Core(1, {1}), Core(5, {5})])), + OnNodeResources('node004', CoreSet([Core(2, {2})]))]) + + assert r1.total_cores() == 4 + + +def test_resource_hwthreads(n1, r1): + hwthreads = list(r1.hwthreads()) + assert hwthreads == [('node001', 0), ('node001', 1), ('node001', 2), ('node001', 3)] + + n2 = OnNodeResources('node007', CoreSet([Core(7, {7}), Core(3, {3})])) + res = Resources([n1, n2]) + + hwthreads = list(res.hwthreads()) + assert hwthreads == [ + ('node001', 0), ('node001', 1), ('node001', 2), ('node001', 3), + ('node007', 7), ('node007', 3)] + + +def test_resources_isdisjoint(r1): + r2 = Resources([]) + assert r1.isdisjoint(r2) + + r3 = Resources([OnNodeResources('node001', CoreSet([Core(0, {0})]))]) + assert not r1.isdisjoint(r3) + + r4 = Resources([OnNodeResources('node001', CoreSet([Core(0, {2})]))]) + assert r1.isdisjoint(r4) + + r5 = Resources([OnNodeResources('node002', CoreSet([Core(0, {0})]))]) + assert r1.isdisjoint(r5) + + +def test_resources_union(r1): + r2 = Resources([]) + r3 = Resources([OnNodeResources('node001', CoreSet([Core(0, {0})]))]) + r4 = Resources([OnNodeResources('node001', CoreSet([Core(0, {2})]))]) + r5 = Resources([OnNodeResources('node002', CoreSet([Core(0, {0})]))]) + + assert Resources.union([r1, r2]) == r1 + assert Resources.union([r1, r3]) == r1 + assert Resources.union([r1, r4]) == Resources([ + OnNodeResources('node001', CoreSet([Core(0, {0, 1, 2}), Core(1, {2, 3})]))]) + + assert Resources.union([r1, r5]) == Resources([ + 
OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})])), + OnNodeResources('node002', CoreSet([Core(0, {0})]))]) + + assert Resources.union([r1, r2, r3, r4, r5]) == Resources([ + OnNodeResources('node001', CoreSet([Core(0, {0, 1, 2}), Core(1, {2, 3})])), + OnNodeResources('node002', CoreSet([Core(0, {0})]))]) diff --git a/libmuscle/python/libmuscle/test/conftest.py b/libmuscle/python/libmuscle/test/conftest.py index 3215517f..b64a51e3 100644 --- a/libmuscle/python/libmuscle/test/conftest.py +++ b/libmuscle/python/libmuscle/test/conftest.py @@ -1,5 +1,6 @@ from copy import copy import pytest +from typing import Dict, List, Set, Union from unittest.mock import patch from ymmsl import Operator, Reference, Settings @@ -8,6 +9,7 @@ from libmuscle.communicator import Message from libmuscle.mcp.transport_client import ProfileData from libmuscle.mmp_client import MMPClient +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources, Resources from libmuscle.port import Port from libmuscle.profiler import Profiler from libmuscle.timestamp import Timestamp @@ -101,10 +103,20 @@ def port_exists(name): return port_manager -def frozenset_of(*args): - """Create a frozenset containing the arguments. +def core(hwthread_id: int) -> Core: + """Helper that defines a core with the given core and hwthread id.""" + return Core(hwthread_id, {hwthread_id}) - This is a helper to shorten notation used in some of the planning and - launching-related tests. - """ - return frozenset(args) + +def on_node_resources(node_name: str, cores: Union[int, Set[int]]) -> OnNodeResources: + """Helper that defines resources on a node from the name and a CPU core.""" + if isinstance(cores, int): + cores = {cores} + return OnNodeResources(node_name, CoreSet([Core(core, {core}) for core in cores])) + + +def resources(node_resources: Dict[str, List[Core]]) -> Resources: + """Helper that defines a Resources from a dict.""" + return Resources([ + OnNodeResources(node_name, CoreSet(cores)) + for node_name, cores in node_resources.items()]) diff --git a/muscle3/muscle3.py b/muscle3/muscle3.py index c4f39af1..b1373bc9 100644 --- a/muscle3/muscle3.py +++ b/muscle3/muscle3.py @@ -8,8 +8,8 @@ from ymmsl import PartialConfiguration -from libmuscle.planner.planner import ( - Planner, Resources, InsufficientResourcesAvailable) +from libmuscle.planner.planner import Planner, InsufficientResourcesAvailable +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources, Resources from libmuscle.snapshot_manager import SnapshotManager from muscle3.profiling import ( plot_instances, plot_resources, plot_timeline, show_plots) @@ -138,8 +138,9 @@ def resources( click.echo(_RESOURCES_INCOMPLETE_MODEL, err=True) sys.exit(1) - resources = Resources({ - 'node000001': {frozenset([r]) for r in range(cores_per_node)}}) + resources = Resources([ + OnNodeResources( + 'node000001', CoreSet([Core(i, {i}) for i in range(cores_per_node)]))]) planner = Planner(resources) try: From c4ef9eba95d96c5fdb77453b133db0b7ea641152 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Mon, 30 Dec 2024 18:29:37 +0100 Subject: [PATCH 46/49] Add support for Intel MPI to the native instantiator --- integration_test/cluster_test/Makefile | 6 +- integration_test/cluster_test/conftest.py | 73 ++++++++++++------- .../cluster_test/double_intelmpi.sh | 12 +++ .../implementations_intelmpi.ymmsl | 9 +++ .../cluster_test/macro_micro_intelmpi.sh | 12 +++ integration_test/cluster_test/test_cluster.py | 4 +- integration_test/fake_cluster/Dockerfile | 8 +- 
integration_test/fake_cluster/old.Dockerfile | 8 +- .../native_instantiator/run_script.py | 45 +++++++++++- scripts/gmake/check_tools.make | 2 + 10 files changed, 139 insertions(+), 40 deletions(-) create mode 100755 integration_test/cluster_test/double_intelmpi.sh create mode 100644 integration_test/cluster_test/implementations_intelmpi.ymmsl create mode 100755 integration_test/cluster_test/macro_micro_intelmpi.sh diff --git a/integration_test/cluster_test/Makefile b/integration_test/cluster_test/Makefile index 4ef1fd9e..44f5e012 100644 --- a/integration_test/cluster_test/Makefile +++ b/integration_test/cluster_test/Makefile @@ -1,5 +1,5 @@ .PHONY: all -all: component_openmpi +all: component_$(MPI_TYPE) CXXFLAGS += $(shell pkg-config --cflags libmuscle_mpi ymmsl) @@ -7,6 +7,6 @@ LDLIBS += $(shell pkg-config --libs libmuscle_mpi ymmsl) CXXFLAGS += -g -component_openmpi: component.cpp - mpic++ -o $@ $(CXXFLAGS) $^ $(LDLIBS) +component_$(MPI_TYPE): component.cpp + mpicxx -o $@ $(CXXFLAGS) $^ $(LDLIBS) diff --git a/integration_test/cluster_test/conftest.py b/integration_test/cluster_test/conftest.py index 97b7b255..c0a65131 100644 --- a/integration_test/cluster_test/conftest.py +++ b/integration_test/cluster_test/conftest.py @@ -21,7 +21,7 @@ ])) # Shut down the containers after running the tests. Set to False to debug. -CLEAN_UP_CONTAINERS = True +CLEAN_UP_CONTAINERS = False skip_unless_cluster = pytest.mark.skipif( @@ -223,44 +223,65 @@ def _install_muscle3_native_openmpi( f'make distclean && ' f'PREFIX={prefix} make install"')) - return prefix, module_name + return 'openmpi', prefix, module_name + + +def _install_muscle3_native_intelmpi( + remote_source, remote_term, remote_fs): + prefix = remote_fs / REMOTE_SHARED / 'muscle3-intelmpi' + prefix.mkdir() + + module_name = 'intel-oneapi-mpi' + + run_cmd(remote_term, 600, ( + f'/bin/bash -l -c "' + f'module load {module_name} && ' + f'cd {remote_source} && ' + f'make distclean && ' + f'PREFIX={prefix} make install"')) + + return 'intelmpi', prefix, module_name def _install_muscle3(local_term, repo_root, remote_term, remote_fs, slurm_version): remote_source = _install_remote_source( local_term, repo_root, remote_fs, slurm_version) _create_muscle3_venv(remote_term, remote_source) - return _install_muscle3_native_openmpi( + openmpi_install = _install_muscle3_native_openmpi( remote_source, remote_term, remote_fs, slurm_version) + intelmpi_install = _install_muscle3_native_intelmpi( + remote_source, remote_term, remote_fs) + return openmpi_install, intelmpi_install -def _install_tests(repo_root, remote_term, remote_fs, remote_m3_openmpi): +def _install_tests(repo_root, remote_term, remote_fs, remote_m3_installs): remote_home = remote_fs / REMOTE_SHARED - remote_m3, openmpi_module = remote_m3_openmpi - cerulean.copy( - repo_root / 'integration_test' / 'cluster_test', remote_home, - copy_permissions=True) + for mpi_type, remote_m3, mpi_module in remote_m3_installs: + cerulean.copy( + repo_root / 'integration_test' / 'cluster_test', remote_home, + copy_permissions=True) - remote_source = remote_home / 'cluster_test' + remote_source = remote_home / 'cluster_test' - run_cmd(remote_term, 30, ( - '/bin/bash -c "' - f'sed -i \\"s^modules: openmpi^modules: {openmpi_module}^\\"' - f' {remote_source}/implementations_openmpi.ymmsl' - '"')) + if mpi_type == 'openmpi': + run_cmd(remote_term, 30, ( + '/bin/bash -c "' + f'sed -i \\"s^modules: openmpi^modules: {mpi_module}^\\"' + f' {remote_source}/implementations_openmpi.ymmsl' + '"')) - run_cmd(remote_term, 30, ( 
- '/bin/bash -c "' - f'sed -i \\"s^modules: openmpi^modules: {openmpi_module}^\\"' - f' {remote_source}/implementations_srunmpi.ymmsl' - '"')) + run_cmd(remote_term, 30, ( + '/bin/bash -c "' + f'sed -i \\"s^modules: openmpi^modules: {mpi_module}^\\"' + f' {remote_source}/implementations_srunmpi.ymmsl' + '"')) - run_cmd(remote_term, 30, ( - f'/bin/bash -l -c "' - f'module load {openmpi_module} && ' - f'. {remote_m3}/bin/muscle3.env && ' - f'make -C {remote_source}"')) + run_cmd(remote_term, 30, ( + f'/bin/bash -l -c "' + f'module load {mpi_module} && ' + f'. {remote_m3}/bin/muscle3.env && ' + f'make -C {remote_source} MPI_TYPE={mpi_type}"')) def _clean_up_base_cluster(local_term, slurm_version): @@ -285,9 +306,9 @@ def installed_cluster( remote_term, remote_fs, headnode_port = _start_base_cluster( local_term, request.param, local_shared_dir) - remote_m3_openmpi = _install_muscle3( + remote_m3_installs = _install_muscle3( local_term, repo_root, remote_term, remote_fs, slurm_version) - _install_tests(repo_root, remote_term, remote_fs, remote_m3_openmpi) + _install_tests(repo_root, remote_term, remote_fs, remote_m3_installs) yield headnode_port diff --git a/integration_test/cluster_test/double_intelmpi.sh b/integration_test/cluster_test/double_intelmpi.sh new file mode 100755 index 00000000..e6e47859 --- /dev/null +++ b/integration_test/cluster_test/double_intelmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/double.ymmsl $CT/settings.ymmsl $CT/implementations_intelmpi.ymmsl + diff --git a/integration_test/cluster_test/implementations_intelmpi.ymmsl b/integration_test/cluster_test/implementations_intelmpi.ymmsl new file mode 100644 index 00000000..b216138d --- /dev/null +++ b/integration_test/cluster_test/implementations_intelmpi.ymmsl @@ -0,0 +1,9 @@ +ymmsl_version: v0.1 + +implementations: + component_cpp: + modules: intel-oneapi-mpi + env: + +LD_LIBRARY_PATH: /home/cerulean/shared/muscle3-intelmpi/lib + execution_model: intelmpi + executable: /home/cerulean/shared/cluster_test/component_intelmpi diff --git a/integration_test/cluster_test/macro_micro_intelmpi.sh b/integration_test/cluster_test/macro_micro_intelmpi.sh new file mode 100755 index 00000000..77bec53a --- /dev/null +++ b/integration_test/cluster_test/macro_micro_intelmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/macro_micro.ymmsl $CT/settings.ymmsl $CT/implementations_intelmpi.ymmsl + diff --git a/integration_test/cluster_test/test_cluster.py b/integration_test/cluster_test/test_cluster.py index d8a52c67..81e02eaa 100644 --- a/integration_test/cluster_test/test_cluster.py +++ b/integration_test/cluster_test/test_cluster.py @@ -185,7 +185,7 @@ def test_multiple( @skip_unless_cluster @pytest.mark.parametrize('mode', ['local', 'slurm']) -@pytest.mark.parametrize('execution_model', ['openmpi', 'srunmpi']) +@pytest.mark.parametrize('execution_model', ['openmpi', 'intelmpi', 'srunmpi']) def test_double( fake_cluster, remote_test_files, remote_out_dir, hwthread_to_core, mode, execution_model): @@ -219,7 +219,7 @@ def test_double( @skip_unless_cluster @pytest.mark.parametrize('mode', ['local', 'slurm']) -@pytest.mark.parametrize('execution_model', ['openmpi', 'srunmpi']) +@pytest.mark.parametrize('execution_model', ['openmpi', 'intelmpi', 
'srunmpi']) def test_macro_micro( fake_cluster, remote_test_files, remote_out_dir, hwthread_to_core, mode, execution_model): diff --git a/integration_test/fake_cluster/Dockerfile b/integration_test/fake_cluster/Dockerfile index 25a85ebe..419ec852 100644 --- a/integration_test/fake_cluster/Dockerfile +++ b/integration_test/fake_cluster/Dockerfile @@ -43,13 +43,13 @@ RUN . /opt/spack/share/spack/setup-env.sh && \ ^$(spack find --deps slurm@24-11 | grep pmix | tr -d ' ') \ ^$(spack find --format "slurm/{hash}" slurm@24-11) -# RUN . /opt/spack/share/spack/setup-env.sh && \ -# . $(spack location -i lmod)/lmod/lmod/init/bash && \ -# spack install mpich+slurm pmi=pmix ^pmix@3.2.3 +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install intel-oneapi-mpi@2021.14.0 target=zen2 # RUN . /opt/spack/share/spack/setup-env.sh && \ # . $(spack location -i lmod)/lmod/lmod/init/bash && \ -# spack install intel-oneapi-mpi ^pmix@3.2.3 +# spack install mpich+slurm pmi=pmix ^pmix@3.2.3 COPY integration_test/fake_cluster/cgroup.conf /etc/slurm/cgroup.conf diff --git a/integration_test/fake_cluster/old.Dockerfile b/integration_test/fake_cluster/old.Dockerfile index 700075c7..9da30db9 100644 --- a/integration_test/fake_cluster/old.Dockerfile +++ b/integration_test/fake_cluster/old.Dockerfile @@ -31,13 +31,13 @@ RUN . /opt/spack/share/spack/setup-env.sh && \ ^$(spack find --deps slurm@20-02 | grep pmix | tr -d ' ') \ ^$(spack find --format "slurm/{hash}" slurm@20-02) -# RUN . /opt/spack/share/spack/setup-env.sh && \ -# . $(spack location -i lmod)/lmod/lmod/init/bash && \ -# spack install mpich+slurm pmi=pmix ^pmix@3.2.3 +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install intel-oneapi-mpi@2021.14.0 # RUN . /opt/spack/share/spack/setup-env.sh && \ # . $(spack location -i lmod)/lmod/lmod/init/bash && \ -# spack install intel-oneapi-mpi ^pmix@3.2.3 +# spack install mpich+slurm pmi=pmix ^pmix@3.2.3 # Disable ssh debug output RUN sed -i -e 's/^LogLevel DEBUG3$//' /etc/ssh/sshd_config diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py index faa14a68..a2ec2cfd 100644 --- a/libmuscle/python/libmuscle/native_instantiator/run_script.py +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -59,15 +59,58 @@ def openmpi_prep_resources(resources: ResourceAssignment) -> Tuple[str, Dict[str def impi_prep_resources(resources: ResourceAssignment) -> Tuple[str, Dict[str, str]]: """Create resource description for Intel MPI mpirun + Intel MPI mpirun accepts either one core for each MPI process, or one hwthread. It + cannot bind a process to more than one explicitly specified core or hwthread the way + srun and OpenMPI can. At the moment, we bind each process to one core, and that's + what we do here as well, but this will become a problem for MPI+OpenMP codes. Those + can be pinned to sockets, NUMA domains or caches, which does make sense, so we'll + have to figure that out when we add support. 
+ Args: resources: The resource assignment to describe Return: The contents of the machinefile, and a set of environment variables """ + env: Dict[str, str] = dict() + machine_nodes: List[str] = list() + pin_masks: List[int] = list() + + for rank, res in enumerate(resources.by_rank): + machine_nodes.append(res.node_name) + pin_masks.append(sum((1 << c for c in res.hwthreads()))) + + # coalesce machine lines + proc_counts = [1] * len(machine_nodes) + i = 1 + while i < len(machine_nodes): + if machine_nodes[i-1] == machine_nodes[i]: + del machine_nodes[i] + proc_counts[i-1] += proc_counts[i] + del proc_counts[i] + else: + i += 1 + + machinefile = '\n'.join( + (f'{m}:{c}' for m, c in zip(machine_nodes, proc_counts))) + '\n' + + # disable pinning to SLURM-specified resources + # env['I_MPI_PIN_RESPECT_CPUSET'] = '0' + env['I_MPI_JOB_RESPECT_PROCESS_PLACEMENT'] = 'off' + + # which cores to bind each rank to + pin_masks_str = ','.join(format(mask, '#x') for mask in pin_masks) + env['I_MPI_PIN_DOMAIN'] = f'[{pin_masks_str}]' + + # I_MPI_PIN_DOMAIN=[55,aa] + # pins the first rank to 0,2,16,18 and the second to 1,3,17,19 # I_MPI_PIN_PROCESSOR_LIST=0,1,5,6 # pins rank 0 to core 0, rank 1 to core 1, rank 2 to core 5, rank 3 to core 6 - raise NotImplementedError() + # machinefile: + # host1:2 + # host2:4 + # runs two processes on host1 and four on host2 + return machinefile, env def mpich_prep_resources(resources: ResourceAssignment) -> Tuple[str, Dict[str, str]]: diff --git a/scripts/gmake/check_tools.make b/scripts/gmake/check_tools.make index 0adc8ff0..51113dab 100644 --- a/scripts/gmake/check_tools.make +++ b/scripts/gmake/check_tools.make @@ -67,6 +67,8 @@ tool_command := mpi$(CXX) include $(TOOLDIR)/detect_tool.make tool_command := mpic++ include $(TOOLDIR)/detect_tool.make +tool_command := mpicxx +include $(TOOLDIR)/detect_tool.make ifndef MPICXX $(info - No MPI C++ compiler found! Maybe there's no MPI installed?) From 4580b5dd1a733ed0f0be087979429f8386986c89 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 31 Dec 2024 18:03:51 +0100 Subject: [PATCH 47/49] Improve error reporting on Instantiator crash --- libmuscle/python/libmuscle/manager/instance_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py index 23980903..6f4021e3 100644 --- a/libmuscle/python/libmuscle/manager/instance_manager.py +++ b/libmuscle/python/libmuscle/manager/instance_manager.py @@ -104,7 +104,7 @@ def __init__( 'Instantiator crashed. 
This should not happen, please file a bug' ' report.') _logger.error(msg) - raise RuntimeError(msg) + raise RuntimeError(msg) from resources.exception self._planner = Planner(resources) self._num_running = 0 From 4d7525b9966d49e42eef007a7d5bf515e07c59dc Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 1 Jan 2025 10:00:52 +0100 Subject: [PATCH 48/49] Enable IntelMPI debug output at log level debug --- .../libmuscle/native_instantiator/run_script.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py index a2ec2cfd..e566d123 100644 --- a/libmuscle/python/libmuscle/native_instantiator/run_script.py +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -306,9 +306,17 @@ def cluster_command(implementation: Implementation, enable_debug: bool) -> str: fstr = ' '.join(fargs) elif implementation.execution_model == ExecutionModel.INTELMPI: - fstr = ( - 'mpirun -n $MUSCLE_MPI_PROCESSES -machinefile $MUSCLE_RANKFILE' - ' {command} {args}') + fargs = [ + 'mpirun -n $MUSCLE_MPI_PROCESSES', + '-machinefile $MUSCLE_RANKFILE'] + + if enable_debug: + fargs.append('-genv I_MPI_DEBUG=4') + + fargs.append('{command} {args}') + + fstr = ' '.join(fargs) + elif implementation.execution_model == ExecutionModel.SRUNMPI: fargs = ['srun -n $MUSCLE_MPI_PROCESSES -m arbitrary'] From 5b0ea5c98e4297729868bc184e2a387d03d85378 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 1 Jan 2025 10:01:54 +0100 Subject: [PATCH 49/49] Log resources more compactly at info level --- libmuscle/python/libmuscle/manager/instance_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py index 6f4021e3..51a7a67f 100644 --- a/libmuscle/python/libmuscle/manager/instance_manager.py +++ b/libmuscle/python/libmuscle/manager/instance_manager.py @@ -124,7 +124,7 @@ def start_all(self) -> None: """Starts all the instances of the model.""" self._allocations = self._planner.allocate_all(self._configuration) for instance, resources in self._allocations.items(): - _logger.info(f'Planned {instance} on {resources}') + _logger.info(f'Planned {instance} on {resources.as_resources()}') components = {c.name: c for c in self._configuration.model.components} for instance, resources in self._allocations.items(): @@ -147,7 +147,7 @@ def start_all(self) -> None: instance, implementation, self._configuration.resources[component.name], resources, idir, workdir, stdout_path, stderr_path) - _logger.info(f'Instantiating {instance} on {resources}') + _logger.info(f'Instantiating {instance}') self._requests_out.put(request) self._num_running += 1
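
A minimal standalone sketch of the Intel MPI resource description built by impi_prep_resources above: consecutive ranks on the same node are coalesced into host:count machinefile lines, and I_MPI_PIN_DOMAIN gets one hexadecimal mask per rank with a bit set for each assigned hwthread. The helper name, the plain (node_name, hwthreads) tuples standing in for ResourceAssignment, and the node names are illustrative assumptions only, not libmuscle API.

from typing import Dict, List, Set, Tuple


def impi_machinefile_and_pinning(
        ranks: List[Tuple[str, Set[int]]]) -> Tuple[str, Dict[str, str]]:
    """Build a machinefile and pinning env for a list of (node, hwthreads) pairs."""
    machine_nodes = [node for node, _ in ranks]
    pin_masks = [
            sum(1 << hwthread for hwthread in hwthreads)
            for _, hwthreads in ranks]

    # coalesce consecutive ranks on the same node into node:count entries
    proc_counts = [1] * len(machine_nodes)
    i = 1
    while i < len(machine_nodes):
        if machine_nodes[i - 1] == machine_nodes[i]:
            del machine_nodes[i]
            proc_counts[i - 1] += proc_counts[i]
            del proc_counts[i]
        else:
            i += 1

    machinefile = '\n'.join(
            f'{node}:{count}'
            for node, count in zip(machine_nodes, proc_counts)) + '\n'

    # one hex mask per rank, a set bit per assigned hwthread
    pin_masks_str = ','.join(format(mask, '#x') for mask in pin_masks)
    env = {
            'I_MPI_JOB_RESPECT_PROCESS_PLACEMENT': 'off',
            'I_MPI_PIN_DOMAIN': f'[{pin_masks_str}]'}

    return machinefile, env


if __name__ == '__main__':
    # two ranks on node001 (hwthreads 0-1 and 2-3), one rank on node002
    ranks = [('node001', {0, 1}), ('node001', {2, 3}), ('node002', {0, 1})]
    machinefile, env = impi_machinefile_and_pinning(ranks)
    print(machinefile, end='')          # node001:2 and node002:1, one per line
    print(env['I_MPI_PIN_DOMAIN'])      # [0x3,0xc,0x3]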