diff --git a/Makefile b/Makefile index b248468c..74d355cb 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,9 @@ endif .PHONY: test test: test_python test_scripts test_cpp test_fortran +.PHONY: test_all +test_all: test test_cluster + .PHONY: test_python_only test_python_only: MUSCLE_TEST_PYTHON_ONLY=1 tox @@ -37,6 +40,10 @@ test_cpp: cpp test_fortran: fortran_tests cd libmuscle/fortran && $(MAKE) test +.PHONY: test_cluster +test_cluster: + tox -e cluster + .PHONY: test_scripts test_scripts: cd scripts && $(MAKE) test diff --git a/integration_test/cluster_test/Makefile b/integration_test/cluster_test/Makefile new file mode 100644 index 00000000..44f5e012 --- /dev/null +++ b/integration_test/cluster_test/Makefile @@ -0,0 +1,12 @@ +.PHONY: all +all: component_$(MPI_TYPE) + + +CXXFLAGS += $(shell pkg-config --cflags libmuscle_mpi ymmsl) +LDLIBS += $(shell pkg-config --libs libmuscle_mpi ymmsl) + +CXXFLAGS += -g + +component_$(MPI_TYPE): component.cpp + mpicxx -o $@ $(CXXFLAGS) $^ $(LDLIBS) + diff --git a/integration_test/cluster_test/__init__.py b/integration_test/cluster_test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/integration_test/cluster_test/component.cpp b/integration_test/cluster_test/component.cpp new file mode 100644 index 00000000..0cc9726f --- /dev/null +++ b/integration_test/cluster_test/component.cpp @@ -0,0 +1,99 @@ +#include +#include +#include + +// This is a Linux-specific API, but this test always runs on Linux so that's okay. +#define _GNU_SOURCE +#include +#include + +#include "mpi.h" + +#include "libmuscle/libmuscle.hpp" +#include "ymmsl/ymmsl.hpp" + +using std::ofstream; +using std::to_string; + +using libmuscle::Instance; +using libmuscle::Message; +using ymmsl::Operator; + + +/** Log where we are running so that the test can check for it. */ +void log_location() { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + char nodeid[1024]; + gethostname(nodeid, sizeof(nodeid)); + + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + sched_getaffinity(0, sizeof(cpu_set_t), &cpu_set); + + { + ofstream outfile("out_" + to_string(rank) + ".txt"); + outfile << nodeid << std::endl; + + bool first = true; + for (int i = 0; i < CPU_SETSIZE; ++i) { + if (CPU_ISSET(i, &cpu_set)) { + if (!first) + outfile << ","; + outfile << i; + first = false; + } + } + outfile << std::endl; + } +} + + +/** A simple dummy component. 
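This sends and receives on all operators, allowing different coupling patterns
with a single program.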
*/ +void component(int argc, char * argv[]) { + const int root_rank = 0; + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + Instance instance(argc, argv, { + {Operator::F_INIT, {"init_in"}}, + {Operator::O_I, {"inter_out"}}, + {Operator::S, {"inter_in"}}, + {Operator::O_F, {"final_out"}}}, + MPI_COMM_WORLD, root_rank); + + // outfile << "Starting reuse loop" << std::endl; + while (instance.reuse_instance()) { + // F_INIT + + int64_t steps = instance.get_setting_as("steps"); + + instance.receive("init_in", Message(0.0)); + + for (int step = 0; step < steps; ++step) { + // O_I + if (rank == root_rank) { + instance.send("inter_out", Message(step)); + } + + // S + instance.receive("inter_in", Message(0.0)); + } + + // O_F + if (rank == root_rank) { + instance.send("final_out", Message(steps)); + } + } +} + + +int main(int argc, char * argv[]) { + MPI_Init(&argc, &argv); + log_location(); + component(argc, argv); + MPI_Finalize(); + return EXIT_SUCCESS; +} + diff --git a/integration_test/cluster_test/component.py b/integration_test/cluster_test/component.py new file mode 100644 index 00000000..a22c7d96 --- /dev/null +++ b/integration_test/cluster_test/component.py @@ -0,0 +1,49 @@ +import logging +import os +import socket + +from libmuscle import Instance, Message +from ymmsl import Operator + + +def log_location() -> None: + """Log where we are running so that the test can check for it.""" + print(socket.gethostname()) + print(','.join(map(str, sorted(os.sched_getaffinity(0))))) + + +def component() -> None: + """A simple dummy component. + + This sends and receives on all operators, allowing different coupling patterns + with a single program. + """ + instance = Instance({ + Operator.F_INIT: ['init_in'], + Operator.O_I: ['inter_out'], + Operator.S: ['inter_in'], + Operator.O_F: ['final_out']}) + + while instance.reuse_instance(): + # F_INIT + steps = instance.get_setting('steps', 'int') + + instance.receive('init_in', default=Message(0.0)) + + for step in range(steps): + # O_I + instance.send('inter_out', Message(step)) + + # S + instance.receive('inter_in', default=Message(0.0)) + + # O_F + instance.send('final_out', Message(steps)) + + +if __name__ == '__main__': + logging.basicConfig() + logging.getLogger().setLevel(logging.INFO) + + log_location() + component() diff --git a/integration_test/cluster_test/conftest.py b/integration_test/cluster_test/conftest.py new file mode 100644 index 00000000..c0a65131 --- /dev/null +++ b/integration_test/cluster_test/conftest.py @@ -0,0 +1,358 @@ +import logging +import os +from pathlib import Path +from tempfile import TemporaryDirectory +import time + +import cerulean +import pytest + + +logger_ = logging.getLogger(__name__) + + +IMAGE_NAME = 'muscle3_test_cluster' + +REMOTE_SHARED = '/home/cerulean/shared' + +IDX_SLURM_VERSIONS = list(enumerate([ + '17-02', '17-11', '18-08', '19-05', '20-02', '20-11', '21-08', '22-05', '23-02', + '23-11', '24-05', '24-11' + ])) + +# Shut down the containers after running the tests. Set to False to debug. 
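+# The installed_cluster fixture checks this flag when tearing down at the end of
+# the test session.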
+CLEAN_UP_CONTAINERS = False + + +skip_unless_cluster = pytest.mark.skipif( + 'MUSCLE_TEST_CLUSTER' not in os.environ, + reason='Cluster tests were not explicitly enabled') + + +def run_cmd(term, timeout, command): + exit_code, out, err = term.run(timeout, command, []) + if exit_code != 0: + logger_.error(err) + assert exit_code == 0 + return out + + +@pytest.fixture(scope='session') +def local_term(): + return cerulean.LocalTerminal() + + +@pytest.fixture(scope='session') +def local_fs(): + return cerulean.LocalFileSystem() + + +@pytest.fixture(scope='session') +def repo_root(local_fs): + root_dir = Path(__file__).parents[2] + return local_fs / str(root_dir) + + +@pytest.fixture(scope='session') +def fake_cluster_image(local_term): + run_cmd(local_term, 5400, ( + f'docker buildx build -t {IMAGE_NAME}' + ' -f integration_test/fake_cluster/Dockerfile .')) + + +@pytest.fixture(scope='session') +def fake_cluster_image_old(local_term): + run_cmd(local_term, 5400, ( + f'docker buildx build -t {IMAGE_NAME}_old' + ' -f integration_test/fake_cluster/old.Dockerfile .')) + + +def _image_name(slurm_version): + if slurm_version <= '20-02': + return IMAGE_NAME + '_old' + return IMAGE_NAME + + +def _gcc_version(slurm_version): + if slurm_version <= '20-02': + return '7.5.0' + return '11.4.0' + + +def ssh_term(port, timeout_msg): + cred = cerulean.PasswordCredential('cerulean', 'kingfisher') + ready = False + start = time.monotonic() + while not ready: + if (time.monotonic() - start) > 60.0: + raise Exception(timeout_msg) + + try: + term = cerulean.SshTerminal('localhost', port, cred) + ready = True + except Exception: + time.sleep(3.0) + + return term + + +@pytest.fixture(scope='session') +def shared_dir(): + # Note that pytest's tmp_path is function-scoped, so cannot be used here + with TemporaryDirectory(ignore_cleanup_errors=True) as tmp_dir: + path = Path(tmp_dir) + path.chmod(0o1777) + yield path + + +@pytest.fixture(scope='session') +def cleanup_docker(local_term): + for _, slurm_version in IDX_SLURM_VERSIONS: + _clean_up_base_cluster(local_term, slurm_version) + + +def _create_network(local_term, slurm_version): + name = f'muscle3-net-{slurm_version}' + run_cmd(local_term, 60, f'docker network create {name}') + return name + + +def _start_nodes(local_term, slurm_version, net_name, shared_dir): + for i in range(5): + node_name = f'node-{i}' + + image_name = _image_name(slurm_version) + + run_cmd(local_term, 60, ( + f'docker run -d --name={node_name}-{slurm_version} --hostname={node_name}' + f' --network={net_name} --cap-add=CAP_SYS_NICE' + f' --env SLURM_VERSION={slurm_version}' + f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}' + f' {image_name}')) + + +def _start_headnode(local_term, slurm_version, net_name, shared_dir, headnode_port): + image_name = _image_name(slurm_version) + + run_cmd(local_term, 60, ( + f'docker run -d --name=headnode-{slurm_version} --hostname=headnode' + f' --network={net_name} -p {headnode_port}:22' + f' --env SLURM_VERSION={slurm_version}' + f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}' + f' {image_name}')) + + ssh_term(headnode_port, 'Virtual cluster container start timed out') + + +def _start_base_cluster(local_term, idx_slurm_version, shared_dir): + slurm_index, slurm_version = idx_slurm_version + + headnode_port = 10022 + slurm_index + + net_name = _create_network(local_term, slurm_version) + _start_nodes(local_term, slurm_version, net_name, shared_dir) + _start_headnode(local_term, slurm_version, net_name, shared_dir, 
headnode_port) + + term = ssh_term(headnode_port, 'Connection to virtual cluster container timed out') + fs = cerulean.SftpFileSystem(term, False) + + return term, fs, headnode_port + + +def _install_remote_source(local_term, repo_root, remote_fs, slurm_version): + muscle3_tgt = remote_fs / 'home' / 'cerulean' / 'muscle3' + muscle3_tgt.mkdir() + + container = f'headnode-{slurm_version}' + + for f in ( + 'muscle3', 'libmuscle', 'scripts', 'docs', 'setup.py', 'Makefile', + 'MANIFEST.in', 'LICENSE', 'NOTICE', 'VERSION', 'README.rst'): + run_cmd(local_term, 60, ( + f'docker cp {repo_root / f} {container}:{muscle3_tgt / f}')) + + # needs to run as root, so not run through remote_term + run_cmd(local_term, 60, ( + f'docker exec {container} /bin/bash -c' + f' "chown -R cerulean:cerulean {muscle3_tgt}"')) + + return muscle3_tgt + + +def _create_muscle3_venv(remote_term, remote_source): + run_cmd(remote_term, 10, f'python3 -m venv {REMOTE_SHARED}/venv') + in_venv = f'source {REMOTE_SHARED}/venv/bin/activate && ' + + run_cmd(remote_term, 30, ( + f'/bin/bash -c "{in_venv} python3 -m pip install pip wheel setuptools"')) + + run_cmd(remote_term, 60, f'/bin/bash -c "{in_venv} pip install {remote_source}"') + + +def _install_muscle3_native_openmpi( + remote_source, remote_term, remote_fs, slurm_version): + prefix = remote_fs / REMOTE_SHARED / 'muscle3-openmpi' + prefix.mkdir() + + openmpi_hash = run_cmd(remote_term, 600, ( + '/bin/bash -c "' + 'for phash in $(/opt/spack/bin/spack find --format \\"{hash}\\" openmpi' + ' | tr \'\\n\' \' \') ; do' + ' if /opt/spack/bin/spack find --deps /\\${phash} |' + f' grep -q slurm@{slurm_version} ; then' + ' echo \\${phash} ;' + ' fi ;' + 'done' + '"')) + + openmpi_version = run_cmd(remote_term, 600, ( + '/bin/bash -c "' + f'/opt/spack/bin/spack find --format \\"{{version}}\\" /{openmpi_hash}' + '"')).strip() + + gcc_version = _gcc_version(slurm_version) + + module_name = f'openmpi/{openmpi_version}-gcc-{gcc_version}-{openmpi_hash[:7]}' + + logger_.info(f'Slurm {slurm_version} and module {module_name}') + + run_cmd(remote_term, 600, ( + f'/bin/bash -l -c "' + f'module load {module_name} && ' + f'cd {remote_source} && ' + f'make distclean && ' + f'PREFIX={prefix} make install"')) + + return 'openmpi', prefix, module_name + + +def _install_muscle3_native_intelmpi( + remote_source, remote_term, remote_fs): + prefix = remote_fs / REMOTE_SHARED / 'muscle3-intelmpi' + prefix.mkdir() + + module_name = 'intel-oneapi-mpi' + + run_cmd(remote_term, 600, ( + f'/bin/bash -l -c "' + f'module load {module_name} && ' + f'cd {remote_source} && ' + f'make distclean && ' + f'PREFIX={prefix} make install"')) + + return 'intelmpi', prefix, module_name + + +def _install_muscle3(local_term, repo_root, remote_term, remote_fs, slurm_version): + remote_source = _install_remote_source( + local_term, repo_root, remote_fs, slurm_version) + _create_muscle3_venv(remote_term, remote_source) + openmpi_install = _install_muscle3_native_openmpi( + remote_source, remote_term, remote_fs, slurm_version) + intelmpi_install = _install_muscle3_native_intelmpi( + remote_source, remote_term, remote_fs) + return openmpi_install, intelmpi_install + + +def _install_tests(repo_root, remote_term, remote_fs, remote_m3_installs): + remote_home = remote_fs / REMOTE_SHARED + + for mpi_type, remote_m3, mpi_module in remote_m3_installs: + cerulean.copy( + repo_root / 'integration_test' / 'cluster_test', remote_home, + copy_permissions=True) + + remote_source = remote_home / 'cluster_test' + + if mpi_type == 'openmpi': + 
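# Point the ymmsl files at the exact OpenMPI module installed on the cluster (its name includes a version, compiler and hash suffix).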
run_cmd(remote_term, 30, ( + '/bin/bash -c "' + f'sed -i \\"s^modules: openmpi^modules: {mpi_module}^\\"' + f' {remote_source}/implementations_openmpi.ymmsl' + '"')) + + run_cmd(remote_term, 30, ( + '/bin/bash -c "' + f'sed -i \\"s^modules: openmpi^modules: {mpi_module}^\\"' + f' {remote_source}/implementations_srunmpi.ymmsl' + '"')) + + run_cmd(remote_term, 30, ( + f'/bin/bash -l -c "' + f'module load {mpi_module} && ' + f'. {remote_m3}/bin/muscle3.env && ' + f'make -C {remote_source} MPI_TYPE={mpi_type}"')) + + +def _clean_up_base_cluster(local_term, slurm_version): + node_names = [f'node-{i}-{slurm_version}' for i in range(5)] + run_cmd(local_term, 60, f'docker rm -f {" ".join(node_names)}') + + run_cmd(local_term, 60, f'docker rm -f headnode-{slurm_version}') + + net_name = f'muscle3-net-{slurm_version}' + run_cmd(local_term, 60, f'docker network rm -f {net_name}') + + +@pytest.fixture(scope='session', params=IDX_SLURM_VERSIONS) +def installed_cluster( + request, cleanup_docker, fake_cluster_image, fake_cluster_image_old, shared_dir, + repo_root, local_term): + + slurm_version = request.param[1] + local_shared_dir = shared_dir / slurm_version + local_shared_dir.mkdir() + local_shared_dir.chmod(0o1777) + + remote_term, remote_fs, headnode_port = _start_base_cluster( + local_term, request.param, local_shared_dir) + remote_m3_installs = _install_muscle3( + local_term, repo_root, remote_term, remote_fs, slurm_version) + _install_tests(repo_root, remote_term, remote_fs, remote_m3_installs) + + yield headnode_port + + # Because it's been made inside of the container, the shared directory has a + # different owner than what we're running with on the host, and the host user cannot + # remove the files. So we do it here from inside the container + if CLEAN_UP_CONTAINERS: + run_cmd(remote_term, 60, f'rm -rf {REMOTE_SHARED}/*') + + remote_fs.close() + remote_term.close() + + if CLEAN_UP_CONTAINERS: + _clean_up_base_cluster(local_term, slurm_version) + + +@pytest.fixture(scope='session') +def hwthread_to_core(): + """Translates hwthreads to core ids. + + In our tests, we use sched_getaffinity to check which cores we're bound to. This + returns numbers identifying hwthreads, but our planner binds swthreads and processes + to entire cores. So we get a comma-separated list of hwthread ids and want to + compare that to a list of core ids. + + This reads /proc/cpuinfo to get the mapping between hwthreads and cores, and returns + a function that takes a comma-separated list of hwthread ids and returns a list of + corresponding core ids. 
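+
+    For example, on a node where hwthreads 0 and 4 are the two hardware threads of
+    core 0, the returned function maps '0,4' to [0].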
+ """ + with open('/proc/cpuinfo', 'r') as f: + cpuinfo = f.readlines() + + def get_values(cpuinfo, field): + return [ + int(line.split(':')[1].strip()) + for line in cpuinfo if line.startswith(field)] + + hwthread_ids = get_values(cpuinfo, 'processor') + core_ids = get_values(cpuinfo, 'core id') + + table = dict(zip(hwthread_ids, core_ids)) + + def convert(aff_ids): + cores = {table[i] for i in map(int, aff_ids.split(','))} + return sorted(cores) + + return convert diff --git a/integration_test/cluster_test/dispatch.sh b/integration_test/cluster_test/dispatch.sh new file mode 100755 index 00000000..aef00e66 --- /dev/null +++ b/integration_test/cluster_test/dispatch.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/dispatch.ymmsl $CT/settings.ymmsl $CT/implementations.ymmsl + diff --git a/integration_test/cluster_test/dispatch.ymmsl b/integration_test/cluster_test/dispatch.ymmsl new file mode 100644 index 00000000..d8b5a715 --- /dev/null +++ b/integration_test/cluster_test/dispatch.ymmsl @@ -0,0 +1,24 @@ +ymmsl_version: v0.1 + +model: + name: dispatch + components: + c1: + ports: + f_init: init_in + o_f: final_out + implementation: component_python + c2: + ports: + f_init: init_in + o_f: final_out + implementation: component_python + + conduits: + c1.final_out: c2.init_in + +resources: + c1: + threads: 1 + c2: + threads: 1 diff --git a/integration_test/cluster_test/double.ymmsl b/integration_test/cluster_test/double.ymmsl new file mode 100644 index 00000000..16f9094f --- /dev/null +++ b/integration_test/cluster_test/double.ymmsl @@ -0,0 +1,25 @@ +ymmsl_version: v0.1 + +model: + name: double + components: + c1: + ports: + o_i: inter_out + s: inter_in + implementation: component_cpp + c2: + ports: + o_i: inter_out + s: inter_in + implementation: component_cpp + + conduits: + c1.inter_out: c2.inter_in + c2.inter_out: c1.inter_in + +resources: + c1: + mpi_processes: 2 + c2: + mpi_processes: 2 diff --git a/integration_test/cluster_test/double_intelmpi.sh b/integration_test/cluster_test/double_intelmpi.sh new file mode 100755 index 00000000..e6e47859 --- /dev/null +++ b/integration_test/cluster_test/double_intelmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/double.ymmsl $CT/settings.ymmsl $CT/implementations_intelmpi.ymmsl + diff --git a/integration_test/cluster_test/double_openmpi.sh b/integration_test/cluster_test/double_openmpi.sh new file mode 100755 index 00000000..12e117b8 --- /dev/null +++ b/integration_test/cluster_test/double_openmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/double.ymmsl $CT/settings.ymmsl $CT/implementations_openmpi.ymmsl + diff --git a/integration_test/cluster_test/double_srunmpi.sh b/integration_test/cluster_test/double_srunmpi.sh new file mode 100755 index 00000000..2e7dbbf4 --- /dev/null +++ b/integration_test/cluster_test/double_srunmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/double.ymmsl $CT/settings.ymmsl $CT/implementations_srunmpi.ymmsl + diff --git 
a/integration_test/cluster_test/implementations.ymmsl b/integration_test/cluster_test/implementations.ymmsl new file mode 100644 index 00000000..df88e24d --- /dev/null +++ b/integration_test/cluster_test/implementations.ymmsl @@ -0,0 +1,8 @@ +ymmsl_version: v0.1 + +implementations: + component_python: + virtual_env: /home/cerulean/shared/venv + executable: python + args: + - /home/cerulean/shared/cluster_test/component.py diff --git a/integration_test/cluster_test/implementations_intelmpi.ymmsl b/integration_test/cluster_test/implementations_intelmpi.ymmsl new file mode 100644 index 00000000..b216138d --- /dev/null +++ b/integration_test/cluster_test/implementations_intelmpi.ymmsl @@ -0,0 +1,9 @@ +ymmsl_version: v0.1 + +implementations: + component_cpp: + modules: intel-oneapi-mpi + env: + +LD_LIBRARY_PATH: /home/cerulean/shared/muscle3-intelmpi/lib + execution_model: intelmpi + executable: /home/cerulean/shared/cluster_test/component_intelmpi diff --git a/integration_test/cluster_test/implementations_openmpi.ymmsl b/integration_test/cluster_test/implementations_openmpi.ymmsl new file mode 100644 index 00000000..4a0d1dab --- /dev/null +++ b/integration_test/cluster_test/implementations_openmpi.ymmsl @@ -0,0 +1,9 @@ +ymmsl_version: v0.1 + +implementations: + component_cpp: + modules: openmpi + env: + +LD_LIBRARY_PATH: /home/cerulean/shared/muscle3-openmpi/lib + execution_model: openmpi + executable: /home/cerulean/shared/cluster_test/component_openmpi diff --git a/integration_test/cluster_test/implementations_srunmpi.ymmsl b/integration_test/cluster_test/implementations_srunmpi.ymmsl new file mode 100644 index 00000000..0ccf1265 --- /dev/null +++ b/integration_test/cluster_test/implementations_srunmpi.ymmsl @@ -0,0 +1,9 @@ +ymmsl_version: v0.1 + +implementations: + component_cpp: + modules: openmpi + env: + +LD_LIBRARY_PATH: /home/cerulean/shared/muscle3-openmpi/lib + execution_model: srunmpi + executable: /home/cerulean/shared/cluster_test/component_openmpi diff --git a/integration_test/cluster_test/macro_micro.ymmsl b/integration_test/cluster_test/macro_micro.ymmsl new file mode 100644 index 00000000..22cbf8a5 --- /dev/null +++ b/integration_test/cluster_test/macro_micro.ymmsl @@ -0,0 +1,25 @@ +ymmsl_version: v0.1 + +model: + name: macro_micro + components: + c1: + ports: + o_i: inter_out + s: inter_in + implementation: component_cpp + c2: + ports: + f_init: init_in + o_f: final_out + implementation: component_cpp + + conduits: + c1.inter_out: c2.init_in + c2.final_out: c1.inter_in + +resources: + c1: + mpi_processes: 2 + c2: + mpi_processes: 2 diff --git a/integration_test/cluster_test/macro_micro_intelmpi.sh b/integration_test/cluster_test/macro_micro_intelmpi.sh new file mode 100755 index 00000000..77bec53a --- /dev/null +++ b/integration_test/cluster_test/macro_micro_intelmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/macro_micro.ymmsl $CT/settings.ymmsl $CT/implementations_intelmpi.ymmsl + diff --git a/integration_test/cluster_test/macro_micro_openmpi.sh b/integration_test/cluster_test/macro_micro_openmpi.sh new file mode 100755 index 00000000..6b7fccb3 --- /dev/null +++ b/integration_test/cluster_test/macro_micro_openmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/macro_micro.ymmsl 
$CT/settings.ymmsl $CT/implementations_openmpi.ymmsl + diff --git a/integration_test/cluster_test/macro_micro_srunmpi.sh b/integration_test/cluster_test/macro_micro_srunmpi.sh new file mode 100755 index 00000000..a98aca57 --- /dev/null +++ b/integration_test/cluster_test/macro_micro_srunmpi.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +env + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/macro_micro.ymmsl $CT/settings.ymmsl $CT/implementations_srunmpi.ymmsl + diff --git a/integration_test/cluster_test/multiple.sh b/integration_test/cluster_test/multiple.sh new file mode 100755 index 00000000..49093155 --- /dev/null +++ b/integration_test/cluster_test/multiple.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/multiple.ymmsl $CT/settings.ymmsl $CT/implementations.ymmsl + diff --git a/integration_test/cluster_test/multiple.ymmsl b/integration_test/cluster_test/multiple.ymmsl new file mode 100644 index 00000000..64cb8b42 --- /dev/null +++ b/integration_test/cluster_test/multiple.ymmsl @@ -0,0 +1,57 @@ +ymmsl_version: v0.1 + +model: + name: multiple + components: + c1: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + c2: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + c3: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + c4: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + c5: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + c6: + ports: + o_i: inter_out + s: inter_in + implementation: component_python + + conduits: + c1.inter_out: c2.inter_in + c2.inter_out: c3.inter_in + c3.inter_out: c4.inter_in + c4.inter_out: c5.inter_in + c5.inter_out: c6.inter_in + c6.inter_out: c1.inter_in + +resources: + c1: + threads: 1 + c2: + threads: 1 + c3: + threads: 1 + c4: + threads: 1 + c5: + threads: 1 + c6: + threads: 1 diff --git a/integration_test/cluster_test/settings.ymmsl b/integration_test/cluster_test/settings.ymmsl new file mode 100644 index 00000000..be4fb16f --- /dev/null +++ b/integration_test/cluster_test/settings.ymmsl @@ -0,0 +1,5 @@ +ymmsl_version: v0.1 + +settings: + muscle_remote_log_level: DEBUG + steps: 10 diff --git a/integration_test/cluster_test/single.sh b/integration_test/cluster_test/single.sh new file mode 100755 index 00000000..00f7e0b9 --- /dev/null +++ b/integration_test/cluster_test/single.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e + +source /home/cerulean/shared/venv/bin/activate + +CT=/home/cerulean/shared/cluster_test + +muscle_manager --log-level=DEBUG --start-all $CT/single.ymmsl $CT/settings.ymmsl $CT/implementations.ymmsl + diff --git a/integration_test/cluster_test/single.ymmsl b/integration_test/cluster_test/single.ymmsl new file mode 100644 index 00000000..957023f2 --- /dev/null +++ b/integration_test/cluster_test/single.ymmsl @@ -0,0 +1,10 @@ +ymmsl_version: v0.1 + +model: + name: single + components: + c1: component_python + +resources: + c1: + threads: 1 diff --git a/integration_test/cluster_test/test_cluster.py b/integration_test/cluster_test/test_cluster.py new file mode 100644 index 00000000..81e02eaa --- /dev/null +++ b/integration_test/cluster_test/test_cluster.py @@ -0,0 +1,251 @@ +import cerulean +import logging +import pytest + +from integration_test.cluster_test.conftest import ( + 
REMOTE_SHARED, ssh_term, skip_unless_cluster) + + +logger_ = logging.getLogger(__name__) + + +@pytest.fixture +def fake_cluster(installed_cluster): + headnode_port = installed_cluster + term = ssh_term(headnode_port, 'Connection to virtual cluster container timed out') + with cerulean.SftpFileSystem(term, True) as fs: + local_sched = cerulean.DirectGnuScheduler(term) + slurm_sched = cerulean.SlurmScheduler(term) + yield term, fs, local_sched, slurm_sched + + +@pytest.fixture +def remote_home(fake_cluster): + remote_fs = fake_cluster[1] + return remote_fs / REMOTE_SHARED + + +@pytest.fixture +def remote_test_files(remote_home): + return remote_home / 'cluster_test' + + +@pytest.fixture +def remote_out_dir(remote_home): + return remote_home / 'test_results' + + +def _make_base_job(name, remote_out_dir, dir_name): + job_dir = remote_out_dir / dir_name + job_dir.mkdir(0o755, True, True) + + job = cerulean.JobDescription() + job.name = name + job.working_directory = job_dir + job.stdout_file = job_dir / 'stdout.txt' + job.stderr_file = job_dir / 'stderr.txt' + job.queue_name = 'debug' + job.time_reserved = 60 + job.system_out_file = job_dir / 'sysout.txt' + job.system_err_file = job_dir / 'syserr.txt' + job.extra_scheduler_options = '--ntasks-per-node=4' + + return job + + +def _make_job(name, mode, remote_test_files, remote_out_dir): + job = _make_base_job(name, remote_out_dir, f'test_{name}_{mode}') + job.command = str(remote_test_files / f'{name}.sh') + return job + + +def _make_mpi_job(name, mode, execution_model, remote_test_files, remote_out_dir): + job = _make_base_job(name, remote_out_dir, f'test_{name}_{mode}_{execution_model}') + job.command = str(remote_test_files / f'{name}_{execution_model}.sh') + return job + + +def _sched(fake_cluster, mode): + if mode == 'local': + return fake_cluster[2] + else: + return fake_cluster[3] + + +def _run_cmd_dir(remote_out_dir, testname, mode, execution_model=None): + results_name = f'test_{testname}_{mode}' + if execution_model is not None: + results_name += f'_{execution_model}' + + for p in (remote_out_dir / results_name).iterdir(): + if p.name.startswith('run_'): + return p + + +def _get_stdout(remote_out_dir, testname, mode, instance): + run_dir = _run_cmd_dir(remote_out_dir, testname, mode) + stdout_file = run_dir / 'instances' / instance / 'stdout.txt' + assert stdout_file.exists() # test output redirection + return stdout_file.read_text() + + +def _get_outfile(remote_out_dir, testname, mode, execution_model, instance, rank): + run_dir = _run_cmd_dir(remote_out_dir, testname, mode, execution_model) + work_dir = run_dir / 'instances' / instance / 'workdir' + out_file = work_dir / f'out_{rank}.txt' + assert out_file.exists() # test working directory + return out_file.read_text() + + +_SCHED_OVERHEAD = 60 + + +@skip_unless_cluster +@pytest.mark.parametrize('mode', ['local', 'slurm']) +def test_single( + fake_cluster, remote_test_files, remote_out_dir, mode, hwthread_to_core): + sched = _sched(fake_cluster, mode) + + job = _make_job('single', mode, remote_test_files, remote_out_dir) + if mode == 'slurm': + job.num_nodes = 1 + job.mpi_processes_per_node = 1 + job.extra_scheduler_options += ' --nodelist=node-0' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + output = _get_stdout(remote_out_dir, 'single', mode, 'c1') + + if mode == 'local': + assert output.split('\n')[0] == 'headnode' + else: + node, hwthreads, _ = output.split('\n') + assert 
node == 'node-0' + assert hwthread_to_core(hwthreads) == [0] + + +@skip_unless_cluster +@pytest.mark.parametrize('mode', ['local', 'slurm']) +def test_dispatch( + fake_cluster, remote_test_files, remote_out_dir, mode, hwthread_to_core): + sched = _sched(fake_cluster, mode) + + job = _make_job('dispatch', mode, remote_test_files, remote_out_dir) + if mode == 'slurm': + job.num_nodes = 1 + job.mpi_processes_per_node = 1 + job.extra_scheduler_options += ' --nodelist=node-1' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + c1_out = _get_stdout(remote_out_dir, 'dispatch', mode, 'c1') + c2_out = _get_stdout(remote_out_dir, 'dispatch', mode, 'c2') + if mode == 'local': + assert c1_out.split('\n')[0] == 'headnode' + assert c2_out.split('\n')[0] == 'headnode' + else: + node, hwthreads, _ = c1_out.split('\n') + assert node == 'node-1' + assert hwthread_to_core(hwthreads) == [0] + + node, hwthreads, _ = c2_out.split('\n') + assert node == 'node-1' + assert hwthread_to_core(hwthreads) == [0] + + +@skip_unless_cluster +@pytest.mark.parametrize('mode', ['local', 'slurm']) +def test_multiple( + fake_cluster, remote_test_files, remote_out_dir, mode, hwthread_to_core): + sched = _sched(fake_cluster, mode) + + job = _make_job('multiple', mode, remote_test_files, remote_out_dir) + if mode == 'slurm': + job.num_nodes = 3 + job.extra_scheduler_options += ' --nodelist=node-[0-2]' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + for i in range(1, 7): + instance = f'c{i}' + out = _get_stdout(remote_out_dir, 'multiple', mode, instance) + if mode == 'local': + assert out.split('\n')[0] == 'headnode' + else: + node, hwthreads, _ = out.split('\n') + assert (instance, node) == (instance, f'node-{(i - 1) // 2}') + assert (instance, hwthread_to_core(hwthreads)) == (instance, [(i - 1) % 2]) + + +@skip_unless_cluster +@pytest.mark.parametrize('mode', ['local', 'slurm']) +@pytest.mark.parametrize('execution_model', ['openmpi', 'intelmpi', 'srunmpi']) +def test_double( + fake_cluster, remote_test_files, remote_out_dir, hwthread_to_core, + mode, execution_model): + + if mode == 'local' and execution_model == 'srunmpi': + pytest.skip('srun does not work without slurm') + + sched = _sched(fake_cluster, mode) + + job = _make_mpi_job( + 'double', mode, execution_model, remote_test_files, remote_out_dir) + if mode == 'slurm': + job.num_nodes = 2 + job.extra_scheduler_options += ' --nodelist=node-[3-4]' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + for i in range(1, 3): + for rank in range(2): + out = _get_outfile( + remote_out_dir, 'double', mode, execution_model, f'c{i}', rank) + if mode == 'local': + assert out.split('\n')[0] == 'headnode' + else: + node, hwthreads, _ = out.split('\n') + assert node == f'node-{i + 2}' + assert hwthread_to_core(hwthreads) == [rank] + + +@skip_unless_cluster +@pytest.mark.parametrize('mode', ['local', 'slurm']) +@pytest.mark.parametrize('execution_model', ['openmpi', 'intelmpi', 'srunmpi']) +def test_macro_micro( + fake_cluster, remote_test_files, remote_out_dir, hwthread_to_core, + mode, execution_model): + + if mode == 'local' and execution_model == 'srunmpi': + pytest.skip('srun does not work without slurm') + + sched = _sched(fake_cluster, mode) + + job = _make_mpi_job( + 
'macro_micro', mode, execution_model, remote_test_files, remote_out_dir) + if mode == 'slurm': + job.num_nodes = 1 + job.extra_scheduler_options += ' --nodelist=node-4' + + job_id = sched.submit(job) + assert sched.wait(job_id, job.time_reserved + _SCHED_OVERHEAD) is not None + assert sched.get_exit_code(job_id) == 0 + + for i in range(1, 3): + for rank in range(2): + out = _get_outfile( + remote_out_dir, 'macro_micro', mode, execution_model, f'c{i}', rank) + if mode == 'local': + assert out.split('\n')[0] == 'headnode' + else: + node, hwthreads, _ = out.split('\n') + assert node == 'node-4' + assert hwthread_to_core(hwthreads) == [rank] diff --git a/integration_test/fake_cluster/Dockerfile b/integration_test/fake_cluster/Dockerfile new file mode 100644 index 00000000..419ec852 --- /dev/null +++ b/integration_test/fake_cluster/Dockerfile @@ -0,0 +1,64 @@ +FROM ghcr.io/naturalhpc/cerulean-fake-slurm-base:latest +# FROM naturalhpc/cerulean-fake-slurm-base:latest + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@3.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@20-11 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@20-11) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@3.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@21-08 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@21-08) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@3.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@22-05 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@22-05) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@3.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@23-02 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@23-02) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@4.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@23-11 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@23-11) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@4.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@24-05 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@24-05) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@4.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@24-11 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@24-11) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install intel-oneapi-mpi@2021.14.0 target=zen2 + +# RUN . /opt/spack/share/spack/setup-env.sh && \ +# . 
$(spack location -i lmod)/lmod/lmod/init/bash && \ +# spack install mpich+slurm pmi=pmix ^pmix@3.2.3 + +COPY integration_test/fake_cluster/cgroup.conf /etc/slurm/cgroup.conf + +# Disable ssh debug output +RUN sed -i -e 's/^LogLevel DEBUG3$//' /etc/ssh/sshd_config +RUN sed -i -e 's^Subsystem sftp /usr/lib/openssh/sftp-server -l DEBUG3^Subsystem sftp /usr/lib/openssh/sftp-server^' /etc/ssh/sshd_config + + +RUN apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /home/cerulean + diff --git a/integration_test/fake_cluster/__init__.py b/integration_test/fake_cluster/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/integration_test/fake_cluster/cgroup.conf b/integration_test/fake_cluster/cgroup.conf new file mode 100644 index 00000000..4c11eb00 --- /dev/null +++ b/integration_test/fake_cluster/cgroup.conf @@ -0,0 +1,6 @@ +IgnoreSystemd=yes +CgroupPlugin=cgroup/v1 +ConstrainSwapSpace=no +ConstrainCores=yes +# ConstrainDevices=yes + diff --git a/integration_test/fake_cluster/old.Dockerfile b/integration_test/fake_cluster/old.Dockerfile new file mode 100644 index 00000000..9da30db9 --- /dev/null +++ b/integration_test/fake_cluster/old.Dockerfile @@ -0,0 +1,50 @@ +FROM ghcr.io/naturalhpc/cerulean-fake-slurm-base-old:latest +# FROM naturalhpc/cerulean-fake-slurm-base-old:latest + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@2.0.0 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@17-02 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@17-02) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@2.0.0 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@17-11 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@17-11) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@2.0.0 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@18-08 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@18-08) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@2.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@19-05 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@19-05) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install openmpi@2.1.6 +legacylaunchers +pmi schedulers=slurm \ + ^$(spack find --deps slurm@20-02 | grep pmix | tr -d ' ') \ + ^$(spack find --format "slurm/{hash}" slurm@20-02) + +RUN . /opt/spack/share/spack/setup-env.sh && \ + . $(spack location -i lmod)/lmod/lmod/init/bash && \ + spack install intel-oneapi-mpi@2021.14.0 + +# RUN . /opt/spack/share/spack/setup-env.sh && \ +# . 
$(spack location -i lmod)/lmod/lmod/init/bash && \ +# spack install mpich+slurm pmi=pmix ^pmix@3.2.3 + +# Disable ssh debug output +RUN sed -i -e 's/^LogLevel DEBUG3$//' /etc/ssh/sshd_config +RUN sed -i -e 's^Subsystem sftp /usr/lib/openssh/sftp-server -l DEBUG3^Subsystem sftp /usr/lib/openssh/sftp-server^' /etc/ssh/sshd_config + + +RUN apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /home/cerulean + diff --git a/libmuscle/python/libmuscle/errors.py b/libmuscle/python/libmuscle/errors.py new file mode 100644 index 00000000..9e819602 --- /dev/null +++ b/libmuscle/python/libmuscle/errors.py @@ -0,0 +1,2 @@ +class ConfigurationError(Exception): + """Signals an issue with the user's configuration.""" diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py index 8d06c45e..51a7a67f 100644 --- a/libmuscle/python/libmuscle/manager/instance_manager.py +++ b/libmuscle/python/libmuscle/manager/instance_manager.py @@ -8,14 +8,17 @@ from ymmsl import Configuration, Reference +from libmuscle.errors import ConfigurationError from libmuscle.manager.instance_registry import InstanceRegistry from libmuscle.manager.instantiator import ( CancelAllRequest, CrashedResult, InstantiatorRequest, - InstantiationRequest, ProcessStatus, ShutdownRequest) + InstantiationRequest, Process, ProcessStatus, ShutdownRequest) from libmuscle.manager.logger import last_lines -from libmuscle.manager.qcgpj_instantiator import Process, QCGPJInstantiator +# from libmuscle.manager.qcgpj_instantiator import QCGPJInstantiator from libmuscle.manager.run_dir import RunDir -from libmuscle.planner.planner import Planner, Resources +from libmuscle.native_instantiator.native_instantiator import NativeInstantiator +from libmuscle.planner.planner import Planner, ResourceAssignment +from libmuscle.planner.resources import Resources _logger = logging.getLogger(__name__) @@ -61,7 +64,7 @@ class InstanceManager: def __init__( self, configuration: Configuration, run_dir: RunDir, instance_registry: InstanceRegistry) -> None: - """Create a ProcessManager. + """Create an InstanceManager. Args: configuration: The global configuration @@ -77,16 +80,33 @@ def __init__( self._results_in: Queue[_ResultType] = Queue() self._log_records_in: Queue[logging.LogRecord] = Queue() + # TODO: Instantiator factory function + # TODO: Add argument that specifies whether to use QCG or not + ''' self._instantiator = QCGPJInstantiator( self._resources_in, self._requests_out, self._results_in, self._log_records_in, self._run_dir.path) + ''' + self._instantiator = NativeInstantiator( + self._resources_in, self._requests_out, self._results_in, + self._log_records_in, self._run_dir.path) self._instantiator.start() self._log_handler = LogHandlingThread(self._log_records_in) self._log_handler.start() - self._allocations: Optional[Dict[Reference, Resources]] = None - self._planner = Planner(self._resources_in.get()) + self._allocations: Optional[Dict[Reference, ResourceAssignment]] = None + + resources = self._resources_in.get() + _logger.debug(f'Got resources {resources}') + if isinstance(resources, CrashedResult): + msg = ( + 'Instantiator crashed. 
This should not happen, please file a bug' + ' report.') + _logger.error(msg) + raise RuntimeError(msg) from resources.exception + + self._planner = Planner(resources) self._num_running = 0 def set_manager_location(self, location: str) -> None: @@ -104,7 +124,7 @@ def start_all(self) -> None: """Starts all the instances of the model.""" self._allocations = self._planner.allocate_all(self._configuration) for instance, resources in self._allocations.items(): - _logger.info(f'Planned {instance} on {resources}') + _logger.info(f'Planned {instance} on {resources.as_resources()}') components = {c.name: c for c in self._configuration.model.components} for instance, resources in self._allocations.items(): @@ -127,11 +147,11 @@ def start_all(self) -> None: instance, implementation, self._configuration.resources[component.name], resources, idir, workdir, stdout_path, stderr_path) - _logger.info(f'Instantiating {instance} on {resources}') + _logger.info(f'Instantiating {instance}') self._requests_out.put(request) self._num_running += 1 - def get_resources(self) -> Dict[Reference, Resources]: + def get_resources(self) -> Dict[Reference, ResourceAssignment]: """Returns the resources allocated to each instance. Only call this after start_all() has been called, or it will raise @@ -142,8 +162,7 @@ def get_resources(self) -> Dict[Reference, Resources]: """ if self._allocations is None: raise RuntimeError( - 'Tried to get resources but we are running without' - ' --start-all') + 'Tried to get resources but we are running without --start-all') return self._allocations @@ -164,9 +183,12 @@ def cancel_all() -> None: result = self._results_in.get() if isinstance(result, CrashedResult): - _logger.error( - 'Instantiator crashed. This should not happen, please file' - ' a bug report.') + if isinstance(result.exception, ConfigurationError): + _logger.error(str(result.exception)) + else: + _logger.error( + 'Instantiator crashed. This should not happen, please file' + ' a bug report.') return False results.append(result) diff --git a/libmuscle/python/libmuscle/manager/instantiator.py b/libmuscle/python/libmuscle/manager/instantiator.py index 9afca712..e29e48c2 100644 --- a/libmuscle/python/libmuscle/manager/instantiator.py +++ b/libmuscle/python/libmuscle/manager/instantiator.py @@ -1,13 +1,14 @@ import enum import logging import multiprocessing as mp +import os from pathlib import Path import traceback -from typing import Optional +from typing import Dict, Optional from ymmsl import Implementation, Reference, ResourceRequirements -from libmuscle.planner.planner import Resources +from libmuscle.planner.planner import ResourceAssignment class ProcessStatus(enum.Enum): @@ -39,7 +40,7 @@ class Process: exit_code: Exit code, if status is ERROR error_msg: Error message, if status is ERROR """ - def __init__(self, instance: Reference, resources: Resources) -> None: + def __init__(self, instance: Reference, resources: ResourceAssignment) -> None: """Create a Process object. 
Args: @@ -71,12 +72,17 @@ class InstantiationRequest(InstantiatorRequest): Attributes: instance: The name of the instance implementation: The implementation to start for it - resources: The resources to start it on + res_req: The resource requirements for this instance + resources: The specific resources to start it on + instance_dir: The main directory for this instance + work_dir: The directory in which to start it + stdout_path: Path of file to redirect stdout to + stderr_path: Path of file to redirect stderr to """ def __init__( self, instance: Reference, implementation: Implementation, - res_req: ResourceRequirements, resources: Resources, instance_dir: - Path, work_dir: Path, stdout_path: Path, stderr_path: Path + res_req: ResourceRequirements, resources: ResourceAssignment, + instance_dir: Path, work_dir: Path, stdout_path: Path, stderr_path: Path ) -> None: """Create an InstantiationRequest. @@ -84,7 +90,7 @@ def __init__( instance: The name of the instance implementation: The implementation to start for it res_req: The resource requirements for this instance - resources: The resources to instantiate on + resources: The specific resources to instantiate on instance_dir: The main directory for this instance work_dir: The directory in which to start it stdout_path: Path of file to redirect stdout to @@ -107,7 +113,8 @@ class CancelAllRequest(InstantiatorRequest): class CrashedResult: """Signals that the instantiator process crashed.""" - pass + def __init__(self, exception: Optional[BaseException] = None) -> None: + self.exception = exception class QueueingLogHandler(logging.Handler): @@ -133,3 +140,43 @@ def emit(self, record: logging.LogRecord) -> None: record.exc_info = None self._queue.put(record) + + +def reconfigure_logging(queue: mp.Queue) -> None: + """Reconfigure logging to send to queue. + + This reconfigures the logging subsystem to intercept all log + messages and send them to the given queue, rather than to the + previously configured handler. + """ + root_logger = logging.getLogger() + for h in list(root_logger.handlers): + root_logger.removeHandler(h) + + handler = QueueingLogHandler(queue) + root_logger.addHandler(handler) + + +def create_instance_env( + instance: Reference, overlay: Dict[str, str]) -> Dict[str, str]: + """Creates an environment for an instance. + + This takes the current (manager) environment variables and makes + a copy, then adds or extends it according to the overlay given. + + Keys from overlay that start with + will have the corresponding + value appended to the matching (by key, without the +) value in + env, otherwise the value in env gets overwritten. 
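+
+    For example, {'+PATH': ':/opt/bin'} appends ':/opt/bin' to the existing PATH,
+    while {'FOO': 'bar'} sets FOO to 'bar' regardless of any previous value.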
+ """ + env = os.environ.copy() + env['MUSCLE_INSTANCE'] = str(instance) + + for key, value in overlay.items(): + if key.startswith('+'): + if key[1:] in env: + env[key[1:]] += value + else: + env[key[1:]] = value + else: + env[key] = value + return env diff --git a/libmuscle/python/libmuscle/manager/profile_store.py b/libmuscle/python/libmuscle/manager/profile_store.py index 036dea85..3ee262b8 100644 --- a/libmuscle/python/libmuscle/manager/profile_store.py +++ b/libmuscle/python/libmuscle/manager/profile_store.py @@ -5,7 +5,7 @@ from threading import Thread from typing import cast, Dict, Iterable, List, Optional, Tuple -from libmuscle.planner.planner import Resources +from libmuscle.planner.planner import ResourceAssignment from libmuscle.profiling import ProfileEvent, ProfileEventType from libmuscle.manager.profile_database import ProfileDatabase from ymmsl import Operator, Reference @@ -77,7 +77,7 @@ def store_instances( cur.execute("COMMIT") cur.close() - def store_resources(self, resources: Dict[Reference, Resources]) -> None: + def store_resources(self, resources: Dict[Reference, ResourceAssignment]) -> None: """Store resource assignments into the database. Args: @@ -90,9 +90,9 @@ def store_resources(self, resources: Dict[Reference, Resources]) -> None: instance_oid = self._get_instance_oid(cur, instance_id) tuples = [ - (instance_oid, node, core) - for node, cores in res.cores.items() - for core in cores] + (instance_oid, node.node_name, core.cid) + for node in res.as_resources() + for core in node.cpu_cores] cur.executemany( "INSERT INTO assigned_cores (instance_oid, node, core)" diff --git a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py index 9b5836d4..f54e96e2 100644 --- a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py +++ b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py @@ -26,9 +26,9 @@ from ymmsl import ExecutionModel, MPICoresResReq, Reference, ThreadedResReq from libmuscle.manager.instantiator import ( - CancelAllRequest, CrashedResult, InstantiationRequest, Process, - ProcessStatus, QueueingLogHandler, ShutdownRequest) -from libmuscle.planner.planner import Resources + CancelAllRequest, CrashedResult, create_instance_env, InstantiationRequest, + Process, ProcessStatus, reconfigure_logging, ShutdownRequest) +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources, Resources _logger = logging.getLogger(__name__) @@ -95,7 +95,7 @@ class QCGPJInstantiator(mp.Process): def __init__( self, resources: mp.Queue, requests: mp.Queue, results: mp.Queue, log_records: mp.Queue, run_dir: Path) -> None: - """Create a QCGPJProcessManager. + """Create a QCGPJInstantiator. 
Args: resources: Queue for returning the available resources @@ -103,7 +103,7 @@ def __init__( results: Queue to communicate finished processes over log_messages: Queue to push log messages to """ - super().__init__(name='QCGPJProcessManager') + super().__init__(name='QCGPJInstantiator') self._resources_out = resources self._requests_in = requests self._results_out = results @@ -120,7 +120,7 @@ def run(self) -> None: qcgpj_dir.mkdir(exist_ok=True) os.chdir(qcgpj_dir) - self._reconfigure_logging() + reconfigure_logging(self._log_records_out) # Executor needs to be instantiated before we go async qcg_config: Dict[str, str] = {qcg_Config.AUX_DIR: str(qcgpj_dir)} @@ -196,20 +196,15 @@ async def _main(self) -> None: _logger.debug('Stopping executor') await self._executor.stop() - def _reconfigure_logging(self) -> None: - """Reconfigure logging to send to log_records_out.""" - root_logger = logging.getLogger() - for h in list(root_logger.handlers): - root_logger.removeHandler(h) - - handler = QueueingLogHandler(self._log_records_out) - root_logger.addHandler(handler) - def _send_resources(self) -> None: """Converts and sends QCG available resources.""" - resources = Resources() + resources = Resources([]) for node in self._qcg_resources.nodes: - resources.cores[node.name] = {int(n.split(',')[0]) for n in node.free_ids} + cs = CoreSet([ + Core(cid, set(map(int, hwthreads_str.split(',')))) + for cid, hwthreads_str in enumerate(node.free_ids)]) + nr = OnNodeResources(node.name, cs) + resources.add_node(nr) self._resources_out.put(resources) @@ -245,9 +240,10 @@ def _create_job( qcg_resources_type: qcg_ResourcesType ) -> Tuple[qcg_Allocation, qcg_SchedulingIteration]: """Creates a QCG allocation and job for a request.""" - total_cores = sum(map(len, request.resources.cores.values())) + total_cores = sum([ + nres.total_cores() for nres in request.resources.by_rank]) - env = self._create_env(request.instance, request.implementation.env) + env = create_instance_env(request.instance, request.implementation.env) if request.implementation.script: execution = self._qcg_job_execution_with_script(request, env) @@ -263,38 +259,18 @@ def _create_job( resources=resources) qcg_allocation = qcg_Allocation() - for node_name, cores in request.resources.cores.items(): - qcg_cores = [str(i) for i in cores] + res = request.resources.as_resources() + for node in res: + qcg_cores = [ + ','.join(map(str, core.hwthreads)) + for core in node.cpu_cores] qcg_allocation.add_node( - qcg_NodeAllocation(qcg_Node(node_name), qcg_cores, {})) + qcg_NodeAllocation(qcg_Node(node.node_name), qcg_cores, {})) sjob = qcg_SchedulingJob(self._state_tracker, qcg_job) qcg_iteration = qcg_SchedulingIteration(sjob, None, None, resources, []) return qcg_allocation, qcg_iteration - def _create_env( - self, instance: Reference, overlay: Dict[str, str] - ) -> Dict[str, str]: - """Updates the environment with the implementation's env. - - This updates env in-place. Keys from overlay that start with - + will have the corresponding value appended to the matching - (by key, without the +) value in env, otherwise the value in - env gets overwritten. 
- """ - env = os.environ.copy() - env['MUSCLE_INSTANCE'] = str(instance) - - for key, value in overlay.items(): - if key.startswith('+'): - if key[1:] in env: - env[key[1:]] += value - else: - env[key[1:]] = value - else: - env[key] = value - return env - def _qcg_job_execution_with_script( self, request: InstantiationRequest, env: Dict[str, str] ) -> qcg_JobExecution: @@ -315,16 +291,19 @@ def _qcg_job_execution_with_script( rank_file = request.instance_dir / 'rankfile' with rank_file.open('w') as f: i = 0 - for node, cores in request.resources.cores.items(): - for c in sorted(cores): - f.write(f'rank {i}={node} slot={c}\n') + res = request.resources.as_resources() + for node in res: + for cid in sorted([c.cid for c in node.cpu_cores]): + f.write(f'rank {i}={node.node_name} slot={cid}\n') i += 1 env['MUSCLE_OPENMPI_RANK_FILE'] = str(rank_file) # IntelMPI support mpi_res_args = list() - for node, cores in request.resources.cores.items(): - mpi_res_args.extend(['-host', node, '-n', str(len(cores))]) + res = request.resources.as_resources() + for node in res: + mpi_res_args.extend([ + '-host', node.node_name, '-n', str(node.total_cores())]) env['MUSCLE_INTELMPI_RESOURCES'] = ' '.join(mpi_res_args) # General environment @@ -346,7 +325,7 @@ def _qcg_job_execution_normal( qcg_resources_type: qcg_ResourcesType) -> qcg_JobExecution: """Create a JobExecution for a normal description.""" impl = request.implementation - total_cores = sum(map(len, request.resources.cores.values())) + total_cores = request.resources.as_resources().total_cores() if impl.execution_model == ExecutionModel.DIRECT: env['OMP_NUM_THREADS'] = str(total_cores) diff --git a/libmuscle/python/libmuscle/manager/test/test_profile_database.py b/libmuscle/python/libmuscle/manager/test/test_profile_database.py index 2d6d472c..b72c964a 100644 --- a/libmuscle/python/libmuscle/manager/test/test_profile_database.py +++ b/libmuscle/python/libmuscle/manager/test/test_profile_database.py @@ -2,10 +2,12 @@ from libmuscle.manager.profile_database import ProfileDatabase from libmuscle.manager.profile_store import ProfileStore -from libmuscle.planner.planner import Resources +from libmuscle.planner.planner import ResourceAssignment from libmuscle.profiling import ( ProfileEvent, ProfileEventType, ProfileTimestamp) +from libmuscle.test.conftest import on_node_resources as onr + from ymmsl import Operator, Port, Reference import pytest @@ -21,13 +23,11 @@ def db_file(tmp_path) -> Path: store.store_instances([Reference('instance1'), Reference('instance2')]) - resources1 = Resources({ - 'node001': {0, 1}, - 'node002': {0, 1}}) + resources1 = ResourceAssignment([ + onr('node001', {0, 1}), onr('node002', {0, 1})]) - resources2 = Resources({ - 'node001': {0}, - 'node002': {0, 1, 2}}) + resources2 = ResourceAssignment([ + onr('node001', {0}), onr('node002', {0, 1, 2})]) store.store_resources({ Reference('instance1'): resources1, diff --git a/libmuscle/python/libmuscle/mcp/protocol.py b/libmuscle/python/libmuscle/mcp/protocol.py index 5d1217ed..b6f662a2 100644 --- a/libmuscle/python/libmuscle/mcp/protocol.py +++ b/libmuscle/python/libmuscle/mcp/protocol.py @@ -8,10 +8,10 @@ class RequestType(Enum): Call protocol in which a request is sent to the server and a response is sent back to the calling client. In MCP, both of these are chunks of bytes. - The MUSCLE Manager Protocol and MUSCLE Peer Protocol define the encoded - messages sent in those chunks, using MsgPack encoding. 
To distinguish - different kinds of requests, a request type identifier is used, as - represented by this class. + The MUSCLE Manager Protocol, MUSCLE Peer Protocol and MUSCLE Agent Protocol + define the encoded messages sent in those chunks, using MsgPack encoding. + To distinguish different kinds of requests, a request type identifier is + used, as represented by this class. """ # MUSCLE Manager Protocol REGISTER_INSTANCE = 1 @@ -26,6 +26,11 @@ class RequestType(Enum): # MUSCLE Peer Protocol GET_NEXT_MESSAGE = 21 + # MUSCLE Agent Protocol + REPORT_RESOURCES = 41 + GET_COMMAND = 42 + REPORT_RESULT = 43 + class ResponseType(Enum): """Identifier for different types of response @@ -37,3 +42,14 @@ class ResponseType(Enum): SUCCESS = 0 ERROR = 1 PENDING = 2 + + +class AgentCommandType(Enum): + """Identifier for different types of commands + + These are requested from the manager by the agent, and tell it what to do. Part + of the MUSCLE Agent Protocol, used in the response to RequestType.GET_COMMAND. + """ + START = 1 + CANCEL_ALL = 2 + SHUTDOWN = 3 diff --git a/libmuscle/python/libmuscle/native_instantiator/__init__.py b/libmuscle/python/libmuscle/native_instantiator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/__init__.py b/libmuscle/python/libmuscle/native_instantiator/agent/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py b/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py new file mode 100644 index 00000000..a85f2096 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/agent/__main__.py @@ -0,0 +1,181 @@ +import logging +import os +import psutil +from socket import gethostname +import sys +from time import sleep +from typing import Dict, Set + +from libmuscle.native_instantiator.process_manager import ProcessManager +from libmuscle.native_instantiator.agent.map_client import MAPClient +from libmuscle.native_instantiator.agent.agent_commands import ( + CancelAllCommand, ShutdownCommand, StartCommand) +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources + + +_logger = logging.getLogger(__name__) + + +class Agent: + """Runs on a compute node and starts processes there.""" + def __init__(self, node_name: str, server_location: str) -> None: + """Create an Agent. 
+ + Args: + node_name: Name (hostname) of this node + server_location: Network location of the manager's MAP server to connect to + """ + _logger.info(f'Agent at {node_name} starting') + + self._process_manager = ProcessManager() + + self._node_name = node_name + + _logger.info(f'Connecting to manager at {server_location}') + self._server = MAPClient(self._node_name, server_location) + _logger.info('Connected to manager') + + def run(self) -> None: + """Execute commands and monitor processes.""" + _logger.info('Reporting resources') + self._server.report_resources(self._inspect_resources()) + + shutting_down = False + while not shutting_down: + command = self._server.get_command() + if isinstance(command, StartCommand): + _logger.info(f'Starting process {command.name}') + _logger.debug(f'Args: {command.args}') + _logger.debug(f'Env: {command.env}') + + self._process_manager.start( + command.name, command.work_dir, command.args, command.env, + command.stdout, command.stderr) + elif isinstance(command, CancelAllCommand): + _logger.info('Cancelling all instances') + self._process_manager.cancel_all() + + elif isinstance(command, ShutdownCommand): + # check that nothing is running + shutting_down = True + _logger.info('Agent shutting down') + + finished = self._process_manager.get_finished() + if finished: + for name, exit_code in finished: + _logger.info(f'Process {name} finished with exit code {exit_code}') + self._server.report_result(finished) + + sleep(0.1) + + def _inspect_resources(self) -> OnNodeResources: + """Inspect the node to find resources and report on them. + + The terminology for identifying processors gets very convoluted, with Linux, + Slurm, OpenMPI and IntelMPI all using different terms, or sometimes the same + terms for different things. See the comment in planner/resources.py for what is + what and how we use it. + + Returns: + An OnNodeResources object describing this node and the CPU cores available on it. + """ + if hasattr(os, 'sched_getaffinity'): + hwthreads_by_core: Dict[int, Set[int]] = dict() + + # these are the logical hwthread ids that we can use + hwthread_ids = list(os.sched_getaffinity(0)) + + for i in hwthread_ids: + with open(f'/sys/devices/system/cpu/cpu{i}/topology/core_id', 'r') as f: + # this gets the logical core id for the hwthread + core_id = int(f.read()) + hwthreads_by_core.setdefault(core_id, set()).add(i) + + cores = CoreSet(( + Core(core_id, hwthreads) + for core_id, hwthreads in hwthreads_by_core.items())) + + else: + # macOS doesn't support thread affinity, but older Macs with Intel + # processors do have SMT. Getting the hwthread to core mapping is not so + # easy, and if we're running on macOS then we're not on a cluster and don't + # do binding anyway. So we're going to get the number of hwthreads and the + # number of cores here, and synthesise a mapping that may be wrong, but will + # at least represent the number of cores and threads per core correctly.
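+ # For example, on a hypothetical machine with 8 hwthreads and 4 cores this
+ # synthesises Core(0, {0, 1}), Core(1, {2, 3}), Core(2, {4, 5}) and
+ # Core(3, {6, 7}): possibly not the real topology, but the counts are right.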
+ nhwthreads = psutil.cpu_count(logical=True) + ncores = psutil.cpu_count(logical=False) + + if nhwthreads is None: + if ncores is not None: + _logger.warning( + 'Could not determine number of hwthreads, assuming no SMT') + nhwthreads = ncores + else: + _logger.warning( + 'Could not determine CPU configuration, assuming a single' + ' core') + ncores = 1 + nhwthreads = 1 + elif ncores is None: + _logger.warning( + 'Could not determine number of cores, assuming no SMT') + ncores = nhwthreads + + hwthreads_per_core = nhwthreads // ncores + + if ncores * hwthreads_per_core != nhwthreads: + # As far as I know, there are no Macs with heterogeneous SMT, like in + # the latest Intel CPUs. + _logger.warning( + 'Only some cores seem to have SMT, core ids are probably' + ' wrong. If this is a cluster then this will cause problems,' + ' please report an issue on GitHub and report the machine and' + ' what kind of OS and hardware it has. If we\'re running on a' + ' local machine, then this won\'t affect the run, but I\'d' + ' still appreciate an issue, because it is unexpected for sure.' + ) + + cores = CoreSet(( + Core( + cid, + set(range( + cid * hwthreads_per_core, (cid + 1) * hwthreads_per_core)) + ) + for cid in range(ncores) + )) + + resources = OnNodeResources(self._node_name, cores) + _logger.info(f'Found resources: {resources}') + return resources + + +def configure_logging(node_name: str, log_level: int) -> None: + """Make us output logs to a custom log file.""" + fmt = '%(asctime)s %(levelname)s %(message)s' + formatter = logging.Formatter(fmt) + + handler = logging.FileHandler(f'muscle3_agent_{node_name}.log', mode='w') + handler.setFormatter(formatter) + + # Find and remove default handler to disable automatic console output + # Testing for 'stderr' in the stringified version is not nice, but + # seems reliable, and doesn't mess up pytest's caplog mechanism while + # it also doesn't introduce a runtime dependency on pytest. 
+ logging.getLogger().handlers = [ + h for h in logging.getLogger().handlers + if 'stderr' not in str(h)] + + logging.getLogger().addHandler(handler) + + logging.getLogger().setLevel(log_level) + + +if __name__ == '__main__': + node_name = gethostname() + server_location = sys.argv[1] + log_level = int(sys.argv[2]) + + configure_logging(node_name, log_level) + + agent = Agent(node_name, server_location) + agent.run() diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/agent_commands.py b/libmuscle/python/libmuscle/native_instantiator/agent/agent_commands.py new file mode 100644 index 00000000..56a830d1 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/agent/agent_commands.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + + +class AgentCommand: + pass + + +@dataclass +class StartCommand(AgentCommand): + name: str + work_dir: Path + args: List[str] + env: Dict[str, str] + stdout: Path + stderr: Path + + +class CancelAllCommand(AgentCommand): + pass + + +class ShutdownCommand(AgentCommand): + pass diff --git a/libmuscle/python/libmuscle/native_instantiator/agent/map_client.py b/libmuscle/python/libmuscle/native_instantiator/agent/map_client.py new file mode 100644 index 00000000..e402b29f --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/agent/map_client.py @@ -0,0 +1,99 @@ +from pathlib import Path +from typing import Any, List, Optional, Tuple + +import msgpack + +from libmuscle.mcp.protocol import AgentCommandType, RequestType, ResponseType +from libmuscle.mcp.tcp_transport_client import TcpTransportClient +from libmuscle.native_instantiator.agent.agent_commands import ( + AgentCommand, StartCommand, CancelAllCommand, ShutdownCommand) +from libmuscle.planner.resources import OnNodeResources + + +class MAPClient: + """The client for the MUSCLE Agent Protocol. + + This class connects to the AgentManager and communicates with it. + """ + def __init__(self, node_name: str, location: str) -> None: + """Create a MAPClient + + Args: + node_name: Name (hostname) of the local node + location: A connection string of the form hostname:port + """ + self._node_name = node_name + self._transport_client = TcpTransportClient(location) + + def close(self) -> None: + """Close the connection + + This closes the connection. After this no other member functions can be called. + """ + self._transport_client.close() + + def report_resources(self, resources: OnNodeResources) -> None: + """Report local resources + + Args: + resources: Description of the resources on this node + """ + enc_cpu_resources = [[c.cid] + list(c.hwthreads) for c in resources.cpu_cores] + request = [ + RequestType.REPORT_RESOURCES.value, + resources.node_name, {'cpu': enc_cpu_resources}] + self._call_agent_manager(request) + + def get_command(self) -> Optional[AgentCommand]: + """Get a command from the agent manager. + + Returns: + A command, or None if there are no commands pending. 
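+
+ For example, a decoded start command is a list of the form
+ [AgentCommandType.START.value, name, work_dir, args, env, stdout, stderr],
+ mirroring what MAPServer.deposit_command() encodes on the manager side.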
+ """ + request = [RequestType.GET_COMMAND.value, self._node_name] + response = self._call_agent_manager(request) + + if response[0] == ResponseType.PENDING.value: + return None + else: + command = msgpack.unpackb(response[1], raw=False) + + if command[0] == AgentCommandType.START.value: + name = command[1] + workdir = Path(command[2]) + args = command[3] + env = command[4] + stdout = Path(command[5]) + stderr = Path(command[6]) + + return StartCommand(name, workdir, args, env, stdout, stderr) + + elif command[0] == AgentCommandType.CANCEL_ALL.value: + return CancelAllCommand() + + elif command[0] == AgentCommandType.SHUTDOWN.value: + return ShutdownCommand() + + raise Exception('Unknown AgentCommand') + + def report_result(self, names_exit_codes: List[Tuple[str, int]]) -> None: + """Report results of finished processes. + + Args: + names_exit_codes: A list of names and exit codes of finished processes. + """ + request = [RequestType.REPORT_RESULT.value, names_exit_codes] + self._call_agent_manager(request) + + def _call_agent_manager(self, request: Any) -> Any: + """Call the manager and do en/decoding. + + Args: + request: The request to encode and send + + Returns: + The decoded response + """ + encoded_request = msgpack.packb(request, use_bin_type=True) + response, _ = self._transport_client.call(encoded_request) + return msgpack.unpackb(response, raw=False) diff --git a/libmuscle/python/libmuscle/native_instantiator/agent_manager.py b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py new file mode 100644 index 00000000..37883749 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/agent_manager.py @@ -0,0 +1,224 @@ +import logging +from pathlib import Path +from subprocess import Popen, TimeoutExpired +import sys +from threading import Lock +from time import sleep +from typing import Dict, List, Tuple + +from libmuscle.native_instantiator.agent.agent_commands import ( + CancelAllCommand, StartCommand, ShutdownCommand) +from libmuscle.native_instantiator.iagent_manager import IAgentManager +from libmuscle.native_instantiator.map_server import MAPServer +from libmuscle.native_instantiator.global_resources import global_resources +from libmuscle.planner.resources import OnNodeResources, Resources + + +_logger = logging.getLogger(__name__) + + +class AgentManager(IAgentManager): + """Manage the node agents. + + Each node of our allocated resources gets an agent, which launches and monitors + processes or that node. This class launches those agents across the nodes, + and communicates with them. + + The AgentManager sits in between the NativeInstantiator and the MAPServer. It gets + called by NativeInstantiator with requests for resources and commands to start and + cancel processes on nodes, and it gets called by MAPServer with requests from the + agents. + """ + def __init__(self, agent_dir: Path) -> None: + """Create an AgentManager. + + Create the object, then launch the agents and wait for them to connect and send + information about the available resources. + + Args: + agent_dir: Directory in which agents can write log files. + """ + self._nodes: List[str] = list() + self._resources: Resources = Resources([]) + self._resources_lock = Lock() # protects _nodes and _resources + + self._finished_processes: List[Tuple[str, int]] = list() + self._finished_processes_lock = Lock() + + self._server = MAPServer(self) + self._launch_agents(agent_dir, self._server.get_location()) + + def get_resources(self) -> Resources: + """Return detected resources. 
+ + This returns a list of sets of logical hwthread ids per core, per node. + + Called by NativeInstantiator. + """ + # no need to lock, _resources is already in its final state + return self._resources + + def start( + self, node_name: str, name: str, work_dir: Path, args: List[str], + env: Dict[str, str], stdout: Path, stderr: Path) -> None: + """Start a process on a node. + + The files that the output is directed to will be overwritten if they already + exist. + + Args: + node_name: Name of the node to run the process on + name: Name under which this process will be known + work_dir: Working directory in which to start + args: Executable and arguments to run + env: Environment variables to set + stdout: File to redirect stdout to + stderr: File to redirect stderr to + """ + command = StartCommand(name, work_dir, args, env, stdout, stderr) + self._server.deposit_command(node_name, command) + + def cancel_all(self) -> None: + """Cancel all processes. + + This tells the agents to stop all running processes they've started. + + Called by NativeInstantiator. + """ + for node_name in self._nodes: + self._server.deposit_command(node_name, CancelAllCommand()) + + def get_finished(self) -> List[Tuple[str, int]]: + """Returns names and exit codes of finished processes. + + This returns all processes that have finished running since the previous call; + each started process will be returned exactly once. The names are the ones + passed to start(). + + Called by NativeInstantiator. + """ + with self._finished_processes_lock: + next_batch = self._finished_processes + self._finished_processes = list() + + return next_batch + + def shutdown(self) -> None: + """Shut down the manager and its agents.""" + command = ShutdownCommand() + for node_name in self._nodes: + self._server.deposit_command(node_name, command) + + try: + self._agents_process.wait(60) + except TimeoutExpired: + _logger.warning( + 'Agents did not shut down within one minute, sending signal...') + self._agents_process.kill() + + try: + self._agents_process.wait(10) + self._agents_stdout.close() + self._agents_stderr.close() + except TimeoutExpired: + _logger.warning('Agents still not down, continuing shutdown anyway.') + + self._server.stop() + + def report_resources(self, resources: OnNodeResources) -> None: + """Report resources found on a node. + + Called by MAPServer from a server thread. + + Args: + resources: Description of a node's resources + """ + _logger.debug(f'Agent reported {resources}') + with self._resources_lock: + self._nodes.append(resources.node_name) + self._resources.add_node(resources) + + def report_result(self, names_exit_codes: List[Tuple[str, int]]) -> None: + """Report results of finished processes. + + Called by MAPServer from a server thread. + + Args: + names_exit_codes: A list of names and exit codes of finished processes. + """ + with self._finished_processes_lock: + self._finished_processes.extend(names_exit_codes) + + def _launch_agents(self, agent_dir: Path, server_location: str) -> None: + """Actually launch the agents. + + This runs a local process, either to start a single agent locally, or on a + cluster to start all of them in one go. 
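+
+ On SLURM this is done by wrapping the agent command via
+ global_resources().agent_launch_command(), which prefixes something like
+ 'srun --nodes=<N> --ntasks=<N> --ntasks-per-node=1' so that a single srun
+ starts exactly one agent per allocated node (see slurm.py below).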
+ + Args: + agent_dir: Working directory for the agents + server_location: MAPServer network location string for the agents to + connect to + """ + _logger.info('Launching MUSCLE agents...') + + python = sys.executable + if not python: + raise RuntimeError( + 'Could not launch agents because sys.executable is not set.') + + log_level = logging.getLogger('libmuscle').getEffectiveLevel() + + args = [ + sys.executable, '-m', 'libmuscle.native_instantiator.agent', + server_location, str(log_level)] + + args = global_resources().agent_launch_command(args) + + self._agents_stdout = (agent_dir / 'agent_launch.out').open('a') + self._agents_stderr = (agent_dir / 'agent_launch.err').open('a') + + _logger.debug(f'Launching agents using {args}') + self._agents_process = Popen( + args, cwd=agent_dir, stdout=self._agents_stdout, + stderr=self._agents_stderr) + + expected_nodes = global_resources().nodes + + resources_complete = False + while not resources_complete: + sleep(0.1) + with self._resources_lock: + resources_complete = len(self._nodes) == len(expected_nodes) + too_many_agents = len(self._nodes) > len(expected_nodes) + + _logger.debug(f'{len(self._nodes)} agents up of {len(expected_nodes)}') + + if self._agents_process.poll() is not None: + msg = ( + 'Agents unexpectedly stopped running. This is not supposed' + ' to happen. Please see the agent log for more information,' + ' and please file an issue on GitHub.') + _logger.error(msg) + raise RuntimeError(msg) + + if too_many_agents: + msg = ( + 'More agents were started than MUSCLE3 asked for. This is not' + ' supposed to happen. Please file an issue on GitHub, with the' + ' SLURM version (use "sbatch -v") and the sbatch command used' + ' to submit the job.') + _logger.error(msg) + raise RuntimeError(msg) + + _logger.info(f'All agents running on {self._nodes}') + + if sorted(expected_nodes) != sorted(self._nodes): + _logger.error( + 'Agent-reported node hostnames do not match what we got from the' + ' resource manager.') + _logger.error( + 'According to the resource manager, we have' + f' {sorted(expected_nodes)}') + _logger.error( + f'The agents are reporting {sorted(self._nodes)}') diff --git a/libmuscle/python/libmuscle/native_instantiator/global_resources.py b/libmuscle/python/libmuscle/native_instantiator/global_resources.py new file mode 100644 index 00000000..ce5ab82c --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/global_resources.py @@ -0,0 +1,91 @@ +from enum import Enum +import logging +from socket import gethostname +from typing import List, Optional + +import psutil + +from libmuscle.native_instantiator.slurm import slurm + + +_logger = logging.getLogger(__name__) + + +class Scheduler(Enum): + NONE = 0 + SLURM = 1 + + +class GlobalResources: + """Detects available compute resources. + + This detects whether we're running locally or in a SLURM allocation, and returns + available resources on request. This class describes all the available resources, + not the ones local to a node. + + Attributes: + scheduler: The HPC scheduler we're running under, if any. + nodes: List of hostnames of available nodes to run on. + logical_cpus_per_node: Number of cores available on each node. + List alongside nodes. + """ + def __init__(self) -> None: + """Create a GlobalResources. + + Detects available resources and initialises the object, which can then be + queried. 
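+
+ For example, in a hypothetical two-node SLURM allocation this could end up with
+ scheduler == Scheduler.SLURM, nodes == ['node001', 'node002'] and
+ logical_cpus_per_node == [16, 16]; when running locally it falls back to the
+ current hostname and psutil's logical CPU count.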
+ """ + if slurm.in_slurm_allocation(): + _logger.info('Detected a SLURM allocation') + self.scheduler = Scheduler.SLURM + self.nodes = slurm.get_nodes() + self.logical_cpus_per_node = slurm.get_logical_cpus_per_node() + _logger.info( + f'We have {len(self.nodes)} nodes and a total of' + f' {sum(self.logical_cpus_per_node)} logical CPUs available') + else: + _logger.info('Running locally without a cluster scheduler') + self.scheduler = Scheduler.NONE + self.nodes = [gethostname()] + self.logical_cpus_per_node = [psutil.cpu_count(logical=True) or 0] + _logger.info( + f'We have {self.logical_cpus_per_node[0]} logical CPUS available') + + def on_cluster(self) -> bool: + """Return whether we're running on a cluster.""" + return self.scheduler != Scheduler.NONE + + def agent_launch_command(self, agent_cmd: List[str]) -> List[str]: + """Return a command for launching one agent on each node. + + Args: + agent_cmd: A command that will start the agent. + """ + if self.scheduler == Scheduler.SLURM: + return slurm.agent_launch_command(agent_cmd, len(self.nodes)) + return agent_cmd + + +_global_resources: Optional[GlobalResources] = None +"""Global resources object. + +This is a singleton, and that's fine because it's created once and then read-only. Also, +it's used in two places, and making two objects logs everything twice which is annoying. +""" + + +def global_resources() -> GlobalResources: + """Wrapper for _global_resources. + + This is here to ensure that the object gets created after we've configured logging, + so that the log output it generates actually ends up in the manager log. + + The users are all in the main thread of the NativeInstantiator background process, + so there's no need for a lock right now. + """ + global _global_resources + + if _global_resources is None: + _global_resources = GlobalResources() + + return _global_resources diff --git a/libmuscle/python/libmuscle/native_instantiator/iagent_manager.py b/libmuscle/python/libmuscle/native_instantiator/iagent_manager.py new file mode 100644 index 00000000..badf6a46 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/iagent_manager.py @@ -0,0 +1,31 @@ +from typing import List, Tuple + +from libmuscle.planner.resources import OnNodeResources + + +class IAgentManager: + """Interface for Agent Managers. + + Only implemented by AgentManager, and only exists to avoid a circular dependency + between AgentManager, MAPServer, and MAPRequestHandler. Ugh. + """ + def report_resources(self, resources: OnNodeResources) -> None: + """Report resources found on a node. + + Called by MAPServer from a server thread. + + Args: + node_name: Id of the node these resources are on + resources: Dict mapping resource type to resource ids + """ + raise NotImplementedError() + + def report_result(self, names_exit_codes: List[Tuple[str, int]]) -> None: + """Report results of finished processes. + + Called by MAPServer from a server thread. + + Args: + names_exit_codes: A list of names and exit codes of finished processes. 
+ """ + raise NotImplementedError() diff --git a/libmuscle/python/libmuscle/native_instantiator/map_server.py b/libmuscle/python/libmuscle/native_instantiator/map_server.py new file mode 100644 index 00000000..87c3f5ca --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/map_server.py @@ -0,0 +1,176 @@ +import errno +import logging +from typing import Any, Dict, cast, List, Optional + +import msgpack + +from libmuscle.mcp.protocol import AgentCommandType, RequestType, ResponseType +from libmuscle.mcp.tcp_transport_server import TcpTransportServer +from libmuscle.mcp.transport_server import RequestHandler +from libmuscle.native_instantiator.agent.agent_commands import ( + AgentCommand, CancelAllCommand, ShutdownCommand, StartCommand) +from libmuscle.native_instantiator.iagent_manager import IAgentManager +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources +from libmuscle.post_office import PostOffice + +from ymmsl import Reference + + +_logger = logging.getLogger(__name__) + + +class MAPRequestHandler(RequestHandler): + """Handles Agent requests.""" + def __init__(self, agent_manager: IAgentManager, post_office: PostOffice) -> None: + """Create a MAPRequestHandler. + + Args: + agent_manager: The AgentManager to forward reports to + post_office: The PostOffice to get commands from + """ + self._agent_manager = agent_manager + self._post_office = post_office + + def handle_request(self, request: bytes) -> bytes: + """Handles an agent request. + + Args: + request: The encoded request + + Returns: + response: An encoded response + """ + req_list = msgpack.unpackb(request, raw=False) + req_type = req_list[0] + req_args = req_list[1:] + if req_type == RequestType.REPORT_RESOURCES.value: + response = self._report_resources(*req_args) + elif req_type == RequestType.GET_COMMAND.value: + response = self._get_command(*req_args) + elif req_type == RequestType.REPORT_RESULT.value: + response = self._report_result(*req_args) + + return cast(bytes, msgpack.packb(response, use_bin_type=True)) + + def _report_resources( + self, node_name: str, data: Dict[str, Any]) -> Any: + """Handle a report resources request. + + This is used by the agent to report available resources on its node when + it starts up. + + Args: + node_name: Name (hostname) of the node + data: Resource dictionary, containing a single key 'cpu' which maps to a + list of cores, where each core is a list of ints, starting with the core + id at index [0] followed by the hwthread ids of all hwthreads in this + core. + """ + cores = CoreSet((Core(ids[0], set(ids[1:])) for ids in data['cpu'])) + node_resources = OnNodeResources(node_name, cores) + self._agent_manager.report_resources(node_resources) + return [ResponseType.SUCCESS.value] + + def _get_command(self, node_name: str) -> Any: + """Handle a get command request. + + This is used by the agent to ask if there's anything we would like it to do. + Command sounds a bit brusque, but we already have the agent sending requests + to this handler, so I needed a different word to distinguish them. Requests + are sent by the agent to the manager (because it's the client in an RPC setup), + commands are returned by the manager to the agent (because it tells it what to + do). 
+ + Args: + node_name: Hostname (name) of the agent's node + """ + node_ref = Reference(node_name.replace('-', '_')) + next_request: Optional[bytes] = None + if self._post_office.have_message(node_ref): + next_request = self._post_office.get_message(node_ref) + + if next_request is not None: + return [ResponseType.SUCCESS.value, next_request] + + return [ResponseType.PENDING.value] + + def _report_result(self, instances: List[List[Any]]) -> Any: + """Handle a report result request. + + This is sent by the agent if an instance it launched exited. + + Args: + instances: List of instance descriptions, comprising an id str and exit + code int. Really a List[Tuple[str, int]] but msgpack doesn't know + about tuples. + """ + self._agent_manager.report_result(list(map(tuple, instances))) + return [ResponseType.SUCCESS.value] + + +class MAPServer: + """The MUSCLE Agent Protocol server. + + This class accepts connections from the agents and services them using a + MAPRequestHandler. + """ + def __init__(self, agent_manager: IAgentManager) -> None: + """Create a MAPServer. + + This starts a TCP Transport server and connects it to a MAPRequestHandler, + which uses the given agent manager to service the requests. By default, we + listen on port 9009, unless it's not available, in which case we use a randomly + chosen other port. + + Args: + agent_manager: AgentManager to forward requests to + """ + self._post_office = PostOffice() + self._handler = MAPRequestHandler(agent_manager, self._post_office) + try: + self._server = TcpTransportServer(self._handler, 9009) + except OSError as e: + if e.errno != errno.EADDRINUSE: + raise + self._server = TcpTransportServer(self._handler) + + def get_location(self) -> str: + """Return this server's network location. + + This is a string of the form tcp:<hostname>:<port>. + """ + return self._server.get_location() + + def stop(self) -> None: + """Stop the server. + + This makes the server stop serving requests, and shuts down its + background threads. + """ + self._server.close() + + def deposit_command(self, node_name: str, command: AgentCommand) -> None: + """Deposit a command for the given agent. + + This takes the given command and queues it for the given agent to pick up next + time it asks us for one.
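+
+ The command is serialised to a MsgPack-encoded list, e.g. a CancelAllCommand
+ becomes [AgentCommandType.CANCEL_ALL.value], and is deposited with the
+ PostOffice under a Reference derived from the node name.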
+ + Args: + node_name: Name of the node whose agent should execute the command + command: The command to send + """ + agent = Reference(node_name.replace('-', '_')) + + if isinstance(command, StartCommand): + command_obj = [ + AgentCommandType.START.value, command.name, str(command.work_dir), + command.args, command.env, str(command.stdout), str(command.stderr) + ] + elif isinstance(command, CancelAllCommand): + command_obj = [AgentCommandType.CANCEL_ALL.value] + elif isinstance(command, ShutdownCommand): + command_obj = [AgentCommandType.SHUTDOWN.value] + + encoded_command = cast(bytes, msgpack.packb(command_obj, use_bin_type=True)) + + self._post_office.deposit(agent, encoded_command) diff --git a/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py new file mode 100644 index 00000000..5c3ecd95 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/native_instantiator.py @@ -0,0 +1,289 @@ +import logging +import multiprocessing as mp +from os import chdir +from pathlib import Path +import queue +import sys +from time import sleep +import traceback +from typing import Dict, List, Optional + +from libmuscle.errors import ConfigurationError +from libmuscle.manager.instantiator import ( + CancelAllRequest, CrashedResult, create_instance_env, InstantiationRequest, + Process, ProcessStatus, reconfigure_logging, ShutdownRequest) +from libmuscle.native_instantiator.agent_manager import AgentManager +from libmuscle.native_instantiator.global_resources import global_resources +from libmuscle.native_instantiator.run_script import make_script, prep_resources +from libmuscle.planner.resources import OnNodeResources, Resources +from ymmsl import MPICoresResReq, MPINodesResReq, ResourceRequirements, ThreadedResReq + + +_logger = logging.getLogger(__name__) + + +class NativeInstantiator(mp.Process): + """Instantiates instances on the local machine.""" + def __init__( + self, resources: mp.Queue, requests: mp.Queue, results: mp.Queue, + log_records: mp.Queue, run_dir: Path) -> None: + """Create a NativeInstantiator + + Args: + resources: Queue for returning the available resources + requests: Queue to take requests from + results: Queue to communicate finished processes over + log_messages: Queue to push log messages to + run_dir: Run directory for the current run + """ + super().__init__(name='NativeInstantiator') + self._resources_out = resources + self._requests_in = requests + self._results_out = results + self._log_records_out = log_records + self._run_dir = run_dir + + self._processes: Dict[str, Process] = dict() + + def run(self) -> None: + """Entry point for the process""" + try: + m3_dir = self._run_dir / 'muscle3' + m3_dir.mkdir(exist_ok=True) + chdir(m3_dir) + + self._agent_manager = AgentManager(m3_dir) + + reconfigure_logging(self._log_records_out) + self._send_resources() + self._main() + + except ConfigurationError as e: + self._results_out.put(CrashedResult(e)) + + except: # noqa + for line in traceback.format_exception(*sys.exc_info()): + _logger.error(line) + + result = CrashedResult(sys.exc_info()[1]) + self._resources_out.put(result) + self._results_out.put(result) + + def _main(self) -> None: + """Main function for the background process. + + This accepts requests for instantiating jobs, stopping them, or shutting down. + Results of finished jobs are returned via the results queue. 
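+
+ Requests arrive on the requests queue as InstantiationRequest, CancelAllRequest
+ or ShutdownRequest objects; after a ShutdownRequest no new instantiations are
+ started, and the loop exits once all remaining processes have been reported.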
+ """ + shutting_down = False + done = False + while not done: + while not shutting_down: + try: + request = self._requests_in.get_nowait() + if isinstance(request, ShutdownRequest): + _logger.debug('Got ShutdownRequest') + shutting_down = True + + elif isinstance(request, CancelAllRequest): + _logger.debug('Got CancelAllRequest') + self._agent_manager.cancel_all() + + elif isinstance(request, InstantiationRequest): + _logger.debug('Got InstantiationRequest') + if not shutting_down: + self._instantiate(request) + + except queue.Empty: + break + + self._report_failed_processes() + self._report_finished_processes() + + if shutting_down: + _logger.debug(f'Remaining processes: {self._processes}') + done = not self._processes + + if not done: + sleep(0.1) + + self._agent_manager.shutdown() + + def _send_resources(self) -> None: + """Detect resources and report them to the manager. + + We have potentially two sources of truth here: the Slurm environment variables + and what the agents report based on what they're bound to. These should be + consistent, but we check that and then try to be conservative to try to not + step outside our bounds even if the cluster doesn't constrain processes to their + assigned processors. + """ + already_logged_smt = False + resources = Resources() + + agent_res = self._agent_manager.get_resources() + + env_ncpus = dict( + zip(global_resources().nodes, global_resources().logical_cpus_per_node) + ) + + for node_name in env_ncpus: + if node_name not in agent_res.nodes(): + _logger.warning( + f'The environment suggests we should have node {node_name},' + ' but no agent reported running on it. We won''t be able' + ' to use this node.') + else: + env_nncpus = env_ncpus[node_name] + ag_nncores = len(agent_res[node_name].cpu_cores) + ag_nnthreads = len(list(agent_res[node_name].hwthreads())) + + if ag_nncores != ag_nnthreads and ag_nnthreads == env_nncpus: + if not already_logged_smt: + _logger.info( + 'Detected SMT (hyperthreading) as available and' + ' enabled. Note that MUSCLE3 will assign whole cores to' + ' each thread or MPI process.') + already_logged_smt = True + + resources.add_node(agent_res[node_name]) + + elif ag_nncores < env_nncpus: + _logger.warning( + f'Node {node_name} should have {env_nncpus} cores' + f' available, but the agent reports only {ag_nncores}' + f' available to it. We\'ll use the {ag_nncores} we seem to' + ' have.') + + resources.add_node(agent_res[node_name]) + + elif env_nncpus < ag_nncores: + _logger.warning( + f'Node {node_name} should have {env_nncpus} cores' + f' available, but the agent reports {ag_nncores} available' + ' to it. Maybe the cluster does not constrain resources?' + f' We\'ll use the {env_nncpus} that we should have got.') + resources.add_node( + OnNodeResources( + node_name, + agent_res[node_name].cpu_cores.get_first_cores( + env_nncpus))) + + else: + # no SMT, agent matches environment + resources.add_node(agent_res[node_name]) + + for node in agent_res: + if node.node_name not in env_ncpus: + _logger.warning( + f'An agent is running on node {node.node_name} but the' + ' environment does not list it as ours. It seems that the' + ' node\'s hostname does not match what SLURM calls it. We will' + ' not use this node, because we\'re not sure it\'s really ours.' 
+ ) + + self._resources_out.put(resources) + + def _instantiate(self, request: InstantiationRequest) -> None: + """Instantiate an implementation according to the request.""" + name = str(request.instance) + + env = create_instance_env(request.instance, request.implementation.env) + self._add_resources(env, request.res_req) + + rankfile = request.instance_dir / 'rankfile' + + if global_resources().on_cluster(): + rankfile_contents, resource_env = prep_resources( + request.implementation.execution_model, request.resources, + rankfile) + + if rankfile_contents: + with rankfile.open('w') as f: + f.write(rankfile_contents) + env['MUSCLE_RANKFILE'] = str(rankfile) + + env.update(resource_env) + + run_script_file = self._write_run_script(request, rankfile) + args = [str(run_script_file)] + + self._processes[name] = Process(request.instance, request.resources) + + _logger.debug(f'Instantiating {name} on {request.resources}') + try: + self._agent_manager.start( + request.resources.by_rank[0].node_name, + name, request.work_dir, args, env, + request.stdout_path, request.stderr_path) + self._processes[name].status = ProcessStatus.RUNNING + + except Exception as e: + _logger.warning(f'Instance {name} failed to start: {e}') + self._processes[name].status = ProcessStatus.ERROR + self._processes[name].error_msg = f'Instance failed to start: {e}' + + def _write_run_script( + self, request: InstantiationRequest, rankfile: Optional[Path]) -> Path: + """Create and write out the run script and return its location.""" + # TODO: Only write out once for each implementation + if request.implementation.script: + run_script = request.implementation.script + else: + run_script = make_script( + request.implementation, request.res_req, + not global_resources().on_cluster(), rankfile) + + run_script_file = request.instance_dir / 'run_script.sh' + + with run_script_file.open('w') as f: + f.write(run_script) + + run_script_file.chmod(0o700) + return run_script_file + + def _add_resources( + self, env: Dict[str, str], res_req: ResourceRequirements) -> None: + """Add resource env vars to the given env.""" + if isinstance(res_req, ThreadedResReq): + num_threads = res_req.threads + elif isinstance(res_req, (MPICoresResReq, MPINodesResReq)): + num_threads = res_req.threads_per_mpi_process + + env['MUSCLE_THREADS'] = str(num_threads) + env['OMP_NUM_THREADS'] = str(num_threads) + + num_mpi_processes: Optional[int] = None + if isinstance(res_req, MPICoresResReq): + num_mpi_processes = res_req.mpi_processes + elif isinstance(res_req, MPINodesResReq): + num_mpi_processes = res_req.nodes * res_req.mpi_processes_per_node + + if num_mpi_processes is not None: + env['MUSCLE_MPI_PROCESSES'] = str(num_mpi_processes) + + def _report_failed_processes(self) -> None: + """Get processes that failed to start and report their status.""" + failed_processes: List[str] = list() + + for name, process in self._processes.items(): + if process.status == ProcessStatus.ERROR: + self._results_out.put(process) + failed_processes.append(name) + + for name in failed_processes: + del self._processes[name] + + def _report_finished_processes(self) -> None: + """Get finished processes and report back their status.""" + for name, exit_code in self._agent_manager.get_finished(): + process = self._processes[name] + if process.status == ProcessStatus.RUNNING: + if exit_code == 0: + process.status = ProcessStatus.SUCCESS + else: + process.status = ProcessStatus.ERROR + process.error_msg = 'Instance returned a non-zero exit code' + process.exit_code = exit_code + 
self._results_out.put(process) + del self._processes[name] diff --git a/libmuscle/python/libmuscle/native_instantiator/process_manager.py b/libmuscle/python/libmuscle/native_instantiator/process_manager.py new file mode 100644 index 00000000..bfd8f3ca --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/process_manager.py @@ -0,0 +1,68 @@ +import logging +from pathlib import Path +from subprocess import Popen +from typing import Dict, List, Tuple + + +_logger = logging.getLogger(__name__) + + +class ProcessManager: + """Manages a set of running processes.""" + def __init__(self) -> None: + """Create a ProcessManager.""" + self._processes: Dict[str, Popen] = dict() + + def start( + self, name: str, work_dir: Path, args: List[str], env: Dict[str, str], + stdout: Path, stderr: Path) -> None: + """Start a process. + + The files that the output is directed to will be overwritten if they already + exist. + + Args: + name: Name under which this process will be known + work_dir: Working directory in which to start + args: Executable and arguments to run + env: Environment variables to set + stdout: File to redirect stdout to + stderr: File to redirect stderr to + + Raises: + RuntimeError: If there is already a process with the given name. + OSError: If the process could not be started. + """ + if name in self._processes: + raise RuntimeError(f'Process {name} already exists') + _logger.debug(f'Starting process {args} with env {env} in {work_dir}') + with stdout.open('w') as out, stderr.open('w') as err: + self._processes[name] = Popen( + args, cwd=work_dir, env=env, stdout=out, stderr=err) + + def cancel_all(self) -> None: + """Stops all running processes. + + This does not wait for them to terminate, it just sends the signal to kill + them. + """ + for process in self._processes.values(): + process.kill() + + def get_finished(self) -> List[Tuple[str, int]]: + """Returns names and exit codes of finished processes. + + This returns all processes that have finished running since the previous call; + each started process will be returned exactly once. + """ + result: List[Tuple[str, int]] = list() + + for name, process in self._processes.items(): + exit_code = process.poll() + if exit_code is not None: + result.append((name, exit_code)) + + for name, _ in result: + del self._processes[name] + + return result diff --git a/libmuscle/python/libmuscle/native_instantiator/run_script.py b/libmuscle/python/libmuscle/native_instantiator/run_script.py new file mode 100644 index 00000000..e566d123 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/run_script.py @@ -0,0 +1,392 @@ +import logging +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Tuple + +from libmuscle.errors import ConfigurationError +from libmuscle.native_instantiator.slurm import slurm +from libmuscle.planner.planner import ResourceAssignment +from ymmsl import ( + ExecutionModel, Implementation, MPICoresResReq, MPINodesResReq, + ResourceRequirements, ThreadedResReq) + + +def direct_prep_resources(resources: ResourceAssignment) -> Tuple[str, Dict[str, str]]: + """Create resources for a non-MPI program with taskset. + + Taskset expects a set of hwthreads on the command line, either as a comma-separated + list or as a hexadecimal mask. We generate both here and set two environment + variables. + + Args: + resources: The resource assignment to describe + + Return: + No rank file, and a set of environment variables. 
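+
+ For example, if the (single) node's hwthreads are {0, 1, 4, 5}, this yields
+ MUSCLE_BIND_LIST='0,1,4,5' and MUSCLE_BIND_MASK='33' (binary 110011).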
+ """ + env: Dict[str, str] = dict() + only_node_hwthreads_list = list(resources.by_rank[0].hwthreads()) + + env['MUSCLE_BIND_LIST'] = ','.join(map(str, only_node_hwthreads_list)) + + mask_int = sum((1 << c for c in only_node_hwthreads_list)) + env['MUSCLE_BIND_MASK'] = format(mask_int, 'X') + + return '', env + + +def openmpi_prep_resources(resources: ResourceAssignment) -> Tuple[str, Dict[str, str]]: + """Create resource description for OpenMPI mpirun + + Args: + resources: The resource assignment to describe + + Return: + The contents of the rankfile, and a set of environment variables + """ + ranklines: List[str] = list() + all_cores = ( + (node_res, ','.join(map(str, sorted(node_res.hwthreads())))) + for node_res in resources.by_rank) + + for i, (node_res, hwthreads) in enumerate(all_cores): + ranklines.append(f'rank {i}={node_res.node_name} slot={hwthreads}') + + rankfile = '\n'.join(ranklines) + '\n' + + return rankfile, dict() + + +def impi_prep_resources(resources: ResourceAssignment) -> Tuple[str, Dict[str, str]]: + """Create resource description for Intel MPI mpirun + + Intel MPI mpirun accepts either one core for each MPI process, or one hwthread. It + cannot bind a process to more than one explicitly specified core or hwthread the way + srun and OpenMPI can. At the moment, we bind each process to one core, and that's + what we do here as well, but this will become a problem for MPI+OpenMP codes. Those + can be pinned to sockets, NUMA domains or caches, which does make sense, so we'll + have to figure that out when we add support. + + Args: + resources: The resource assignment to describe + + Return: + The contents of the machinefile, and a set of environment variables + """ + env: Dict[str, str] = dict() + machine_nodes: List[str] = list() + pin_masks: List[int] = list() + + for rank, res in enumerate(resources.by_rank): + machine_nodes.append(res.node_name) + pin_masks.append(sum((1 << c for c in res.hwthreads()))) + + # coalesce machine lines + proc_counts = [1] * len(machine_nodes) + i = 1 + while i < len(machine_nodes): + if machine_nodes[i-1] == machine_nodes[i]: + del machine_nodes[i] + proc_counts[i-1] += proc_counts[i] + del proc_counts[i] + else: + i += 1 + + machinefile = '\n'.join( + (f'{m}:{c}' for m, c in zip(machine_nodes, proc_counts))) + '\n' + + # disable pinning to SLURM-specified resources + # env['I_MPI_PIN_RESPECT_CPUSET'] = '0' + env['I_MPI_JOB_RESPECT_PROCESS_PLACEMENT'] = 'off' + + # which cores to bind each rank to + pin_masks_str = ','.join(format(mask, '#x') for mask in pin_masks) + env['I_MPI_PIN_DOMAIN'] = f'[{pin_masks_str}]' + + # I_MPI_PIN_DOMAIN=[55,aa] + # pins the first rank to 0,2,16,18 and the second to 1,3,17,19 + # I_MPI_PIN_PROCESSOR_LIST=0,1,5,6 + # pins rank 0 to core 0, rank 1 to core 1, rank 2 to core 5, rank 3 to core 6 + # machinefile: + # host1:2 + # host2:4 + # runs two processes on host1 and four on host2 + return machinefile, env + + +def mpich_prep_resources(resources: ResourceAssignment) -> Tuple[str, Dict[str, str]]: + """Create resource description for MPICH mpirun + + Args: + resources: The resource assignment to describe + + Return: + The contents of the machinefile, and a set of environment variables + """ + # No env vars, but rankfile + raise NotImplementedError() + + +def srun_prep_resources( + resources: ResourceAssignment, rankfile_location: Path + ) -> Tuple[str, Dict[str, str]]: + """Create resource description for srun + + Args: + resources: The resources to describe + rankfile_location: Location where the 
rankfile will be written + + Return: + The contents of the hostfile, and a set of environment variables + """ + hostfile = '\n'.join(( + node_res.node_name for node_res in resources.by_rank + for _ in node_res.hwthreads())) + + env = {'SLURM_HOSTFILE': str(rankfile_location)} + + def core_mask(hwthreads: Iterable[int]) -> str: + mask = sum((1 << hwthread) for hwthread in hwthreads) + return format(mask, '#x') + + bind_str = ','.join([ + core_mask(node_res.hwthreads()) for node_res in resources.by_rank]) + + env['SLURM_CPU_BIND'] = f'verbose,mask_cpu:{bind_str}' + + return hostfile, env + + +def prep_resources( + model: ExecutionModel, resources: ResourceAssignment, rankfile_location: Path + ) -> Tuple[str, Dict[str, str]]: + """Create resource description for the given execution model. + + Args: + model: The execution model to generate a description for + resources: The resource assignment to describe + rankfile_location: Path to where the rankfile will be written + + Return: + The contents of the rank/machine/hostfile, and a set of environment variables. + """ + if model == ExecutionModel.DIRECT: + return direct_prep_resources(resources) + elif model == ExecutionModel.OPENMPI: + return openmpi_prep_resources(resources) + elif model == ExecutionModel.INTELMPI: + return impi_prep_resources(resources) + elif model == ExecutionModel.SRUNMPI: + return srun_prep_resources(resources, rankfile_location) + # elif model == ExecutionModel.MPICH: + # return mpich_prep_resources(resources) + raise RuntimeError( + f'Impossible execution model {model}, please create an issue on GitHub') + + +def num_mpi_tasks(res_req: ResourceRequirements) -> int: + """Determine the number of MPI tasks to be started. + + For non-MPI resource requirements, returns 1. + + Args: + res_req: Resource requirements to analyse. + """ + if isinstance(res_req, ThreadedResReq): + return 1 + elif isinstance(res_req, MPICoresResReq): + return res_req.mpi_processes + elif isinstance(res_req, MPINodesResReq): + return res_req.nodes * res_req.mpi_processes_per_node + raise RuntimeError('Invalid ResourceRequirements') + + +def local_command(implementation: Implementation, enable_debug: bool) -> str: + """Make a format string for the command to run. + + This interprets the execution_model and produces an appropriate shell command to + start the implementation. This function produces commands for running locally: + pinning is disabled and there's only one node. + + Args: + implementation: The implementation to start. + enable_debug: Whether to produce extra debug output. + + Return: + A format string with embedded {ntasks} and {rankfile}. + """ + if implementation.execution_model == ExecutionModel.DIRECT: + fstr = '{command} {args}' + elif implementation.execution_model == ExecutionModel.OPENMPI: + # Native name is orterun for older and prterun for newer OpenMPI. + # So we go with mpirun, which works for either. + fargs = [ + 'mpirun -np $MUSCLE_MPI_PROCESSES', + '--oversubscribe' + ] + + if enable_debug: + fargs.append('-v --debug-daemons --display-map --display-allocation') + + fargs.append('{command} {args}') + + fstr = ' '.join(fargs) + + elif implementation.execution_model == ExecutionModel.INTELMPI: + fstr = 'mpirun -n $MUSCLE_MPI_PROCESSES {command} {args}' + elif implementation.execution_model == ExecutionModel.SRUNMPI: + raise ConfigurationError( + f'Could not start {implementation.name} because the SRUNMPI execution' + ' method only works in a SLURM allocation, and we are running locally.' 
+ ' Please switch this implementation to a different execution method' + ' in the configuration file. You will probably want OPENMPI or' + ' INTELMPI depending on which MPI implementation this code was' + ' compiled with.') + # elif implementation.execution_model == ExecutionModel.MPICH + # fstr = 'mpiexec -n {{ntasks}} {command} {args}' + + if implementation.args is None: + args = '' + elif isinstance(implementation.args, str): + args = implementation.args + elif isinstance(implementation.args, list): + args = ' '.join(implementation.args) + + return fstr.format( + command=implementation.executable, + args=args + ) + + +def cluster_command(implementation: Implementation, enable_debug: bool) -> str: + """Make a format string for the command to run. + + This interprets the execution_model and produces an appropriate shell command to + start the implementation. This function produces commands for running on a cluster, + with processes distributed across nodes and CPU pinning enabled. + + Args: + implementation: The implementation to start. + enable_debug: Whether to produce extra debug output. + + Return: + A string with the command to use to start the implementation. + """ + if implementation.execution_model == ExecutionModel.DIRECT: + fargs = [ + 'if ! taskset -V >/dev/null 2>&1 ; then', + ' {command} {args}', + 'else', + ' taskset $MUSCLE_BIND_MASK {command} {args}', + 'fi' + ] + fstr = '\n'.join(fargs) + + elif implementation.execution_model == ExecutionModel.OPENMPI: + fargs = [ + # Native name is orterun for older and prterun for newer OpenMPI. + # So we go with mpirun, which works for either. + 'mpirun -np $MUSCLE_MPI_PROCESSES', + '--rankfile $MUSCLE_RANKFILE --use-hwthread-cpus --bind-to hwthread', + '--oversubscribe' + ] + + if enable_debug: + fargs.append('-v --display-allocation --display-map --report-bindings') + + if slurm.quirks.overlap: + # This adds the given option to the srun command used by mpirun to launch + # its daemons. mpirun specifies --exclusive, which on SLURM <= 21-08 causes + # SLURM to wait for our agents to quit, as it considers them to be occupying + # the cores, causing a deadlock. Fortunately, it seems that adding --overlap + # overrides the --exclusive and it works. 
+ fargs.append('-mca plm_slurm_args "--overlap"') + + fargs.append('{command} {args}') + + fstr = ' '.join(fargs) + + elif implementation.execution_model == ExecutionModel.INTELMPI: + fargs = [ + 'mpirun -n $MUSCLE_MPI_PROCESSES', + '-machinefile $MUSCLE_RANKFILE'] + + if enable_debug: + fargs.append('-genv I_MPI_DEBUG=4') + + fargs.append('{command} {args}') + + fstr = ' '.join(fargs) + + elif implementation.execution_model == ExecutionModel.SRUNMPI: + fargs = ['srun -n $MUSCLE_MPI_PROCESSES -m arbitrary'] + + if slurm.quirks.overlap: + fargs.append('--overlap') + + verbose = 'verbose,' if enable_debug else '' + + fargs.append(f'{slurm.quirks.cpu_bind}={verbose}$SLURM_CPU_BIND') + fargs.append('{command} {args}') + + fstr = ' '.join(fargs) + + # elif implementation.execution_model == ExecutionModel.MPICH + # fstr = 'mpiexec -n $MUSCLE_MPI_PROCESSES -f $MUSCLE_RANKFILE {command} {args}' + + if implementation.args is None: + args = '' + elif isinstance(implementation.args, str): + args = implementation.args + elif isinstance(implementation.args, list): + args = ' '.join(implementation.args) + + return fstr.format( + command=implementation.executable, + args=args + ) + + +def make_script( + implementation: Implementation, res_req: ResourceRequirements, + local: bool, rankfile: Optional[Path] = None) -> str: + """Make a run script for a given implementation. + + Args: + implementation: The implementation to launch + res_req: The job's resource requirements + local: Whether this is to run locally (True) or on a cluster (False) + rankfile: Location of the rankfile, if any + + Return: + A string with embedded newlines containing the shell script. + """ + enable_debug = logging.getLogger('libmuscle').getEffectiveLevel() <= logging.DEBUG + + lines: List[str] = list() + + lines.append('#!/bin/bash') + lines.append('') + + # The environment is passed when starting the script, rather than as a set of + # export statements here. + + if implementation.modules: + if isinstance(implementation.modules, str): + lines.append(f'module load {implementation.modules}') + else: + for module in implementation.modules: + lines.append(f'module load {module}') + lines.append('') + + if implementation.virtual_env: + lines.append(f'. {implementation.virtual_env}/bin/activate') + lines.append('') + + if local: + lines.append(local_command(implementation, enable_debug)) + else: + lines.append(cluster_command(implementation, enable_debug)) + + lines.append('') + + return '\n'.join(lines) diff --git a/libmuscle/python/libmuscle/native_instantiator/slurm.py b/libmuscle/python/libmuscle/native_instantiator/slurm.py new file mode 100644 index 00000000..dc22d23d --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/slurm.py @@ -0,0 +1,352 @@ +from itertools import product +import logging +import os +from parsimonious import Grammar, NodeVisitor +from parsimonious.nodes import Node +import subprocess +from typing import Any, cast, List, Sequence, Tuple + + +_logger = logging.getLogger(__name__) + + +_node_range_expression_grammar = Grammar( + """ + nre = nre_parts ("," nre_parts)* + nre_parts = nre_part+ + nre_part = identifier ("[" index_set "]")? + index_set = index_range ("," index_range)* + index_range = integer ("-" integer)? + identifier = ~"[a-z 0-9 _-]+"i + integer = padded_int / int + int = ~"[0-9]+" + padded_int = ~"0[0-9]+" + """ + ) + + +class NREVisitor(NodeVisitor): + """Processes a parsed NRE and produces a list of nodes. + + Node range expressions are used by SLURM to describe collections of nodes. 
See + parse_slurm_nodelist() below. + """ + def visit_nre( + self, node: Node, + visited_children: Tuple[List[str], Sequence[Tuple[Any, List[str]]]] + ) -> List[str]: + """Return a list of nodes corresponding to the NRE.""" + nodes = visited_children[0].copy() + for _, more_nodes in visited_children[1]: + nodes.extend(more_nodes) + return nodes + + def visit_nre_parts( + self, node: Node, visited_children: Sequence[Tuple[str, List[str]]] + ) -> List[str]: + """Return a list of node ids for the part.""" + fmt = ''.join([c[0] + '{}' for c in visited_children]) + index_lists = [c[1] for c in visited_children] + return [fmt.format(*idxs) for idxs in product(*index_lists)] + + def visit_nre_part( + self, node: Node, visited_children: Tuple[ + str, Sequence[Tuple[Any, List[str], Any]]] + ) -> Tuple[str, List[str]]: + """Return the identifier part and a list of indexes for the set.""" + identifier = visited_children[0] + if not visited_children[1]: + index_set = [''] + else: + index_set = visited_children[1][0][1] + return identifier, index_set + + def visit_index_set( + self, node: Node, + visited_children: Tuple[List[str], Sequence[Tuple[Any, List[str]]]] + ) -> List[str]: + """Return a list of indexes corresponding to the set.""" + indexes = visited_children[0].copy() + for _, more_indexes in visited_children[1]: + indexes.extend(more_indexes) + return indexes + + def visit_index_range( + self, node: Node, + visited_children: Tuple[ + Tuple[int, int], + Sequence[ + Tuple[Any, Tuple[int, int]] + ]] + ) -> List[str]: + """Return a list of indexes corresponding to the range.""" + + def format_str(width: int) -> str: + if width == -1: + return '{}' + return f'{{:0{width}}}' + + start_value, width = visited_children[0] + if visited_children[1]: + end_value, _ = visited_children[1][0][1] + fmt = format_str(width) + return [fmt.format(i) for i in range(start_value, end_value + 1)] + + fmt = format_str(width) + return [fmt.format(start_value)] + + def visit_identifier(self, node: Node, _: Sequence[Any]) -> str: + return node.text + + def visit_integer( + self, node: Node, visited_children: Sequence[Tuple[int, int]] + ) -> Tuple[int, int]: + """Returns the value of the int, and a field width or -1.""" + return visited_children[0] + + def visit_int(self, node: Node, _: Sequence[Any]) -> Tuple[int, int]: + """Returns the value and a field width of -1.""" + return int(node.text), -1 + + def visit_padded_int(self, node: Node, _: Sequence[Any]) -> Tuple[int, int]: + """Returns the value of the int and the field width.""" + return int(node.text), len(node.text) + + def generic_visit( + self, node: Node, visited_children: Sequence[Any]) -> Sequence[Any]: + return visited_children + + +_nre_visitor = NREVisitor() + + +def parse_slurm_nodelist(s: str) -> List[str]: + """Parse a SLURM node range expression and produce node names. + + Exactly what the syntax is for a "node range expression" isn't entirely + clear. Some examples are given throughout the documentation: + + linux[00-17] + lx[10-20] + tux[2,1-2] + tux[1-2,2] + tux[1-3] + linux[0-64,128] + alpha,beta,gamma + lx[15,18,32-33] + linux[0000-1023] + rack[0-63]_blade[0-41] + + unit[0-31]rack is invalid + + If a range uses leading zeros, then so should the generated indexes. + See _node_range_expression_grammar above for my best guess at the + correct grammar. + + This function takes a string containing an NRE and returns the + corresponding list of node names. 
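+
+ For example, parse_slurm_nodelist('linux[0-2],tux') should return
+ ['linux0', 'linux1', 'linux2', 'tux'], and parse_slurm_nodelist('node[08-10]')
+ should return ['node08', 'node09', 'node10'].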
+ """ + ast = _node_range_expression_grammar.parse(s) + return cast(List[str], _nre_visitor.visit(ast)) + + +_nodes_cores_expression_grammar = Grammar( + """ + nce = nce_run ("," nce_run)* + nce_run = int ("(" run_length ")")? + run_length = "x" int + int = ~"[0-9]+" + """ + ) + + +class NCEVisitor(NodeVisitor): + """Processes a parsed NCE and produces a list of cpu counts per node. + + Nodes cores expressions are used by SLURM to describe cores on a collection of + nodes. See parse_slurm_nodes_cores() below. + """ + def visit_nce( + self, node: Node, + visited_children: Tuple[List[int], Sequence[Tuple[Any, List[int]]]] + ) -> List[int]: + """Return a list of nodes corresponding to the NRE.""" + nodes_cores = visited_children[0].copy() + for _, more_nodes_cores in visited_children[1]: + nodes_cores.extend(more_nodes_cores) + return nodes_cores + + def visit_nce_run( + self, node: Node, + visited_children: Tuple[int, Sequence[Tuple[Any, int, Any]]] + ) -> List[int]: + """Return a list of core counts produced by this run.""" + num_cores = visited_children[0] + result = [num_cores] + + if visited_children[1]: + result *= visited_children[1][0][1] + + return result + + def visit_run_length( + self, node: Node, visited_children: Tuple[str, int]) -> int: + """Return the number of repetitions.""" + return visited_children[1] + + def visit_int(self, node: Node, _: Sequence[Any]) -> int: + """Returns the value as an int""" + return int(node.text) + + def generic_visit( + self, node: Node, visited_children: Sequence[Any]) -> Sequence[Any]: + return visited_children + + +_nce_visitor = NCEVisitor() + + +def parse_slurm_nodes_cores(s: str) -> List[int]: + """Parse a SLURM nodes cores expression and produce node names. + + The sbatch documentation page describes the format under + SLURM_JOB_CPUS_PER_NODE as CPU_count[(xnumber_of_nodes)][,CPU_count + [(xnumber_of_nodes)] ...]. and gives the example of '72(x2),36' describing a set of + three nodes, the first two with 72 cores and the third with 36. + + See _nodes_cores_expression_grammar above for the corresponding grammar. + + This function takes a string containing an NCE and returns the corresponding list of + node names. + """ + ast = _nodes_cores_expression_grammar.parse(s) + return cast(List[int], _nce_visitor.visit(ast)) + + +class SlurmQuirks: + """Collects features of the present SLURM.""" + overlap: bool + """True iff --overlap must be specified for srun.""" + cpu_bind: str + """CPU binding argument, --cpu-bind or --cpu_bind.""" + + +class SlurmInfo: + """Detects and holds information about the present SLURM scheduler.""" + def __init__(self) -> None: + if self.in_slurm_allocation(): + self.version = self._slurm_version() + self.quirks = SlurmQuirks() + + self.quirks.overlap = self.version > (20, 2) + self.quirks.cpu_bind = ( + '--cpu-bind' if self.version > (17, 2) else '--cpu_bind') + + def in_slurm_allocation(self) -> bool: + """Check whether we're in a SLURM allocation. + + Returns true iff SLURM was detected. + """ + return 'SLURM_JOB_ID' in os.environ + + def get_nodes(self) -> List[str]: + """Get a list of node names from SLURM_JOB_NODELIST. + + This inspects SLURM_JOB_NODELIST or SLURM_NODELIST and returns an + expanded list of node names. + + If SLURM_JOB_NODELIST is "node[020-023]" then this returns + ["node020", "node021", "node022", "node023"]. 
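Together with get_logical_cpus_per_node() below, this might be used roughly as
follows (the environment values here are made up for illustration):

```
# Assumed environment inside the allocation:
#   SLURM_JOB_NODELIST=node[020-023]
#   SLURM_JOB_CPUS_PER_NODE=16(x2),8(x2)
from libmuscle.native_instantiator.slurm import slurm

slurm.get_nodes()                   # ['node020', 'node021', 'node022', 'node023']
slurm.get_logical_cpus_per_node()   # [16, 16, 8, 8]
```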
+ """ + nodelist = os.environ.get('SLURM_JOB_NODELIST') + if not nodelist: + nodelist = os.environ.get('SLURM_NODELIST') + if not nodelist: + raise RuntimeError('SLURM_(JOB_)NODELIST not set, are we running locally?') + + _logger.debug(f'SLURM node list: {nodelist}') + + return parse_slurm_nodelist(nodelist) + + def get_logical_cpus_per_node(self) -> List[int]: + """Return the number of logical CPU cores per node. + + This returns a list with the number of cores of each node in the result of + get_nodes(), which gets read from SLURM_JOB_CPUS_PER_NODE. + """ + sjcpn = os.environ.get('SLURM_JOB_CPUS_PER_NODE') + _logger.debug(f'SLURM_JOB_CPUS_PER_NODE: {sjcpn}') + + if sjcpn: + return parse_slurm_nodes_cores(sjcpn) + else: + scon = os.environ.get('SLURM_CPUS_ON_NODE') + _logger.debug(f'SLURM_CPUS_ON_NODE: {scon}') + + snn = os.environ.get('SLURM_JOB_NUM_NODES') + if not snn: + snn = os.environ.get('SLURM_NNODES') + _logger.debug(f'SLURM num nodes: {snn}') + + if scon and snn: + return [int(scon)] * int(snn) + + raise RuntimeError( + 'SLURM_JOB_CPUS_PER_NODE is not set in the environment, and also' + ' SLURM_CPUS_ON_NODE is missing or neither SLURM_JOB_NUM_NODES nor' + ' SLURM_NNODES is set. Please create an issue on GitHub with the output' + ' of "sbatch --version" on this cluster.') + + def agent_launch_command(self, agent_cmd: List[str], nnodes: int) -> List[str]: + """Return a command for launching one agent on each node. + + Args: + agent_cmd: A command that will start the agent. + """ + # TODO: On the latest Slurm, there's a special command for this that we should + # use if we have that, --external-launcher. Poorly documented though, so will + # require some experimentation. + + # On SLURM <= 23-02, the number of tasks is inherited by srun from sbatch rather + # than calculated anew from --nodes and --ntasks-per-node, so we specify it + # explicitly to avoid getting an agent per logical cpu rather than per node. + srun_cmd = [ + 'srun', f'--nodes={nnodes}', f'--ntasks={nnodes}', + '--ntasks-per-node=1' + ] + + if self.quirks.overlap: + srun_cmd.append('--overlap') + + return srun_cmd + agent_cmd + + def _slurm_version(self) -> Tuple[int, int]: + """Obtains current version of SLURM from srun -v. + + This returns only the first two numbers, hopefully there won't be any changes in + behaviour within a release series. + """ + proc = subprocess.run( + ['srun', '--version'], check=True, capture_output=True, text=True, + encoding='utf-8' + ) + + output = proc.stdout.strip().split() + if len(output) < 2: + raise RuntimeError( + f'Unexpected srun version output "{output}". MUSCLE3 does not know' + ' how to run on this version of SLURM. Please file an issue on' + ' GitHub.') + + version_str = output[1] + version = version_str.split('.') + if len(version) < 2: + _logger.error(f'srun produced unexpected version {version_str}') + raise RuntimeError( + f'Unexpected srun version output "{output}". MUSCLE3 does not know' + ' how to run on this version of SLURM. 
Please file an issue on' + ' GitHub.') + return int(version[0]), int(version[1]) + + +slurm = SlurmInfo() diff --git a/libmuscle/python/libmuscle/native_instantiator/test/test_process_manager.py b/libmuscle/python/libmuscle/native_instantiator/test/test_process_manager.py new file mode 100644 index 00000000..93dabcfb --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/test/test_process_manager.py @@ -0,0 +1,120 @@ +from time import monotonic, sleep + +import pytest + +from libmuscle.native_instantiator.process_manager import ProcessManager + + +@pytest.fixture +def lpm(): + return ProcessManager() + + +def _poll_completion(lpm, num_jobs): + completed_jobs = list() + while len(completed_jobs) < num_jobs: + done = lpm.get_finished() + while not done: + sleep(0.1) + done = lpm.get_finished() + completed_jobs.extend(done) + + return completed_jobs + + +def test_run_process(lpm, tmp_path): + lpm.start( + 'test', tmp_path, ['bash', '-c', 'exit 0'], {}, + tmp_path / 'out', tmp_path / 'err') + completed_jobs = _poll_completion(lpm, 1) + assert completed_jobs[0] == ('test', 0) + + +def test_existing_process(lpm, tmp_path): + lpm.start( + 'test', tmp_path, ['bash', '-c', 'exit 0'], {}, + tmp_path / 'out', tmp_path / 'err') + with pytest.raises(RuntimeError): + lpm.start( + 'test', tmp_path, ['bash', '-c', 'exit 0'], {}, + tmp_path / 'out', tmp_path / 'err') + + completed_jobs = _poll_completion(lpm, 1) + + assert completed_jobs[0] == ('test', 0) + + +def test_env(lpm, tmp_path): + env = {'ENVVAR': 'TESTING123'} + lpm.start( + 'test', tmp_path, ['bash', '-c', 'echo ${ENVVAR}'], env, + tmp_path / 'out', tmp_path / 'err') + _poll_completion(lpm, 1) + + with (tmp_path / 'out').open('r') as f: + lines = f.readlines() + + assert lines[0] == 'TESTING123\n' + + +def test_exit_code(lpm, tmp_path): + lpm.start( + 'test_exit_code', tmp_path, ['bash', '-c', 'exit 3'], {}, + tmp_path / 'out', tmp_path / 'err') + done = lpm.get_finished() + while not done: + sleep(0.02) + done = lpm.get_finished() + + assert done[0] == ('test_exit_code', 3) + + +def test_multiple(lpm, tmp_path): + for i in range(3): + lpm.start( + f'test_{i}', tmp_path, ['bash', '-c', 'sleep 1'], {}, + tmp_path / f'out{i}', tmp_path / f'err{i}') + + completed_jobs = _poll_completion(lpm, 3) + + assert sorted(completed_jobs) == [('test_0', 0), ('test_1', 0), ('test_2', 0)] + + +def test_cancel_all(lpm, tmp_path): + begin_time = monotonic() + + for i in range(2): + lpm.start( + f'test_{i}', tmp_path, ['bash', '-c', 'sleep 1'], {}, + tmp_path / f'out{i}', tmp_path / f'err{i}') + + lpm.cancel_all() + + completed_jobs = _poll_completion(lpm, 2) + + end_time = monotonic() + + assert sorted(completed_jobs) == [('test_0', -9), ('test_1', -9)] + assert end_time - begin_time < 1.0 + + +def test_output_redirect(lpm, tmp_path): + lpm.start( + 'test', tmp_path, ['bash', '-c', 'ls'], {}, + tmp_path / 'out', tmp_path / 'err') + _poll_completion(lpm, 1) + with (tmp_path / 'out').open('r') as f: + assert f.readlines() + with (tmp_path / 'err').open('r') as f: + assert f.readlines() == [] + + +def test_error_redirect(lpm, tmp_path): + lpm.start( + 'test', tmp_path, ['bash', '-c', 'ls 1>&2'], {}, + tmp_path / 'out', tmp_path / 'err') + _poll_completion(lpm, 1) + with (tmp_path / 'out').open('r') as f: + assert f.readlines() == [] + with (tmp_path / 'err').open('r') as f: + assert f.readlines() diff --git a/libmuscle/python/libmuscle/native_instantiator/test/test_slurm.py b/libmuscle/python/libmuscle/native_instantiator/test/test_slurm.py new file 
mode 100644 index 00000000..d3610b65 --- /dev/null +++ b/libmuscle/python/libmuscle/native_instantiator/test/test_slurm.py @@ -0,0 +1,72 @@ +from libmuscle.native_instantiator.slurm import ( + parse_slurm_nodelist, parse_slurm_nodes_cores) + +import pytest + + +NRES_ = [ + # from various bits of SLURM documentation + ( + 'linux[00-17]', [ + 'linux00', 'linux01', 'linux02', 'linux03', 'linux04', 'linux05', + 'linux06', 'linux07', 'linux08', 'linux09', 'linux10', 'linux11', + 'linux12', 'linux13', 'linux14', 'linux15', 'linux16', 'linux17']), + ( + 'lx[10-20]', [ + 'lx10', 'lx11', 'lx12', 'lx13', 'lx14', 'lx15', 'lx16', 'lx17', 'lx18', + 'lx19', 'lx20']), + ('tux[2,1-2]', ['tux2', 'tux1', 'tux2']), + ('tux[1-2,2]', ['tux1', 'tux2', 'tux2']), + ('tux[1-3]', ['tux1', 'tux2', 'tux3']), + ( + 'linux[0-64,128]', [ + 'linux0', 'linux1', 'linux2', 'linux3', 'linux4', 'linux5', 'linux6', + 'linux7', 'linux8', 'linux9', 'linux10', 'linux11', 'linux12', + 'linux13', 'linux14', 'linux15', 'linux16', 'linux17', 'linux18', + 'linux19', 'linux20', 'linux21', 'linux22', 'linux23', 'linux24', + 'linux25', 'linux26', 'linux27', 'linux28', 'linux29', 'linux30', + 'linux31', 'linux32', 'linux33', 'linux34', 'linux35', 'linux36', + 'linux37', 'linux38', 'linux39', 'linux40', 'linux41', 'linux42', + 'linux43', 'linux44', 'linux45', 'linux46', 'linux47', 'linux48', + 'linux49', 'linux50', 'linux51', 'linux52', 'linux53', 'linux54', + 'linux55', 'linux56', 'linux57', 'linux58', 'linux59', 'linux60', + 'linux61', 'linux62', 'linux63', 'linux64', 'linux128']), + ('alpha,beta,gamma', ['alpha', 'beta', 'gamma']), + ('lx[15,18,32-33]', ['lx15', 'lx18', 'lx32', 'lx33']), + ('linux[0000-1023]', [f'linux{i:04}' for i in range(1024)]), + ( + 'rack[0-63]_blade[0-41]', [ + f'rack{i}_blade{j}' for i in range(64) for j in range(42)]), + # my additions + ('linux', ['linux']), + ('linux[0]', ['linux0']), + ('linux[0,1]', ['linux0', 'linux1']), + ('linux[0-2]', ['linux0', 'linux1', 'linux2']), + ( + 'rack[00-12,14]_blade[0-2],alpha,tux[1-3,6]', ( + [f'rack{i:02}_blade{j}' for i in range(13) for j in range(3)] + [ + 'rack14_blade0', 'rack14_blade1', 'rack14_blade2', 'alpha', + 'tux1', 'tux2', 'tux3', 'tux6'])), + ('node-0', ['node-0']), + ('node-[0-3]', ['node-0', 'node-1', 'node-2', 'node-3']), + ] + + +@pytest.mark.parametrize('nre,expected', NRES_) +def test_parse_slurm_nodelist(nre, expected): + assert parse_slurm_nodelist(nre) == expected + + +NCES_ = [ + ('8', [8]), + ('8(x2)', [8, 8]), + ('16,24', [16, 24]), + ('16,24(x3)', [16, 24, 24, 24]), + ('1(x1),2', [1, 2]), + ('72(x2),36', [72, 72, 36]) + ] + + +@pytest.mark.parametrize('nce,expected', NCES_) +def test_parse_slurm_nodes_cores(nce, expected): + assert parse_slurm_nodes_cores(nce) == expected diff --git a/libmuscle/python/libmuscle/planner/planner.py b/libmuscle/python/libmuscle/planner/planner.py index 47d4b903..5a443a68 100644 --- a/libmuscle/python/libmuscle/planner/planner.py +++ b/libmuscle/python/libmuscle/planner/planner.py @@ -1,11 +1,12 @@ -from copy import copy, deepcopy +from copy import copy import logging -from typing import Dict, Iterable, List, Mapping, Optional, Set, Tuple +from typing import Dict, Iterable, List, Mapping, Set, Tuple from ymmsl import ( Component, Configuration, Model, MPICoresResReq, MPINodesResReq, Operator, Reference, ResourceRequirements, ThreadedResReq) +from libmuscle.planner.resources import OnNodeResources, Resources from libmuscle.util import instance_indices @@ -383,125 +384,54 @@ def _calc_direct_succs_preds(self) -> None: 
self._direct_supersuccs[sender].add((receiver, shared_dims)) -class Resources: - """Designates a (sub)set of resources. +class ResourceAssignment: + """Assigned resources for each process of an instance. - Whether these resources are free or allocated in general or by - something specific depends on the context, this just says which - resources we're talking about. + Note that we use the classes from libmuscle.planner.resources to generically refer + to collections of resources, either to describe the available hardware or to + designate a subset of it that is occupied by a particular instance, or a subset that + isn't currently occupied. + + This class has more detailed information, because it knows for each process (MPI + rank) in the instance which subset of the overall resources for the instance it + should be on, which we need to launch it in the right place. Attributes: - cores: A dictionary mapping designated nodes to designated - cores on them. + by_rank: List of OnNodeResources objects containing assigned resources, + indexed by rank. """ - def __init__(self, cores: Optional[Dict[str, Set[int]]] = None) -> None: - """Create a Resources object with the given cores. + def __init__(self, by_rank: List[OnNodeResources]) -> None: + """Create a ResourceAssignment. Args: - cores: Cores to be designated by this object. + by_rank: List of OnNodeResources objects containing assigned resources, + indexed by rank. """ - if cores is None: - self.cores: Dict[str, Set[int]] = {} - else: - self.cores = cores - - def __copy__(self) -> 'Resources': - """Copy the object.""" - return Resources(deepcopy(self.cores)) + self.by_rank = by_rank def __eq__(self, other: object) -> bool: - """Check for equality.""" - if not isinstance(other, Resources): + if not isinstance(other, ResourceAssignment): return NotImplemented - if len(self.cores) != len(other.cores): - return False - - for node, cores in self.cores.items(): - if node not in other.cores: - return False - if other.cores[node] != cores: - return False - return True - - def __iadd__(self, other: 'Resources') -> 'Resources': - """Add the resources in the argument to this object.""" - for node in other.cores: - if node in self.cores: - self.cores[node] |= other.cores[node] - else: - self.cores[node] = set(other.cores[node]) - return self - - def __isub__(self, other: 'Resources') -> 'Resources': - """Remove the resources in the argument from this object.""" - for node in other.cores: - if node in self.cores: - self.cores[node] -= other.cores[node] - if not self.cores[node]: - del self.cores[node] - return self + return ( + len(self.by_rank) == len(other.by_rank) and + all([ + snr == onr + for snr, onr in zip(self.by_rank, other.by_rank)])) def __str__(self) -> str: - """Return a human-readable string representation.""" - def collapse_ranges(cores: Set[int]) -> str: - if len(cores) == 0: - return '' - - result = list() - scores = sorted(cores) - start = 0 - i = 1 - while i <= len(scores): - if (i == len(scores)) or (scores[i-1] != scores[i] - 1): - if start == i - 1: - # run of one - result.append(str(scores[i-1])) - else: - # run of at least two - result.append(f'{scores[start]}-{scores[i-1]}') - start = i - i += 1 - return ','.join(result) - - return 'Resources(' + '; '.join([ - n + ': ' + collapse_ranges(cs) - for n, cs in self.cores.items()]) + ')' + # str(list()) uses repr() on the elements, we want str() + str_rbr = ', '.join([str(nr) for nr in self.by_rank]) + return f'[{str_rbr}]' def __repr__(self) -> str: - """Return a string 
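As a sketch of how the returned assignments can be consumed (all_resources and
configuration are assumed to be a Resources and a ymmsl Configuration, as in the
tests):

```
planner = Planner(all_resources)
assignments = planner.allocate_all(configuration)

for instance, assignment in assignments.items():
    # one OnNodeResources entry per MPI rank; a single entry for non-MPI instances
    for rank, node_res in enumerate(assignment.by_rank):
        print(instance, rank, node_res.node_name, sorted(node_res.hwthreads()))
```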
representation.""" - return f'Resources({self.cores})' - - def nodes(self) -> Iterable[str]: - """Returns the nodes on which we designate resources.""" - return self.cores.keys() + return f'ResourceAssignment({repr(self.by_rank)})' - def total_cores(self) -> int: - """Returns the total number of cores designated.""" - return sum([len(cs) for cs in self.cores.values()]) - - def isdisjoint(self, other: 'Resources') -> bool: - """Returns whether we share resources with other.""" - for node, cores in self.cores.items(): - if node in other.cores: - if not cores.isdisjoint(other.cores[node]): - return False - return True - - @staticmethod - def union(resources: Iterable['Resources']) -> 'Resources': - """Combines the resources into one. - - Args: - resources: A collection of resources to merge. - - Return: - A Resources object referring to all the resources in the - input. - """ + def as_resources(self) -> Resources: + """Return a Resources representing the combined assigned resources.""" result = Resources() - for cur_resources in resources: - result += cur_resources + for node_res in self.by_rank: + result.merge_node(node_res) return result @@ -511,12 +441,12 @@ class InsufficientResourcesAvailable(RuntimeError): class Planner: """Allocates resources and keeps track of allocations.""" - def __init__(self, all_resources: Resources): - """Create a ResourceManager. + def __init__(self, all_resources: Resources) -> None: + """Create a Planner. Args: all_resources: An object describing the available resources - to be managed by this ResourceManager. + for the planner to use. """ self._all_resources = all_resources self._allocations: Dict[Reference, Resources] = {} @@ -525,7 +455,7 @@ def __init__(self, all_resources: Resources): def allocate_all( self, configuration: Configuration, virtual: bool = False - ) -> Dict[Reference, Resources]: + ) -> Dict[Reference, ResourceAssignment]: """Allocates resources for the given components. Allocation can occur either on a fixed set of available @@ -546,9 +476,11 @@ def allocate_all( virtual: Allocate on virtual resources or not, see above Returns: - Resources for each instance required by the model. + Assigned resources for each instance required by the model. 
""" - result: Dict[Reference, Resources] = {} + result: Dict[Reference, ResourceAssignment] = {} + + _logger.debug(f'Planning on resources {self._all_resources}') # Analyse model model = ModelGraph(configuration.model) @@ -570,6 +502,7 @@ def allocate_all( unallocated_instances, requirements) for instance in to_allocate: + _logger.debug(f'Placing {instance}') component = model.component(instance.without_trailing_ints()) conflicting_names = self._conflicting_names( model, exclusive, component, instance) @@ -577,7 +510,7 @@ def allocate_all( done = False while not done: try: - result[instance] = self._allocate_instance( + result[instance] = self._assign_instance( instance, component, requirements[component.name], conflicting_names, virtual) @@ -683,11 +616,14 @@ def _expand_resources( """Adds an extra virtual node to the available resources.""" taken = True while taken: - new_node = 'node{:06d}'.format(self._next_virtual_node) - taken = new_node in self._all_resources.cores + new_node_name = 'node{:06d}'.format(self._next_virtual_node) + taken = new_node_name in self._all_resources.nodes() self._next_virtual_node += 1 - num_cores = len(next(iter(self._all_resources.cores.values()))) + new_node = copy(next(iter(self._all_resources))) + new_node.node_name = new_node_name + + num_cores = len(new_node.cpu_cores) if isinstance(req, ThreadedResReq): if req.threads > num_cores: raise InsufficientResourcesAvailable( @@ -701,13 +637,14 @@ def _expand_resources( f' {req.threads_per_mpi_process} threads per process,' f' which is impossible with {num_cores} cores per' ' node.') - self._all_resources.cores[new_node] = set(range(num_cores)) - def _allocate_instance( + self._all_resources.add_node(new_node) + + def _assign_instance( self, instance: Reference, component: Component, requirements: ResourceRequirements, simultaneous_instances: Set[Reference], virtual: bool - ) -> Resources: + ) -> ResourceAssignment: """Allocates resources for the given instance. If we are on real resources, and the instance requires more @@ -716,7 +653,7 @@ def _allocate_instance( resources, this will raise InsufficientResourcesAvailable. Args: - instance: The instance to allocate for + instance: The instance to assign resources to component: The component it is an instance of requirements: Its resource requirements simultaneous_instances: Instances which may execute @@ -725,19 +662,20 @@ def _allocate_instance( virtual: Whether we are on virtual resources Returns: - A Resources object describing the resources allocated + The resources assigned to each process in the instance """ - allocation = Resources({}) + assignment = ResourceAssignment([]) free_resources = copy(self._all_resources) for other in self._allocations: if other in simultaneous_instances: free_resources -= self._allocations[other] + _logger.debug(f'Free resources: {free_resources}') try: if isinstance(requirements, ThreadedResReq): - allocation = self._allocate_thread_block( - free_resources, requirements.threads) + assignment.by_rank.append(self._assign_thread_block( + free_resources, requirements.threads)) elif isinstance(requirements, MPICoresResReq): if requirements.threads_per_mpi_process != 1: @@ -745,10 +683,10 @@ def _allocate_instance( 'Multiple threads per MPI process is not supported' ' yet. 
Please make an issue on GitHub.') for proc in range(requirements.mpi_processes): - allocation += self._allocate_thread_block( - free_resources, - requirements.threads_per_mpi_process) - free_resources -= allocation + block = self._assign_thread_block( + free_resources, requirements.threads_per_mpi_process) + assignment.by_rank.append(block) + free_resources -= Resources([block]) elif isinstance(requirements, MPINodesResReq): raise RuntimeError( @@ -759,35 +697,81 @@ def _allocate_instance( if not self._allocations and not virtual: # There are no other allocations and it's still not # enough. Just give it all and hope for the best. - _logger.warning(( - 'Instance {} requires more resources than are' - ' available in total. Oversubscribing this' - ' instance.').format(instance)) - allocation = copy(self._all_resources) + assignment = self._oversubscribe_instance(instance, requirements) else: raise - self._allocations[instance] = allocation - return allocation + self._allocations[instance] = assignment.as_resources() + return assignment - def _allocate_thread_block( - self, free_resources: Resources, threads: int) -> Resources: - """Allocate resources for a group of threads. + def _assign_thread_block( + self, free_resources: Resources, num_threads: int) -> OnNodeResources: + """Assign resources for a group of threads. - This chooses a set of cores on the same node. It - returns the allocated resources; it doesn't update - self._allocations or free_resources. + This chooses a set of cores on the same node. It returns the + assigned resources; it doesn't update self._allocations or free_resources. Args: - threads: Number of cores + num_threads: Number of threads to allocate for free_resources: Available resources to allocate from Returns: - The allocated resources + The assigned resources """ - for node in free_resources.nodes(): - if len(free_resources.cores[node]) >= threads: - available_cores = sorted(free_resources.cores[node]) - to_reserve = set(available_cores[:threads]) - return Resources({node: to_reserve}) + for node in free_resources: + if len(node.cpu_cores) >= num_threads: + available_cores = node.cpu_cores + _logger.debug(f'available cores: {available_cores}') + to_reserve = available_cores.get_first_cores(num_threads) + _logger.debug(f'assigned {to_reserve}') + return OnNodeResources(node.node_name, to_reserve) raise InsufficientResourcesAvailable() + + def _oversubscribe_instance( + self, instance: Reference, requirements: ResourceRequirements + ) -> ResourceAssignment: + """Oversubscribe an instance. + + This is called when all resources are available and we still cannot fit an + instance, i.e. that single instance requires more resources than we have + available in total. In that case, we're just going to map it onto the resources + we have and hope for the best, which is what this function does. + + There's a lot of repetition between this and the code above. There's probably a + cleaner way to do this, but it'll do for now. Eventually we'll have an optimiser + and all this goes away anyway. + + Args: + instance: The instance we're oversubscribing + requirements: The required resources + + Returns: + An oversubscribed resource assignment + """ + _logger.warning( + f'Instance {instance} requires more resources than are available in' + ' total. 
Oversubscribing this instance.') + + res_by_rank: List[OnNodeResources] = list() + + if isinstance(requirements, ThreadedResReq): + res_by_rank.append(copy(next(iter(self._all_resources)))) + + elif isinstance(requirements, MPICoresResReq): + if requirements.threads_per_mpi_process != 1: + raise RuntimeError( + 'Multiple threads per MPI process is not supported yet. Please' + ' make an issue on GitHub.') + + free_resources = copy(self._all_resources) + for proc in range(requirements.mpi_processes): + if free_resources.total_cores() < requirements.threads_per_mpi_process: + free_resources = copy(self._all_resources) + + block = self._assign_thread_block( + free_resources, requirements.threads_per_mpi_process) + + res_by_rank.append(block) + free_resources -= Resources([block]) + + return ResourceAssignment(res_by_rank) diff --git a/libmuscle/python/libmuscle/planner/resources.py b/libmuscle/python/libmuscle/planner/resources.py new file mode 100644 index 00000000..0e1dd41a --- /dev/null +++ b/libmuscle/python/libmuscle/planner/resources.py @@ -0,0 +1,647 @@ +"""Module for describing compute resources + +There's a huge comment here because there's a big mess here that took me forever to +figure out, so now I'm going to document it for the future. + + +Identifying hardware resources + +Today's computers all contain multi-core CPUs, often with symmetric multithreading +(SMT), also known as hyperthreading. This means that we have hardware threads +(hwthreads) and also cores, and then there's caches and memory as well but we're not +going into NUMA here. + +Cores and hwthreads are identified by number, but they have multiple different numbers +that are referred to by different names in different contexts, making everything very +confusing. So here are some definitions to disambiguate things. Note that this is still +a rather simplified representation, but it's enough for what we're doing here in +MUSCLE3. + + +Hardware threads + +A *hardware thread (hwthread)* is, at the hardware level, an instruction decoder. It +points to wherever in the code we are currently executing, and it can read the next +couple of instructions and figure out how to execute them. It can't actually execute +anything however, because it doesn't have the hardware that does that. + +Intel refers to hwthreads as "logical processors" and so does Linux, hwloc calls them +"processing units" or PUs and so does OpenMPI unless it uses the term hwthread just to +confuse things a bit more. + +Cores + +A *core* contains at least one hwthread, and at least one functional unit, which is a +hardware component that actually does calculations and other data processing. Within a +core, the hwthread(s) read instructions and pass them to the functional units to be +executed. If a core has more than one hwthread, then the CPU supports SMT. + +Intel refers to cores as "physical processors", hwloc calls them cores and so do most +other sources. We'll use cores here. + +Since a hwthread cannot do anything on its own, it's always part of a core. + +CPUs + +The term CPU is used in many ways by various bits of documentation, sometimes referring +to a hwthread or a core, but here we'll take it to mean a collection of cores in a +plastic box. Similar terms are *package* (referring to that plastic box with very many +metal pins) and *socket* (the thing the package mounts into), or *processor*, which was +originally used to refer to all of the above when CPUs still had only one core with only +one hwthread, and has now become ambiguous. 
+ +Weird things can happen here, I've seen CPUs that as far as I can tell are a single +package, but nevertheless claim to have two sockets. I suspect that that's two physical +chips in a single plastic box, but I don't know for sure. + +Here, we're concerned with hwthreads and cores and how to identify them and assign +instances to them. + + +Linux + +On modern operating systems, hardware access is mediated by the operating system, and +we're mainly concerned with Linux here because that is what all the clusters are running +(see the note on macOS below). Information about the CPU(s) can be obtained on Linux +from the /proc/cpuinfo file, or equivalently but more modernly, from the files in +/sys/devices/system/cpu/cpu/topology/. + +Linux collects information about processors because it needs to run processes (programs, +software threads) on them on behalf of the user. Processes are assigned to hwthreads, so +that is what Linux considers a *processor*. /proc/cpuinfo lists all these processors, +and they each have their own directory /sys/devices/system/cpu/cpu. + +On Linux, processors have an id, which is that number in the directory, and is +listed under "processor" in /proc/cpuinfo. Since this number identifies a hwthread and +is assigned by Linux rather than being baked into the hardware, I'm calling it a +"logical hwthread id", this being a logical id of a hwthread, not an id of a logical +hwthread. It's also the id of a logical processor in Intel-speak. + +Hwthreads actually have a second number associated with them, which does come from the +hardware. In /proc/cpuinfo, that's listed under "apicid"; it doesn't seem to be +available from sysfs. Hwloc call this the "physical PU (its name for a hwthread) id", +and OpenMPI's mpirun manpage also refers to it as a "physical processor location". + +There's great potential for confusion here: the "physical PU id" and "physical processor +location" both identify a hardware-specified number (a physical id or a physical +location) for a hwthread. This is something completely different than what Intel calls a +"physical processor", which they use to refer to a core. + +MUSCLE3 uses logical hwthread ids everywhere, it does not use physical ids. + +Linux knows about how hwthreads are grouped into bigger things of course. Cores are +identified in Linux using the "core id", which is listed in /proc/cpuinfo and in +/sys/devices/system/cpu/cpu/topology/core_id. So for each hwthread, identified by its +logical id, we can look up which core it is a part of. The core id is a logical id, +assigned by Linux, not by the hardware. While logical hwthread ids seem to always be +consecutive at least on the hardware I've seen so far, core ids may have gaps. + +MUSCLE3 does not use core ids, although it uses groups of hwthread ids that contain all +the hwthreads for a given core. + + +Resource binding + +Running processes need something to run on, a hwthread. The assignment of process to +hwthread is done by the operating system's scheduler: when a process is ready to run, +the scheduler will try to find it a free hwthread to run on. + +The scheduler can be constrained in which hwthreads it considers for a given process, +which is known as binding the process. This may have performance benefits, because +moving a process from one hwthread to another takes time. In MUSCLE3, when running on a +cluster, each process is assigned its own specific set of hwthreads to run on, and we +try to bind the instance to the assigned hwthreads. 
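As a made-up illustration of such a binding for a non-MPI instance:

```
# Bind a (hypothetical) component executable to the logical hwthread ids
# assigned to it, using taskset's --cpu-list option.
hwthreads = [0, 1, 4, 5]
cmd = ['taskset', '--cpu-list', ','.join(map(str, hwthreads)), './some_component']
# equivalent to: taskset --cpu-list 0,1,4,5 ./some_component
```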
+ +Taskset + +How this is done depends on how the instance is started. For non-MPI instances, we use a +Linux utility named 'taskset' that starts another program with a giving binding. The +binding is expressed as an *affinity mask*, a string of bits that say whether a given +processor (hwthread) can be used by the process or not. Each position in the string of +bits corresponds to the hwthread with that logical id. + +OpenMPI + +OpenMPI can bind cores in various ways, we use a rankfile and the --use-hwthread-cpus +option to specify the logical hwthread ids we want to bind each MPI process (rank) to. +Note that OpenMPI by default binds to cores, and can also bind to various other things +including sockets. + +MPICH + +MPICH doesn't support binding, as far as I can see. + +Intel MPI + +Intel MPI uses logical hwthread ids-based masks, specified in an environment variable, +to go with a machinefile that lists the nodes to put each process on. + +Slurm srun + +Slurm's srun has a CPU_BIND environment variable that likewise contains logical hwthread +ids-based masks, and a hostfile that lists the nodes to put each process on. + +Here are some disambiguation tables to help with the confusion: + + +``` +MUSCLE3 hwthread logical hwthread id physical hwthread id + +Linux processor processor apicid + (/proc/cpuinfo only) + +cgroups always uses these + +taskset always uses these + +hwloc PU PU L# PU P# + +OpenMPI hwthread used in rankfile if used in rankfile if + --use-hwthread-cpus rmaps_rank_file_physical + is specified MCA param set + +Intel logical logical processor + processor number + +srun used by --bind-to + +psutil logical returned by Process.cpu_affinity() + core counted by psutil.cpu_count(logical=True) +``` + + +``` +MUSCLE3 core core id + +Linux core core id + +Hwloc core core L# + +OpenMPI core used in rankfile if + --use-hwthread-cpus not + specified + +psutil physical counted by psutil.cpu_count(logical=False) + core +``` + +""" +from copy import copy, deepcopy +from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple + + +class Core: + """Describes a CPU core or designates a core or one or more hwthreads. + + A core is a group of functional units with one or more instruction decoders. If the + core supports symmetric multithreading (SMT, aka hyperthreading) then there will be + more than one instruction decoder or hardware thread in the core. + + Note that the term "logical CPU" refers to an instruction decoder/hwthread. If the + processor does not support SMT, then each core has a single decoder and so a logical + CPU is also a core. + + This class can be used in different ways with slighly different interpretations. + When describing hardware resources, it describes a core and all of its hwthreads. In + this case, cid is the core id, and hwthreads contains the hwthread ids of all + hwthreads on this core. If no SMT is supported, then there will be only one + hwthread id. + + When designating a whole core (e.g. for use by a process), cid is set to the id of + the core, and hwthreads contains all of the hwthreads on that core. When designating + a hwthread on a particular core, cid is set to the id of the core and hwthreads + contains the designated (single) hwthread. + + MUSCLE3 never assigns swthreads to subsets of hwthreads on a core, it assigns them + to either a single hwthread or a single whole core. So if more than one hwthread is + given, then we can assume that those are all the hwthreads on that core. 
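For example (ids made up), the two kinds of designation described above look
like this:

```
whole_core    = Core(3, {3, 11})    # core 3, designated with both of its hwthreads
single_thread = Core(3, {3})        # only hwthread 3 on core 3 is designated
```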
+ + Objects of this class automatically deepcopy when copied. This means that you can + make a copy using copy.copy() and modify that copy anywhere without changing the + original. + + Args: + cid: ID of this core, to be used to refer to it + hwthreads: Ids of hwthreads (logical CPUs) belonging to this core + """ + def __init__(self, cid: int, hwthreads: Set[int]) -> None: + """Create a Core""" + self.cid = cid + self.hwthreads = copy(hwthreads) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, Core): + return NotImplemented + + return self.cid == other.cid and self.hwthreads == other.hwthreads + + def __len__(self) -> int: + return len(self.hwthreads) + + def __copy__(self) -> 'Core': + return Core(self.cid, self.hwthreads) + + def __or__(self, other: object) -> 'Core': + if not isinstance(other, Core): + return NotImplemented + + if other.cid != self.cid: + raise ValueError('Cannot merge hwthreads on different cores') + + return Core(self.cid, self.hwthreads | other.hwthreads) + + def __ior__(self, other: object) -> 'Core': + if not isinstance(other, Core): + return NotImplemented + + if other.cid != self.cid: + raise ValueError('Cannot merge hwthreads on different cores') + + self.hwthreads |= other.hwthreads + return self + + def __isub__(self, other: object) -> 'Core': + if not isinstance(other, Core): + return NotImplemented + + if other.cid != self.cid: + raise ValueError('Cannot merge hwthreads on different cores') + + self.hwthreads -= other.hwthreads + return self + + def __str__(self) -> str: + hwthreads = ','.join(map(str, sorted(self.hwthreads))) + return f'{self.cid}({hwthreads})' + + def __repr__(self) -> str: + hwthreads = ','.join(map(str, sorted(self.hwthreads))) + return f'Core({self.cid}, {{{hwthreads}}})' + + def isdisjoint(self, other: 'Core') -> bool: + """Returns whether we share resources with other.""" + if self.cid != other.cid: + raise ValueError('Cannot compare hwthreads on different cores') + + return self.hwthreads.isdisjoint(other.hwthreads) + + +class CoreSet: + """A set of cores on a single node. + + This exists to make it a bit easier to operate on sets of cores, merging and + subtracting them. + + Objects of this class automatically deepcopy when copied. This means that you can + make a copy using copy.copy() and modify that copy anywhere without changing the + original. + """ + def __init__(self, cores: Iterable[Core]) -> None: + """Create a CoreSet + + Args: + cores: A set of cores to contain. 
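A small made-up example of the merging and subtracting this class supports:

```
a = CoreSet([Core(0, {0, 8}), Core(1, {1, 9})])
b = CoreSet([Core(1, {1, 9}), Core(2, {2, 10})])
a |= b                              # a now covers cores 0, 1 and 2
a -= CoreSet([Core(0, {0, 8})])     # core 0 is removed entirely
str(a)                              # '1-2(1-2,9-10)'
```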
+ """ + self._cores = {c.cid: c for c in cores} + + def __eq__(self, other: object) -> bool: + if not isinstance(other, CoreSet): + return NotImplemented + + if len(self._cores) != len(other._cores): + return False + + for cid, core in self._cores.items(): + if cid not in other._cores: + return False + if core.hwthreads != other._cores[cid].hwthreads: + return False + + return True + + def __len__(self) -> int: + return len(self._cores) + + def __iter__(self) -> Iterator[Core]: + return iter(self._cores.values()) + + def __copy__(self) -> 'CoreSet': + return CoreSet(deepcopy(list(self._cores.values()))) + + def __ior__(self, other: object) -> 'CoreSet': + if not isinstance(other, CoreSet): + return NotImplemented + + for cid, core in other._cores.items(): + if cid in self._cores: + self._cores[cid] |= core + else: + self._cores[cid] = copy(core) + + return self + + def __isub__(self, other: object) -> 'CoreSet': + if not isinstance(other, CoreSet): + return NotImplemented + + for cid, core in other._cores.items(): + if cid in self._cores: + self._cores[cid] -= core + if not self._cores[cid].hwthreads: + del self._cores[cid] + + return self + + def __str__(self) -> str: + def collapse_ranges(ids: List[int]) -> str: + if len(ids) == 0: + return '' + + result = list() + start = 0 + i = 1 + while i <= len(ids): + if (i == len(ids)) or (ids[i-1] != ids[i] - 1): + if start == i - 1: + # run of one + result.append(str(ids[i-1])) + else: + # run of at least two + result.append(f'{ids[start]}-{ids[i-1]}') + start = i + i += 1 + return ','.join(result) + + cores = sorted((c.cid for c in self._cores.values())) + hwthreads = sorted((t for c in self._cores.values() for t in c.hwthreads)) + + return f'{collapse_ranges(cores)}({collapse_ranges(hwthreads)})' + + def __repr__(self) -> str: + cores = ', '.join(map(repr, sorted(self._cores.values(), key=lambda c: c.cid))) + return f'CoreSet({{{cores}}})' + + def isdisjoint(self, other: 'CoreSet') -> bool: + """Returns whether we share resources with other.""" + for cid, core in self._cores.items(): + if cid in other._cores: + if not core.isdisjoint(other._cores[cid]): + return False + return True + + def get_first_cores(self, num_cores: int) -> 'CoreSet': + """Returns the first num_cores cores in this set. + + Args: + The number of cores to select. + """ + result = copy(self) + cids = list(self._cores.keys()) + selected = cids[:num_cores] + if len(selected) < num_cores: + raise RuntimeError('Tried to get more cores than available') + + result._cores = {c.cid: c for c in result._cores.values() if c.cid in selected} + return result + + +class OnNodeResources: + """Resources on a single node, currently only CPU cores. + + This represents a set of resources on a single node, either all of the resources + available or some subset of interest. + + Objects of this class automatically deepcopy when copied. This means that you can + make a copy using copy.copy() and modify that copy anywhere without changing the + original. + """ + def __init__(self, node_name: str, cpu_cores: CoreSet) -> None: + """Create an OnNodeResources. + + Args: + name: (Host)name of the node. + cpu_cores: A set of cores for this node. 
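For illustration (a made-up node with four cores and no SMT):

```
node = OnNodeResources('node001', CoreSet([Core(i, {i}) for i in range(4)]))
node.total_cores()          # 4
sorted(node.hwthreads())    # [0, 1, 2, 3]
str(node)                   # 'OnNodeResources(node001, c: 0-3(0-3))'
```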
+ """ + self.node_name = node_name + self.cpu_cores = cpu_cores + + def __eq__(self, other: object) -> bool: + if not isinstance(other, OnNodeResources): + return NotImplemented + + return ( + isinstance(other, OnNodeResources) and + self.node_name == other.node_name and + self.cpu_cores == other.cpu_cores) + + def __copy__(self) -> 'OnNodeResources': + return OnNodeResources(self.node_name, copy(self.cpu_cores)) + + def __ior__(self, other: object) -> 'OnNodeResources': + if not isinstance(other, OnNodeResources): + return NotImplemented + + if self.node_name != other.node_name: + raise ValueError('Cannot merge resources on different nodes') + + self.cpu_cores |= other.cpu_cores + return self + + def __isub__(self, other: object) -> 'OnNodeResources': + if not isinstance(other, OnNodeResources): + return NotImplemented + + if self.node_name != other.node_name: + raise ValueError('Cannot remove resources on different nodes') + + self.cpu_cores -= other.cpu_cores + return self + + def __str__(self) -> str: + return f'OnNodeResources({self.node_name}, c: {str(self.cpu_cores)})' + + def __repr__(self) -> str: + return f'OnNodeResources("{self.node_name}", {repr(self.cpu_cores)})' + + def hwthreads(self) -> Iterable[int]: + """Return the hwthreads in this node.""" + return (thread for core in self.cpu_cores for thread in core.hwthreads) + + def total_cores(self) -> int: + """Return the number of CPU cores in this node.""" + return len(self.cpu_cores) + + def isdisjoint(self, other: 'OnNodeResources') -> bool: + """Returns whether we share resources with other.""" + return ( + self.node_name != other.node_name or + self.cpu_cores.isdisjoint(other.cpu_cores)) + + +class Resources: + """Designates a (sub)set of resources. + + Whether these resources are free or allocated in general or by something specific + depends on the context, this just says which resources we're talking about. + + Objects of this class automatically deepcopy when copied. This means that you can + make a copy using copy.copy() and modify that copy anywhere without changing the + original. + + Attributes: + nodes: A collection of nodes to include in this resource set + """ + def __init__(self, nodes: Optional[Iterable[OnNodeResources]] = None) -> None: + """Create a Resources object with the given nodes. + + Args: + nodes: OnNodeResourcess to be designated by this object. 
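A made-up example of building and combining Resources objects:

```
r1 = Resources([OnNodeResources('node001', CoreSet([Core(0, {0}), Core(1, {1})]))])
r2 = Resources([OnNodeResources('node002', CoreSet([Core(0, {0})]))])
r1 |= r2
r1.total_cores()        # 3
sorted(r1.nodes())      # ['node001', 'node002']
```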
+ """ + if nodes is None: + self._nodes: Dict[str, OnNodeResources] = {} + else: + self._nodes = {n.node_name: n for n in nodes} + + def __len__(self) -> int: + return len(self._nodes) + + def __iter__(self) -> Iterator[OnNodeResources]: + return iter(self._nodes.values()) + + def __getitem__(self, node_name: str) -> OnNodeResources: + return self._nodes[node_name] + + def __eq__(self, other: object) -> bool: + """Check for equality.""" + if not isinstance(other, Resources): + return NotImplemented + + if len(self._nodes) != len(other._nodes): + return False + + for node_name, node in self._nodes.items(): + if node_name not in other._nodes: + return False + if other._nodes[node_name] != node: + return False + + return True + + def __copy__(self) -> 'Resources': + """Copy the object.""" + return Resources((copy(n) for n in self._nodes.values())) + + def __ior__(self, other: object) -> 'Resources': + """Add the resources in the argument to this object.""" + if not isinstance(other, Resources): + return NotImplemented + + for node_name, other_node in other._nodes.items(): + if node_name in self._nodes: + self._nodes[node_name] |= other_node + else: + self._nodes[node_name] = copy(other_node) + + return self + + def __isub__(self, other: object) -> 'Resources': + """Remove the resources in the argument from this object.""" + if not isinstance(other, Resources): + return NotImplemented + + for node_name, other_node in other._nodes.items(): + if node_name in self._nodes: + self._nodes[node_name] -= other_node + if not self._nodes[node_name]: + del self._nodes[node_name] + + return self + + def __str__(self) -> str: + """Return a human-readable string representation.""" + nodes = ','.join( + map(str, sorted(self._nodes.values(), key=lambda n: n.node_name))) + return f'Resources({nodes})' + + def __repr__(self) -> str: + """Return a string representation.""" + nodes = sorted(self._nodes.values(), key=lambda n: n.node_name) + return f'Resources({nodes})' + + def nodes(self) -> Iterable[str]: + """Return the names of the nodes on which we designate resources.""" + return self._nodes.keys() + + def total_cores(self) -> int: + """Return the total number of cores (not hwthreads) designated.""" + return sum((len(n.cpu_cores) for n in self._nodes.values())) + + def cores(self) -> Iterable[Tuple[str, int]]: + """Return this resources as a list of node, core.""" + return ( + (node.node_name, core.cid) + for node in self._nodes.values() for core in node.cpu_cores) + + def hwthreads(self) -> Iterable[Tuple[str, int]]: + """Return this resources as a list of node, hwthread.""" + return ( + (node.node_name, hwthread) + for node in self._nodes.values() for hwthread in node.hwthreads()) + + def isdisjoint(self, other: 'Resources') -> bool: + """Return whether we share resources with other.""" + for node_name, node in self._nodes.items(): + if node_name in other._nodes: + if not node.isdisjoint(other._nodes[node_name]): + return False + return True + + def add_node(self, node_res: OnNodeResources) -> None: + """Add a node's resources. + + This absorbs node_res into this Resources object, so if you change node_res + after adding it, the changes will be reflected in this Resources. + + Args: + node_res: Resources on a node not yet included in this Resources. + + Raises: + RuntimeError: if we already have a node with this node name. + """ + if node_res.node_name in self._nodes: + raise RuntimeError( + 'Tried to add a OnNodeResources to a Resources for a node that is' + ' already present. 
This is a bug in MUSCLE3, please report it on' + ' GitHub.') + + self._nodes[node_res.node_name] = node_res + + def merge_node(self, node_res: OnNodeResources) -> None: + """Merges a node's resources + + This always copies the object. + + Args: + node_res: Resources on a node that may already be included in this + Resources. + """ + if node_res.node_name in self._nodes: + self._nodes[node_res.node_name] |= node_res + else: + self._nodes[node_res.node_name] = copy(node_res) + + @staticmethod + def union(resources: Iterable['Resources']) -> 'Resources': + """Combines the resources into one. + + Args: + resources: A collection of resources to merge. + + Return: + A Resources object referring to all the resources in the + input. + """ + result = Resources() + for cur_resources in resources: + result |= cur_resources + return result diff --git a/libmuscle/python/libmuscle/planner/test/test_planner.py b/libmuscle/python/libmuscle/planner/test/test_planner.py index 95e8e7fb..273b0c7f 100644 --- a/libmuscle/python/libmuscle/planner/test/test_planner.py +++ b/libmuscle/python/libmuscle/planner/test/test_planner.py @@ -1,7 +1,3 @@ -from libmuscle.planner.planner import ( - InsufficientResourcesAvailable, ModelGraph, Planner, Resources) - -from copy import copy import pytest from typing import Dict, List @@ -9,13 +5,22 @@ Component, Conduit, Configuration, Implementation, Model, MPICoresResReq, Ports, Reference, ResourceRequirements, ThreadedResReq) +from libmuscle.planner.planner import ( + InsufficientResourcesAvailable, ModelGraph, Planner, ResourceAssignment) +from libmuscle.planner.resources import Resources + +from libmuscle.test.conftest import core as c, on_node_resources as onr, resources + + +Ref = Reference + @pytest.fixture def all_resources() -> Resources: - return Resources({ - 'node001': {1, 2, 3, 4}, - 'node002': {1, 2, 3, 4}, - 'node003': {1, 2, 3, 4}}) + return resources({ + 'node001': [c(1), c(2), c(3), c(4)], + 'node002': [c(1), c(2), c(3), c(4)], + 'node003': [c(1), c(2), c(3), c(4)]}) @pytest.fixture @@ -49,17 +54,17 @@ def model(init: Component, macro: Component, micro: Component) -> Model: @pytest.fixture def implementations() -> List[Implementation]: return [ - Implementation(Reference('init'), script='init'), - Implementation(Reference('macro'), script='macro'), - Implementation(Reference('micro'), script='micro')] + Implementation(Ref('init'), script='init'), + Implementation(Ref('macro'), script='macro'), + Implementation(Ref('micro'), script='micro')] @pytest.fixture def requirements() -> Dict[Reference, ResourceRequirements]: res_list = [ - ThreadedResReq(Reference('init'), 4), - ThreadedResReq(Reference('macro'), 4), - ThreadedResReq(Reference('micro'), 4)] + ThreadedResReq(Ref('init'), 4), + ThreadedResReq(Ref('macro'), 4), + ThreadedResReq(Ref('micro'), 4)] return {r.name: r for r in res_list} @@ -70,6 +75,13 @@ def configuration( return Configuration(model, None, implementations, requirements) +@pytest.fixture +def assignment() -> ResourceAssignment: + return ResourceAssignment([ + onr('node001', {0, 1}), + onr('node002', {2, 3})]) + + def test_model_graph( init: Component, macro: Component, micro: Component, model: Model ) -> None: @@ -93,45 +105,51 @@ def test_model_graph( assert not graph.successors(micro) -def test_resources(all_resources: Resources) -> None: - res1 = all_resources - assert res1.cores == { - 'node001': {1, 2, 3, 4}, - 'node002': {1, 2, 3, 4}, - 'node003': {1, 2, 3, 4}} - assert set(res1.nodes()) == {'node001', 'node002', 'node003'} +def 
test_resource_assignment_eq() -> None: + asm1 = ResourceAssignment([]) + asm2 = ResourceAssignment([]) - res2 = Resources({ - 'node004': {1, 2, 3, 4, 5, 6}, 'node005': {1, 2, 3, 4, 5, 6}}) - res1 += res2 + assert asm1 == asm2 - assert res1.cores == { - 'node001': {1, 2, 3, 4}, 'node002': {1, 2, 3, 4}, - 'node003': {1, 2, 3, 4}, 'node004': {1, 2, 3, 4, 5, 6}, - 'node005': {1, 2, 3, 4, 5, 6}} + asm1.by_rank.append(onr('node001', {0, 1})) + assert asm1 != asm2 - res3 = Resources({'node003': {1, 2, 3, 4}, 'node005': {4, 5, 6}}) - res1 -= res3 + asm2.by_rank.append(onr('node001', {0, 2})) + assert asm1 != asm2 - assert res1.cores == { - 'node001': {1, 2, 3, 4}, 'node002': {1, 2, 3, 4}, - 'node004': {1, 2, 3, 4, 5, 6}, 'node005': {1, 2, 3}} - assert res1.nodes() == { - 'node001', 'node002', 'node004', 'node005'} + asm2.by_rank[0] = onr('node001', {0, 1}) + assert asm1 == asm2 - res4 = copy(res3) - res4.cores['node003'] = {8} - assert res3.cores['node003'] == {1, 2, 3, 4} - assert res4.cores['node003'] == {8} +def test_resource_assignment_str(assignment: ResourceAssignment) -> None: + assert str(assignment) == ( + '[OnNodeResources(node001, c: 0-1(0-1)),' + ' OnNodeResources(node002, c: 2-3(2-3))]') - all_resources = Resources.union([res1, res2, res3, res4]) - assert all_resources.cores['node001'] == {1, 2, 3, 4} - assert all_resources.cores['node002'] == {1, 2, 3, 4} - assert all_resources.cores['node003'] == {1, 2, 3, 4, 8} - assert all_resources.cores['node004'] == {1, 2, 3, 4, 5, 6} - assert all_resources.cores['node005'] == {1, 2, 3, 4, 5, 6} +def test_resource_assignment_repr(assignment: ResourceAssignment) -> None: + assert repr(assignment) == ( + 'ResourceAssignment([' + 'OnNodeResources("node001", CoreSet({Core(0, {0}), Core(1, {1})})),' + ' OnNodeResources("node002", CoreSet({Core(2, {2}), Core(3, {3})}))])') + + +def test_resource_assignment_as_resources(assignment) -> None: + res = assignment.as_resources() + + assert res._nodes == { + 'node001': onr('node001', {0, 1}), + 'node002': onr('node002', {2, 3})} + + asm2 = ResourceAssignment([ + onr('node001', {0, 1}), onr('node001', {2, 3}), onr('node001', {2, 3}), + onr('node003', {4, 5})]) + + res = asm2.as_resources() + + assert res._nodes == { + 'node001': onr('node001', {0, 1, 2, 3}), + 'node003': onr('node003', {4, 5})} def test_planner( @@ -139,33 +157,31 @@ def test_planner( planner = Planner(all_resources) allocations = planner.allocate_all(configuration) - assert allocations[Reference('init')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('macro')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('micro')].cores == {'node001': {1, 2, 3, 4}} + assert allocations[Ref('init')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('macro')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('micro')].by_rank == [onr('node001', {1, 2, 3, 4})] def test_planner_exclusive_macro( all_resources: Resources, configuration: Configuration) -> None: planner = Planner(all_resources) - configuration.implementations[Reference('macro')].can_share_resources = ( - False) + configuration.implementations[Ref('macro')].can_share_resources = False allocations = planner.allocate_all(configuration) - assert allocations[Reference('init')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('macro')].cores == {'node002': {1, 2, 3, 4}} - assert allocations[Reference('micro')].cores == {'node001': {1, 2, 3, 4}} + assert allocations[Ref('init')].by_rank == [onr('node001', {1, 2, 3, 
4})] + assert allocations[Ref('macro')].by_rank == [onr('node002', {1, 2, 3, 4})] + assert allocations[Ref('micro')].by_rank == [onr('node001', {1, 2, 3, 4})] def test_planner_exclusive_predecessor( all_resources: Resources, configuration: Configuration) -> None: planner = Planner(all_resources) - configuration.implementations[Reference('init')].can_share_resources = ( - False) + configuration.implementations[Ref('init')].can_share_resources = False allocations = planner.allocate_all(configuration) - assert allocations[Reference('init')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('macro')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('micro')].cores == {'node001': {1, 2, 3, 4}} + assert allocations[Ref('init')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('macro')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('micro')].by_rank == [onr('node001', {1, 2, 3, 4})] def test_oversubscribe( @@ -177,90 +193,84 @@ def test_oversubscribe( planner = Planner(all_resources) allocations = planner.allocate_all(configuration) - assert allocations[Reference('init[0]')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('init[1]')].cores == {'node002': {1, 2, 3, 4}} - assert allocations[Reference('init[2]')].cores == {'node003': {1, 2, 3, 4}} - assert allocations[Reference('init[3]')].cores == {'node001': {1, 2, 3, 4}} - assert allocations[Reference('init[4]')].cores == {'node002': {1, 2, 3, 4}} - - assert allocations[Reference('macro[0]')].cores == { - 'node001': {1, 2, 3, 4}} - assert allocations[Reference('macro[1]')].cores == { - 'node002': {1, 2, 3, 4}} - assert allocations[Reference('macro[2]')].cores == { - 'node003': {1, 2, 3, 4}} - assert allocations[Reference('macro[3]')].cores == { - 'node001': {1, 2, 3, 4}} - assert allocations[Reference('macro[4]')].cores == { - 'node002': {1, 2, 3, 4}} - - assert allocations[Reference('micro[0]')].cores == { - 'node001': {1, 2, 3, 4}} - assert allocations[Reference('micro[1]')].cores == { - 'node002': {1, 2, 3, 4}} - assert allocations[Reference('micro[2]')].cores == { - 'node003': {1, 2, 3, 4}} - assert allocations[Reference('micro[3]')].cores == { - 'node001': {1, 2, 3, 4}} - assert allocations[Reference('micro[4]')].cores == { - 'node002': {1, 2, 3, 4}} + assert allocations[Ref('init[0]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('init[1]')].by_rank == [onr('node002', {1, 2, 3, 4})] + assert allocations[Ref('init[2]')].by_rank == [onr('node003', {1, 2, 3, 4})] + assert allocations[Ref('init[3]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('init[4]')].by_rank == [onr('node002', {1, 2, 3, 4})] + + assert allocations[Ref('macro[0]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('macro[1]')].by_rank == [onr('node002', {1, 2, 3, 4})] + assert allocations[Ref('macro[2]')].by_rank == [onr('node003', {1, 2, 3, 4})] + assert allocations[Ref('macro[3]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('macro[4]')].by_rank == [onr('node002', {1, 2, 3, 4})] + + assert allocations[Ref('micro[0]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('micro[1]')].by_rank == [onr('node002', {1, 2, 3, 4})] + assert allocations[Ref('micro[2]')].by_rank == [onr('node003', {1, 2, 3, 4})] + assert allocations[Ref('micro[3]')].by_rank == [onr('node001', {1, 2, 3, 4})] + assert allocations[Ref('micro[4]')].by_rank == [onr('node002', {1, 2, 3, 4})] def 
test_oversubscribe_single_instance_threaded() -> None: model = Model('single_instance', [Component('x', 'x', ports=Ports())]) - impl = [Implementation(Reference('x'), script='x')] + impl = [Implementation(Ref('x'), script='x')] reqs: Dict[Reference, ResourceRequirements] = { - Reference('x'): ThreadedResReq(Reference('x'), 24)} + Ref('x'): ThreadedResReq(Ref('x'), 24)} config = Configuration(model, None, impl, reqs) - res = Resources({'node001': {1, 2, 3, 4}}) + res = resources({'node001': [c(1), c(2), c(3), c(4)]}) planner = Planner(res) allocations = planner.allocate_all(config) - assert allocations[Reference('x')].cores == {'node001': {1, 2, 3, 4}} + assert allocations[Ref('x')].by_rank == [onr('node001', {1, 2, 3, 4})] def test_oversubscribe_single_instance_mpi() -> None: model = Model('single_instance', [Component('x', 'x', ports=Ports())]) - impl = [Implementation(Reference('x'), script='x')] + impl = [Implementation(Ref('x'), script='x')] reqs: Dict[Reference, ResourceRequirements] = { - Reference('x'): MPICoresResReq(Reference('x'), 24)} + Ref('x'): MPICoresResReq(Ref('x'), 24)} config = Configuration(model, None, impl, reqs) - res = Resources({'node001': {1, 2, 3, 4}}) + res = resources({'node001': [c(1), c(2), c(3), c(4)]}) planner = Planner(res) allocations = planner.allocate_all(config) - assert allocations[Reference('x')].cores == {'node001': {1, 2, 3, 4}} + assert len(allocations[Ref('x')].by_rank) == 24 + for r in range(24): + assert allocations[Ref('x')].by_rank[r] == onr('node001', {r % 4 + 1}) def test_virtual_allocation() -> None: model = Model('ensemble', [Component('x', 'x', 9, ports=Ports())]) - impl = [Implementation(Reference('x'), script='x')] - reqs: Dict[Reference, ResourceRequirements] = { - Reference('x'): MPICoresResReq(Reference('x'), 13)} + impl = [Implementation(Ref('x'), script='x')] + reqs: Dict[Ref, ResourceRequirements] = { + Ref('x'): MPICoresResReq(Ref('x'), 13)} config = Configuration(model, None, impl, reqs) - res = Resources({'node000001': {1, 2, 3, 4}}) + res = resources({'node000001': [c(1), c(2), c(3), c(4)]}) planner = Planner(res) allocations = planner.allocate_all(config, virtual=True) assert res.total_cores() == 120 - assert allocations[Reference('x[0]')].total_cores() == 13 - assert allocations[Reference('x[8]')].total_cores() == 13 + for i in range(9): + for r in range(13): + assert len(allocations[Ref(f'x[{i}]')].by_rank) == 13 + assert allocations[Ref(f'x[{i}]')].by_rank[r].total_cores() == 1 def test_impossible_virtual_allocation() -> None: model = Model('ensemble', [Component('x', 'x', 9, ports=Ports())]) - impl = [Implementation(Reference('x'), script='x')] - reqs: Dict[Reference, ResourceRequirements] = { - Reference('x'): ThreadedResReq(Reference('x'), 13)} + impl = [Implementation(Ref('x'), script='x')] + reqs: Dict[Ref, ResourceRequirements] = { + Ref('x'): ThreadedResReq(Ref('x'), 13)} config = Configuration(model, None, impl, reqs) - res = Resources({'node000001': {1, 2, 3, 4}}) + res = resources({'node000001': [c(1), c(2), c(3), c(4)]}) planner = Planner(res) with pytest.raises(InsufficientResourcesAvailable): diff --git a/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py b/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py index cf6067d4..13ec5ce3 100644 --- a/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py +++ b/libmuscle/python/libmuscle/planner/test/test_planner_scenarios.py @@ -1,6 +1,4 @@ from copy import deepcopy -from libmuscle.planner.planner import ModelGraph, Planner, 
Resources - from typing import Dict, Tuple import pytest @@ -8,6 +6,11 @@ Component, Conduit, Configuration, Implementation, Model, MPICoresResReq, Ports, Reference, ResourceRequirements, ThreadedResReq) +from libmuscle.planner.planner import ModelGraph, Planner, ResourceAssignment +from libmuscle.planner.resources import Resources + +from libmuscle.test.conftest import core as c, on_node_resources as onr, resources + _ResReqs = Dict[Reference, ResourceRequirements] @@ -38,12 +41,12 @@ s0_model, None, s0_implementations, s0_requirements) -s0_resources = Resources({'node001': {0, 1, 2, 3}}) +s0_resources = resources({'node001': [c(0), c(1), c(2), c(3)]}) s0_solution = { - Reference('macro'): Resources({'node001': {0, 1}}), - Reference('micro'): Resources({'node001': {2, 3}})} + Reference('macro'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('micro'): ResourceAssignment([onr('node001', {2, 3})])} s1_model = Model( @@ -83,14 +86,14 @@ s1_model, None, s1_implementations, s1_requirements) -s1_resources = Resources({'node001': {0, 1, 2, 3}}) +s1_resources = resources({'node001': [c(0), c(1), c(2), c(3)]}) s1_solution = { - Reference('macro'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro1'): Resources({'node001': {0, 1}}), - Reference('micro2'): Resources({'node001': {0, 1}}), - Reference('micro3'): Resources({'node001': {0}})} + Reference('macro'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro1'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('micro2'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('micro3'): ResourceAssignment([onr('node001', 0)])} s2_model = Model( @@ -125,13 +128,14 @@ s2_model, None, s2_implementations, s2_requirements) -s2_resources = Resources({'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}}) +s2_resources = resources( + {'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)]}) s2_solution = { - Reference('macro'): Resources({'node001': {0}}), - Reference('micro1'): Resources({'node001': {0, 1, 2}}), - Reference('micro2'): Resources({'node002': {0, 1}})} + Reference('macro'): ResourceAssignment([onr('node001', 0)]), + Reference('micro1'): ResourceAssignment([onr('node001', {0, 1, 2})]), + Reference('micro2'): ResourceAssignment([onr('node002', {0, 1})])} s3_model = Model( @@ -170,14 +174,17 @@ s3_model, None, s3_implementations, s3_requirements) -s3_resources = Resources({'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}}) +s3_resources = resources( + {'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)]}) s3_solution = { - Reference('a'): Resources({'node001': {0}}), - Reference('b1'): Resources({'node001': {2, 3}, 'node002': {0, 1, 2, 3}}), - Reference('b2'): Resources({'node001': {0, 1}}), - Reference('c'): Resources({'node001': {0, 1, 2, 3}})} + Reference('a'): ResourceAssignment([onr('node001', 0)]), + Reference('b1'): ResourceAssignment([ + onr('node001', 2), onr('node001', 3), onr('node002', 0), onr('node002', 1), + onr('node002', 2), onr('node002', 3)]), + Reference('b2'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('c'): ResourceAssignment([onr('node001', {0, 1, 2, 3})])} s4_model = Model( @@ -213,13 +220,14 @@ s4_model, None, s4_implementations, s4_requirements) -s4_resources = Resources({'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}}) +s4_resources = resources( + {'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)]}) s4_solution = { - Reference('macro1'): Resources({'node002': {0, 1}}), - Reference('macro2'): 
Resources({'node001': {0, 1, 2}}), - Reference('micro'): Resources({'node001': {0, 1, 2}})} + Reference('macro1'): ResourceAssignment([onr('node002', {0, 1})]), + Reference('macro2'): ResourceAssignment([onr('node001', {0, 1, 2})]), + Reference('micro'): ResourceAssignment([onr('node001', {0, 1, 2})])} s5_model = Model( @@ -261,18 +269,19 @@ s5_model, None, s5_implementations, s5_requirements) -s5_resources = Resources({ - 'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}, 'node003': {0, 1}}) +s5_resources = resources({ + 'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)], + 'node003': [c(0), c(1)]}) # This is inefficient, as the models can all share resources. But repeater # is funny, and the algorithm cannot deal with it yet. It does give a valid # result with no overlap, so we'll accept that for the time being. s5_solution = { - Reference('init'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro'): Resources({'node002': {0, 1, 2, 3}}), - Reference('repeater'): Resources({'node003': {0}})} + Reference('init'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('repeater'): ResourceAssignment([onr('node003', 0)])} s6_model = Model( @@ -308,22 +317,22 @@ s6_model, None, s6_implementations, s6_requirements) -s6_resources = Resources({ - 'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}, - 'node003': {0, 1, 2, 3}, 'node004': {0, 1, 2, 3}, - 'node005': {0, 1, 2, 3}, 'node006': {0, 1, 2, 3} +s6_resources = resources({ + 'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)], + 'node003': [c(0), c(1), c(2), c(3)], 'node004': [c(0), c(1), c(2), c(3)], + 'node005': [c(0), c(1), c(2), c(3)], 'node006': [c(0), c(1), c(2), c(3)] }) s6_solution = { - Reference('a'): Resources({'node001': {0, 1, 2, 3}}), - Reference('tcf'): Resources({'node002': {0}}), - Reference('b'): Resources({ - 'node002': {1, 2, 3}, - 'node003': {0, 1, 2, 3}, - 'node004': {0, 1, 2, 3}, - 'node005': {0, 1, 2, 3}, - 'node006': {0}})} + Reference('a'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('tcf'): ResourceAssignment([onr('node002', 0)]), + Reference('b'): ResourceAssignment([ + onr('node002', 1), onr('node002', 2), onr('node002', 3), onr('node003', 0), + onr('node003', 1), onr('node003', 2), onr('node003', 3), onr('node004', 0), + onr('node004', 1), onr('node004', 2), onr('node004', 3), onr('node005', 0), + onr('node005', 1), onr('node005', 2), onr('node005', 3), onr('node006', 0)]) + } s7_model = Model( @@ -364,47 +373,70 @@ s7_model, None, s7_implementations, s7_requirements) -s7_resources = Resources({ - 'node001': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node002': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node003': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node004': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node005': {0, 1, 2, 3, 4, 5, 6, 7}, +s7_resources = resources({ + 'node001': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node002': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node003': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node004': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node005': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], }) s7_solution = { - Reference('mc'): Resources({'node001': {0}}), - Reference('init[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('init[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('init[2]'): 
Resources({'node002': {0, 1, 2, 3}}), - Reference('init[3]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('init[4]'): Resources({'node003': {0, 1, 2, 3}}), - Reference('init[5]'): Resources({'node003': {4, 5, 6, 7}}), - Reference('init[6]'): Resources({'node004': {0, 1, 2, 3}}), - Reference('init[7]'): Resources({'node004': {4, 5, 6, 7}}), - Reference('init[8]'): Resources({'node005': {0, 1, 2, 3}}), - Reference('init[9]'): Resources({'node005': {4, 5, 6, 7}}), - Reference('macro[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('macro[3]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('macro[4]'): Resources({'node003': {0, 1, 2, 3}}), - Reference('macro[5]'): Resources({'node003': {4, 5, 6, 7}}), - Reference('macro[6]'): Resources({'node004': {0, 1, 2, 3}}), - Reference('macro[7]'): Resources({'node004': {4, 5, 6, 7}}), - Reference('macro[8]'): Resources({'node005': {0, 1, 2, 3}}), - Reference('macro[9]'): Resources({'node005': {4, 5, 6, 7}}), - Reference('micro[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('micro[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('micro[3]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('micro[4]'): Resources({'node003': {0, 1, 2, 3}}), - Reference('micro[5]'): Resources({'node003': {4, 5, 6, 7}}), - Reference('micro[6]'): Resources({'node004': {0, 1, 2, 3}}), - Reference('micro[7]'): Resources({'node004': {4, 5, 6, 7}}), - Reference('micro[8]'): Resources({'node005': {0, 1, 2, 3}}), - Reference('micro[9]'): Resources({'node005': {4, 5, 6, 7}})} + Reference('mc'): ResourceAssignment([onr('node001', 0)]), + + Reference('init[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('init[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('init[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('init[3]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('init[4]'): ResourceAssignment([onr('node003', {0, 1, 2, 3})]), + Reference('init[5]'): ResourceAssignment([onr('node003', {4, 5, 6, 7})]), + Reference('init[6]'): ResourceAssignment([onr('node004', {0, 1, 2, 3})]), + Reference('init[7]'): ResourceAssignment([onr('node004', {4, 5, 6, 7})]), + Reference('init[8]'): ResourceAssignment([onr('node005', {0, 1, 2, 3})]), + Reference('init[9]'): ResourceAssignment([onr('node005', {4, 5, 6, 7})]), + + Reference('macro[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('macro[3]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('macro[4]'): ResourceAssignment([onr('node003', {0, 1, 2, 3})]), + Reference('macro[5]'): ResourceAssignment([onr('node003', {4, 5, 6, 7})]), + Reference('macro[6]'): ResourceAssignment([onr('node004', {0, 1, 2, 3})]), + Reference('macro[7]'): ResourceAssignment([onr('node004', {4, 5, 6, 7})]), + Reference('macro[8]'): ResourceAssignment([onr('node005', {0, 1, 2, 3})]), + Reference('macro[9]'): ResourceAssignment([onr('node005', {4, 5, 6, 7})]), + + Reference('micro[0]'): ResourceAssignment([ + onr('node001', 0), onr('node001', 1), onr('node001', 2), + onr('node001', 3)]), + Reference('micro[1]'): ResourceAssignment([ + onr('node001', 4), onr('node001', 5), 
onr('node001', 6), + onr('node001', 7)]), + Reference('micro[2]'): ResourceAssignment([ + onr('node002', 0), onr('node002', 1), onr('node002', 2), + onr('node002', 3)]), + Reference('micro[3]'): ResourceAssignment([ + onr('node002', 4), onr('node002', 5), onr('node002', 6), + onr('node002', 7)]), + Reference('micro[4]'): ResourceAssignment([ + onr('node003', 0), onr('node003', 1), onr('node003', 2), + onr('node003', 3)]), + Reference('micro[5]'): ResourceAssignment([ + onr('node003', 4), onr('node003', 5), onr('node003', 6), + onr('node003', 7)]), + Reference('micro[6]'): ResourceAssignment([ + onr('node004', 0), onr('node004', 1), onr('node004', 2), + onr('node004', 3)]), + Reference('micro[7]'): ResourceAssignment([ + onr('node004', 4), onr('node004', 5), onr('node004', 6), + onr('node004', 7)]), + Reference('micro[8]'): ResourceAssignment([ + onr('node005', 0), onr('node005', 1), onr('node005', 2), + onr('node005', 3)]), + Reference('micro[9]'): ResourceAssignment([ + onr('node005', 4), onr('node005', 5), onr('node005', 6), + onr('node005', 7)])} s8_model = Model( @@ -441,13 +473,14 @@ s8_model, None, s8_implementations, s8_requirements) -s8_resources = Resources({'node001': {0, 1, 2, 3}, 'node002': {0, 1, 2, 3}}) +s8_resources = resources( + {'node001': [c(0), c(1), c(2), c(3)], 'node002': [c(0), c(1), c(2), c(3)]}) s8_solution = { - Reference('macro'): Resources({'node001': {3}}), - Reference('micro1'): Resources({'node001': {0, 1, 2}}), - Reference('micro2'): Resources({'node001': {0, 1}})} + Reference('macro'): ResourceAssignment([onr('node001', 3)]), + Reference('micro1'): ResourceAssignment([onr('node001', {0, 1, 2})]), + Reference('micro2'): ResourceAssignment([onr('node001', {0, 1})])} s9_model = Model( @@ -489,15 +522,15 @@ s9_model, None, s9_implementations, s9_requirements) -s9_resources = Resources({'node001': {0, 1, 2, 3}}) +s9_resources = resources({'node001': [c(0), c(1), c(2), c(3)]}) s9_solution = { - Reference('a'): Resources({'node001': {1}}), - Reference('b'): Resources({'node001': {0}}), - Reference('c'): Resources({'node001': {0}}), - Reference('d'): Resources({'node001': {1}}), - Reference('e'): Resources({'node001': {0}})} + Reference('a'): ResourceAssignment([onr('node001', 1)]), + Reference('b'): ResourceAssignment([onr('node001', 0)]), + Reference('c'): ResourceAssignment([onr('node001', 0)]), + Reference('d'): ResourceAssignment([onr('node001', 1)]), + Reference('e'): ResourceAssignment([onr('node001', 0)])} s10_model = Model( @@ -541,32 +574,40 @@ s10_model, None, s10_implementations, s10_requirements) -s10_resources = Resources({ - 'node001': {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - 'node002': {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - 'node003': {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, +s10_resources = resources({ + 'node001': [ + c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7), + c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)], + 'node002': [ + c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7), + c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)], + 'node003': [ + c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7), + c(8), c(9), c(10), c(11), c(12), c(13), c(14), c(15)], }) s10_solution = { - Reference('mc'): Resources({'node001': {0}}), - Reference('rr'): Resources({'node001': {0}}), - Reference('macro[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro[2]'): Resources({'node001': {8, 9, 10, 11}}), - Reference('macro[3]'): 
Resources({'node001': {12, 13, 14, 15}}), - Reference('macro[4]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('macro[5]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('macro[6]'): Resources({'node002': {8, 9, 10, 11}}), - Reference('macro[7]'): Resources({'node002': {12, 13, 14, 15}}), - Reference('micro[0]'): Resources({'node001': {0, 1}}), - Reference('micro[1]'): Resources({'node001': {4, 5}}), - Reference('micro[2]'): Resources({'node001': {8, 9}}), - Reference('micro[3]'): Resources({'node001': {12, 13}}), - Reference('micro[4]'): Resources({'node002': {0, 1}}), - Reference('micro[5]'): Resources({'node002': {4, 5}}), - Reference('micro[6]'): Resources({'node002': {8, 9}}), - Reference('micro[7]'): Resources({'node002': {12, 13}})} + Reference('mc'): ResourceAssignment([onr('node001', 0)]), + Reference('rr'): ResourceAssignment([onr('node001', 0)]), + + Reference('macro[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro[2]'): ResourceAssignment([onr('node001', {8, 9, 10, 11})]), + Reference('macro[3]'): ResourceAssignment([onr('node001', {12, 13, 14, 15})]), + Reference('macro[4]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('macro[5]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('macro[6]'): ResourceAssignment([onr('node002', {8, 9, 10, 11})]), + Reference('macro[7]'): ResourceAssignment([onr('node002', {12, 13, 14, 15})]), + + Reference('micro[0]'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('micro[1]'): ResourceAssignment([onr('node001', {4, 5})]), + Reference('micro[2]'): ResourceAssignment([onr('node001', {8, 9})]), + Reference('micro[3]'): ResourceAssignment([onr('node001', {12, 13})]), + Reference('micro[4]'): ResourceAssignment([onr('node002', {0, 1})]), + Reference('micro[5]'): ResourceAssignment([onr('node002', {4, 5})]), + Reference('micro[6]'): ResourceAssignment([onr('node002', {8, 9})]), + Reference('micro[7]'): ResourceAssignment([onr('node002', {12, 13})])} s11_model = Model( @@ -605,26 +646,25 @@ s11_config = Configuration(s11_model, None, s11_implementations, s11_requirements) -s11_resources = Resources({ - 'node001': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node002': {0, 1, 2, 3, 4, 5, 6, 7}, +s11_resources = resources({ + 'node001': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node002': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], }) s11_solution = { - Reference('macro1[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro1[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro1[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('micro1[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro1[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('micro1[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('macro2[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro2[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro2[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('micro2[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro2[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('micro2[2]'): Resources({'node002': {0, 1, 2, 3}}), - } + Reference('macro1[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro1[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro1[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('micro1[0]'): 
ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro1[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('micro1[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('macro2[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro2[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro2[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('micro2[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro2[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('micro2[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})])} s12_model = deepcopy(s11_model) @@ -646,14 +686,16 @@ s12_solution = { - Reference('macro1'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro1[0]'): Resources({'node001': {0, 1, 2, 3, 4, 5, 6, 7}}), - Reference('micro1[1]'): Resources({'node002': {0, 1, 2, 3, 4, 5, 6, 7}}), - Reference('macro2'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro2[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro2[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('micro2[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('micro2[3]'): Resources({'node002': {4, 5, 6, 7}}), + Reference('macro1'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro1[0]'): ResourceAssignment([ + onr('node001', {0, 1, 2, 3, 4, 5, 6, 7})]), + Reference('micro1[1]'): ResourceAssignment([ + onr('node002', {0, 1, 2, 3, 4, 5, 6, 7})]), + Reference('macro2'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro2[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro2[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('micro2[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('micro2[3]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), } @@ -675,59 +717,59 @@ s13_config = Configuration(s13_model, None, s11_implementations, s13_requirements) -s13_resources = Resources({ - 'node001': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node002': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node003': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node004': {0, 1, 2, 3, 4, 5, 6, 7}, - 'node005': {0, 1, 2, 3, 4, 5, 6, 7}, +s13_resources = resources({ + 'node001': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node002': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node003': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node004': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], + 'node005': [c(0), c(1), c(2), c(3), c(4), c(5), c(6), c(7)], }) s13_solution = { - Reference('macro1[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro1[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro1[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('macro1[3]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('macro1[4]'): Resources({'node003': {0, 1, 2, 3}}), - - Reference('micro1[0][0]'): Resources({'node001': {0, 1}}), - Reference('micro1[0][1]'): Resources({'node001': {2, 3}}), - Reference('micro1[0][2]'): Resources({'node003': {4, 5}}), - Reference('micro1[0][3]'): Resources({'node003': {6, 7}}), - Reference('micro1[1][0]'): Resources({'node001': {4, 5}}), - Reference('micro1[1][1]'): Resources({'node001': {6, 7}}), - Reference('micro1[1][2]'): Resources({'node004': {0, 1}}), - Reference('micro1[1][3]'): Resources({'node004': {2, 3}}), - Reference('micro1[2][0]'): Resources({'node002': {0, 1}}), - Reference('micro1[2][1]'): Resources({'node002': {2, 3}}), - 
Reference('micro1[2][2]'): Resources({'node004': {4, 5}}), - Reference('micro1[2][3]'): Resources({'node004': {6, 7}}), - Reference('micro1[3][0]'): Resources({'node002': {4, 5}}), - Reference('micro1[3][1]'): Resources({'node002': {6, 7}}), - Reference('micro1[3][2]'): Resources({'node005': {0, 1}}), - Reference('micro1[3][3]'): Resources({'node005': {2, 3}}), - Reference('micro1[4][0]'): Resources({'node003': {0, 1}}), - Reference('micro1[4][1]'): Resources({'node003': {2, 3}}), - Reference('micro1[4][2]'): Resources({'node005': {4, 5}}), - Reference('micro1[4][3]'): Resources({'node005': {6, 7}}), - - Reference('macro2[0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('macro2[1]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('macro2[2]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('macro2[3]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('macro2[4]'): Resources({'node003': {0, 1, 2, 3}}), - - Reference('micro2[0][0]'): Resources({'node001': {0, 1, 2, 3}}), - Reference('micro2[0][1]'): Resources({'node003': {4, 5, 6, 7}}), - Reference('micro2[1][0]'): Resources({'node001': {4, 5, 6, 7}}), - Reference('micro2[1][1]'): Resources({'node004': {0, 1, 2, 3}}), - Reference('micro2[2][0]'): Resources({'node002': {0, 1, 2, 3}}), - Reference('micro2[2][1]'): Resources({'node004': {4, 5, 6, 7}}), - Reference('micro2[3][0]'): Resources({'node002': {4, 5, 6, 7}}), - Reference('micro2[3][1]'): Resources({'node005': {0, 1, 2, 3}}), - Reference('micro2[4][0]'): Resources({'node003': {0, 1, 2, 3}}), - Reference('micro2[4][1]'): Resources({'node005': {4, 5, 6, 7}}), + Reference('macro1[0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro1[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro1[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('macro1[3]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('macro1[4]'): ResourceAssignment([onr('node003', {0, 1, 2, 3})]), + + Reference('micro1[0][0]'): ResourceAssignment([onr('node001', {0, 1})]), + Reference('micro1[0][1]'): ResourceAssignment([onr('node001', {2, 3})]), + Reference('micro1[0][2]'): ResourceAssignment([onr('node003', {4, 5})]), + Reference('micro1[0][3]'): ResourceAssignment([onr('node003', {6, 7})]), + Reference('micro1[1][0]'): ResourceAssignment([onr('node001', {4, 5})]), + Reference('micro1[1][1]'): ResourceAssignment([onr('node001', {6, 7})]), + Reference('micro1[1][2]'): ResourceAssignment([onr('node004', {0, 1})]), + Reference('micro1[1][3]'): ResourceAssignment([onr('node004', {2, 3})]), + Reference('micro1[2][0]'): ResourceAssignment([onr('node002', {0, 1})]), + Reference('micro1[2][1]'): ResourceAssignment([onr('node002', {2, 3})]), + Reference('micro1[2][2]'): ResourceAssignment([onr('node004', {4, 5})]), + Reference('micro1[2][3]'): ResourceAssignment([onr('node004', {6, 7})]), + Reference('micro1[3][0]'): ResourceAssignment([onr('node002', {4, 5})]), + Reference('micro1[3][1]'): ResourceAssignment([onr('node002', {6, 7})]), + Reference('micro1[3][2]'): ResourceAssignment([onr('node005', {0, 1})]), + Reference('micro1[3][3]'): ResourceAssignment([onr('node005', {2, 3})]), + Reference('micro1[4][0]'): ResourceAssignment([onr('node003', {0, 1})]), + Reference('micro1[4][1]'): ResourceAssignment([onr('node003', {2, 3})]), + Reference('micro1[4][2]'): ResourceAssignment([onr('node005', {4, 5})]), + Reference('micro1[4][3]'): ResourceAssignment([onr('node005', {6, 7})]), + + Reference('macro2[0]'): 
ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('macro2[1]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('macro2[2]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('macro2[3]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('macro2[4]'): ResourceAssignment([onr('node003', {0, 1, 2, 3})]), + + Reference('micro2[0][0]'): ResourceAssignment([onr('node001', {0, 1, 2, 3})]), + Reference('micro2[0][1]'): ResourceAssignment([onr('node003', {4, 5, 6, 7})]), + Reference('micro2[1][0]'): ResourceAssignment([onr('node001', {4, 5, 6, 7})]), + Reference('micro2[1][1]'): ResourceAssignment([onr('node004', {0, 1, 2, 3})]), + Reference('micro2[2][0]'): ResourceAssignment([onr('node002', {0, 1, 2, 3})]), + Reference('micro2[2][1]'): ResourceAssignment([onr('node004', {4, 5, 6, 7})]), + Reference('micro2[3][0]'): ResourceAssignment([onr('node002', {4, 5, 6, 7})]), + Reference('micro2[3][1]'): ResourceAssignment([onr('node005', {0, 1, 2, 3})]), + Reference('micro2[4][0]'): ResourceAssignment([onr('node003', {0, 1, 2, 3})]), + Reference('micro2[4][1]'): ResourceAssignment([onr('node005', {4, 5, 6, 7})]), } @@ -763,7 +805,7 @@ s14_model, None, s14_implementations, s14_requirements) -s14_resources = Resources({'node001': {0, 1, 2, 3, 4, 5}}) +s14_resources = resources({'node001': [c(0), c(1), c(2), c(3), c(4), c(5)]}) s14_solution = RuntimeError @@ -810,16 +852,20 @@ def test_scenarios(scenario: _Scenario) -> None: if isinstance(req, ThreadedResReq): for instance in component.instances(): - assert len(list(allocations[instance].nodes())) == 1 - assert allocations[instance].total_cores() == req.threads + assert len(allocations[instance].by_rank) == 1 + assert allocations[instance].by_rank[0].total_cores() == req.threads elif isinstance(req, MPICoresResReq): for instance in component.instances(): - tcores = allocations[instance].total_cores() - assert tcores == req.mpi_processes + nranks = len(allocations[instance].by_rank) + assert nranks == req.mpi_processes + for r in range(nranks): + assert allocations[instance].by_rank[r].total_cores() == 1 # check for any overlapping instances - for instance1, res1 in allocations.items(): - for instance2, res2 in allocations.items(): + for instance1, res_asm1 in allocations.items(): + for instance2, res_asm2 in allocations.items(): + res1 = res_asm1.as_resources() + res2 = res_asm2.as_resources() cname1 = instance1.without_trailing_ints() cname2 = instance2.without_trailing_ints() if cname1 != cname2: diff --git a/libmuscle/python/libmuscle/planner/test/test_resources.py b/libmuscle/python/libmuscle/planner/test/test_resources.py new file mode 100644 index 00000000..f0158850 --- /dev/null +++ b/libmuscle/python/libmuscle/planner/test/test_resources.py @@ -0,0 +1,435 @@ +from copy import copy + +import pytest + +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources, Resources + + +@pytest.fixture +def c1(): + return Core(0, {0, 1}) + + +def test_core_equals(c1): + c2 = Core(0, {0, 1}) + c3 = Core(1, {0, 1}) + c4 = Core(0, {2, 3}) + + assert c1 == c2 + assert not c1 != c2 + assert c1 != c3 + assert c1 != c4 + assert c3 != c4 + + +def test_core_length(c1): + assert len(c1) == 2 + + c2 = Core(1, {4, 5, 6, 7}) + assert len(c2) == 4 + + +def test_core_copy(c1): + c2 = copy(c1) + assert c2.cid == 0 + assert c2.hwthreads == {0, 1} + + c2.hwthreads.add(2) + assert c1.hwthreads == {0, 1} + assert c2.hwthreads == {0, 1, 2} + + +def test_core_union(): + c1 = Core(3, {3}) + c2 = Core(3, {4}) + + 
assert c1 | c2 == Core(3, {3, 4}) + + c3 = Core(2, {2}) + with pytest.raises(ValueError): + c1 | c3 + + +def test_core_union_onto(c1): + c2 = Core(0, {2, 3}) + + c1 |= c2 + assert c1.hwthreads == {0, 1, 2, 3} + assert c2.hwthreads == {2, 3} + + c3 = Core(3, {6, 7}) + with pytest.raises(ValueError): + c1 |= c3 + + +def test_core_subtract(): + c1 = Core(0, {0, 1, 2, 3}) + c2 = Core(0, {0, 3}) + + c1 -= c2 + assert c1.cid == 0 + assert c1.hwthreads == {1, 2} + + c3 = Core(0, {2, 3}) + c1 -= c3 + assert c1.cid == 0 + assert c1.hwthreads == {1} + + c4 = Core(1, {1, 2}) + with pytest.raises(ValueError): + c1 -= c4 + + +def test_core_isdisjoint(c1): + c2 = Core(0, {0}) + c3 = Core(0, {2, 3}) + c4 = Core(1, {0, 1}) + + assert not c1.isdisjoint(c2) + assert not c2.isdisjoint(c1) + assert c1.isdisjoint(c3) + + with pytest.raises(ValueError): + c1.isdisjoint(c4) + + +def test_core_str(c1): + assert str(c1) == '0(0,1)' + + +def test_core_repr(c1): + assert repr(c1) == 'Core(0, {0,1})' + + +@pytest.fixture +def cs1(): + return CoreSet([Core(0, {0, 1}), Core(1, {2, 3})]) + + +def test_core_set_equals(cs1): + cs2 = CoreSet([Core(0, {0, 1}), Core(1, {2, 3})]) + cs3 = CoreSet([Core(1, {2, 3})]) + cs4 = CoreSet([]) + cs5 = CoreSet([Core(0, {0, 1}), Core(1, {2, 3}), Core(2, {4, 5})]) + cs6 = CoreSet([Core(3, {6, 7})]) + + assert cs1 == cs2 + assert not cs1 != cs2 + assert cs1 != cs3 + assert cs1 != cs4 + assert cs1 != cs5 + assert cs1 != cs6 + assert not cs3 == cs4 + assert cs4 != cs5 + + +def test_core_set_length(cs1): + cs2 = CoreSet([]) + cs3 = CoreSet([Core(3, {6, 7})]) + + assert len(cs1) == 2 + assert len(cs2) == 0 + assert len(cs3) == 1 + + +def test_core_set_iter(cs1): + for i, core in enumerate(cs1): + assert i == core.cid + assert core.hwthreads == {i * 2, i * 2 + 1} + + assert i == 1 + + +def test_core_set_copy(cs1): + cs2 = copy(cs1) + assert cs1 == cs2 + + cs2._cores[2] = Core(2, {4, 5}) + assert len(cs1._cores) == 2 + + cs2._cores[0].hwthreads.add(2) + assert 2 not in cs1._cores[0].hwthreads + + +def test_core_set_union_onto(cs1): + cs2 = CoreSet([Core(3, {6, 7})]) + cs1 |= cs2 + + assert len(cs1) == 3 + assert 0 in cs1._cores + assert cs1._cores[0].cid == 0 + assert cs1._cores[0].hwthreads == {0, 1} + assert 1 in cs1._cores + assert cs1._cores[1].cid == 1 + assert cs1._cores[1].hwthreads == {2, 3} + assert 3 in cs1._cores + assert cs1._cores[3].cid == 3 + assert cs1._cores[3].hwthreads == {6, 7} + + assert id(cs1._cores[3]) != id(cs2._cores[3]) + assert id(cs1._cores[3].hwthreads) != id(cs2._cores[3].hwthreads) + + +def test_core_set_subtract_disjunct(cs1): + cs2 = CoreSet([Core(3, {6, 7})]) + cs1 -= cs2 + + assert len(cs1) == 2 + assert 0 in cs1._cores + assert 1 in cs1._cores + + assert len(cs2) == 1 + assert 3 in cs2._cores + + +def test_core_set_subtract_whole_core(cs1): + cs2 = CoreSet([Core(0, {0, 1})]) + cs1 -= cs2 + + assert len(cs1) == 1 + assert 0 not in cs1._cores + assert 1 in cs1._cores + + assert len(cs2) == 1 + assert 0 in cs2._cores + + +def test_core_set_subtract_threads(cs1): + cs2 = CoreSet([Core(1, {2})]) + i1 = id(cs1._cores[1]) + + cs1 -= cs2 + + assert len(cs1) == 2 + assert 0 in cs1._cores + assert 1 in cs1._cores + assert id(cs1._cores[1]) == i1 + assert len(cs1._cores[1]) == 1 + assert cs1._cores[1].hwthreads == {3} + assert cs1._cores[0].hwthreads == {0, 1} + + +def test_core_set_str(cs1): + assert str(cs1) == '0-1(0-3)' + + +def test_core_set_repr(cs1): + assert repr(cs1) == 'CoreSet({Core(0, {0,1}), Core(1, {2,3})})' + + +def test_core_set_get_first_cores(cs1): + 
assert cs1.get_first_cores(0)._cores == {} + assert cs1.get_first_cores(1)._cores == {0: Core(0, {0, 1})} + assert cs1.get_first_cores(2)._cores == { + 0: Core(0, {0, 1}), + 1: Core(1, {2, 3})} + with pytest.raises(RuntimeError): + cs1.get_first_cores(3) + + +@pytest.fixture +def n1(cs1): + return OnNodeResources('node001', cs1) + + +def test_node_resources_equals(n1): + n2 = OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})])) + n3 = OnNodeResources('node002', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})])) + n4 = OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(1, {4, 3})])) + + assert n1 == n2 + assert n1 != n3 + assert n1 != n4 + + +def test_node_resources_copy(n1): + n2 = copy(n1) + + assert n1 == n2 + assert id(n1.cpu_cores) != id(n2.cpu_cores) + assert id(n1.cpu_cores._cores[0]) != id(n2.cpu_cores._cores[0]) + assert id(n1.cpu_cores._cores[1].hwthreads) != id(n2.cpu_cores._cores[1].hwthreads) + + +def test_node_resources_union_onto(n1): + n2 = OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(4, {8, 9, 10, 11})])) + n3 = OnNodeResources('node001', CoreSet([Core(0, {3})])) + n4 = OnNodeResources('node002', CoreSet([Core(3, {3})])) + + n1 |= n2 + + assert len(n1.cpu_cores) == 3 + assert id(n1.cpu_cores._cores[4]) != id(n2.cpu_cores._cores[4]) + + n1 |= n3 + + assert len(n1.cpu_cores) == 3 + assert n1.cpu_cores._cores[0].hwthreads == {0, 1, 3} + + with pytest.raises(ValueError): + n1 |= n4 + + +def test_node_resources_hwthreads(n1): + assert list(n1.hwthreads()) == [0, 1, 2, 3] + + +def test_node_resources_subtract(n1): + n2 = OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(4, {8, 9, 10, 11})])) + n3 = OnNodeResources('node001', CoreSet([Core(1, {3})])) + n4 = OnNodeResources('node002', CoreSet([Core(3, {3})])) + + n1 -= n2 + + assert len(n1.cpu_cores) == 1 + assert len(n1.cpu_cores._cores[1]) == 2 + + n1 -= n3 + + assert len(n1.cpu_cores) == 1 + assert len(n1.cpu_cores._cores[1]) == 1 + + with pytest.raises(ValueError): + n1 -= n4 + + +@pytest.fixture +def r1(n1): + return Resources([n1]) + + +def test_resources_length(r1, n1): + r2 = Resources([n1, OnNodeResources('node002', CoreSet([Core(0, {0, 1})]))]) + + assert len(r1) == 1 + assert len(r2) == 2 + + +def test_resources_iter(cs1, n1): + n2 = OnNodeResources('node004', cs1) + n3 = OnNodeResources('node002', CoreSet([Core(3, {3})])) + nodes = [n1, n2, n3] + res = Resources(nodes) + + for i, n in enumerate(res): + assert n == nodes[i] + + +def test_resources_equals(r1): + assert r1 == Resources( + [OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})]))]) + + r2 = Resources( + [OnNodeResources('node002', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})]))]) + assert r1 != r2 + + r3 = Resources( + [OnNodeResources( + 'node001', CoreSet([Core(0, {0, 1}), Core(1, {1, 2, 3})]))]) + assert r1 != r3 + + r4 = Resources([OnNodeResources('node001', CoreSet([Core(1, {1, 2})]))]) + assert r1 != r4 + + r5 = Resources([ + OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})])), + OnNodeResources('node002', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})])) + ]) + assert r1 != r5 + + +def test_resources_copy(r1): + r2 = copy(r1) + assert id(r2._nodes['node001']) != id(r1._nodes['node001']) + assert id(r2._nodes['node001'].cpu_cores) != id(r1._nodes['node001'].cpu_cores) + + +def test_resources_union_onto(r1): + r2 = Resources([]) + r2 |= r1 + assert r2 == r1 + + r3 = Resources([OnNodeResources('node002', CoreSet([Core(0, {0})]))]) + r3 |= r1 + assert len(r3._nodes) == 2 + assert 
id(r3._nodes['node001']) != id(r1._nodes['node001']) + assert sorted(r3._nodes.keys()) == ['node001', 'node002'] + + +def test_resources_subtract(r1): + r2 = Resources([]) + r2 -= r1 + assert len(r2._nodes) == 0 + + r1 -= r2 + assert len(r1._nodes) == 1 + + r3 = Resources([OnNodeResources('node001', CoreSet([Core(0, {0})]))]) + r1 -= r3 + assert len(r1._nodes) == 1 + assert r1._nodes['node001'].cpu_cores._cores[0].hwthreads == {1} + + +def test_resources_nodes(): + r1 = Resources([ + OnNodeResources('node001', CoreSet([Core(0, {0})])), + OnNodeResources('node003', CoreSet([Core(1, {1})])), + OnNodeResources('node004', CoreSet([Core(2, {2})]))]) + + assert sorted(r1.nodes()) == ['node001', 'node003', 'node004'] + + +def test_resources_total_cores(): + r1 = Resources([ + OnNodeResources('node001', CoreSet([Core(0, {0, 1})])), + OnNodeResources('node003', CoreSet([Core(1, {1}), Core(5, {5})])), + OnNodeResources('node004', CoreSet([Core(2, {2})]))]) + + assert r1.total_cores() == 4 + + +def test_resource_hwthreads(n1, r1): + hwthreads = list(r1.hwthreads()) + assert hwthreads == [('node001', 0), ('node001', 1), ('node001', 2), ('node001', 3)] + + n2 = OnNodeResources('node007', CoreSet([Core(7, {7}), Core(3, {3})])) + res = Resources([n1, n2]) + + hwthreads = list(res.hwthreads()) + assert hwthreads == [ + ('node001', 0), ('node001', 1), ('node001', 2), ('node001', 3), + ('node007', 7), ('node007', 3)] + + +def test_resources_isdisjoint(r1): + r2 = Resources([]) + assert r1.isdisjoint(r2) + + r3 = Resources([OnNodeResources('node001', CoreSet([Core(0, {0})]))]) + assert not r1.isdisjoint(r3) + + r4 = Resources([OnNodeResources('node001', CoreSet([Core(0, {2})]))]) + assert r1.isdisjoint(r4) + + r5 = Resources([OnNodeResources('node002', CoreSet([Core(0, {0})]))]) + assert r1.isdisjoint(r5) + + +def test_resources_union(r1): + r2 = Resources([]) + r3 = Resources([OnNodeResources('node001', CoreSet([Core(0, {0})]))]) + r4 = Resources([OnNodeResources('node001', CoreSet([Core(0, {2})]))]) + r5 = Resources([OnNodeResources('node002', CoreSet([Core(0, {0})]))]) + + assert Resources.union([r1, r2]) == r1 + assert Resources.union([r1, r3]) == r1 + assert Resources.union([r1, r4]) == Resources([ + OnNodeResources('node001', CoreSet([Core(0, {0, 1, 2}), Core(1, {2, 3})]))]) + + assert Resources.union([r1, r5]) == Resources([ + OnNodeResources('node001', CoreSet([Core(0, {0, 1}), Core(1, {2, 3})])), + OnNodeResources('node002', CoreSet([Core(0, {0})]))]) + + assert Resources.union([r1, r2, r3, r4, r5]) == Resources([ + OnNodeResources('node001', CoreSet([Core(0, {0, 1, 2}), Core(1, {2, 3})])), + OnNodeResources('node002', CoreSet([Core(0, {0})]))]) diff --git a/libmuscle/python/libmuscle/post_office.py b/libmuscle/python/libmuscle/post_office.py index e15057fc..2ec2056c 100644 --- a/libmuscle/python/libmuscle/post_office.py +++ b/libmuscle/python/libmuscle/post_office.py @@ -2,15 +2,12 @@ import time from typing import Dict -import msgpack from ymmsl import Reference -from libmuscle.mcp.protocol import RequestType -from libmuscle.mcp.transport_server import RequestHandler from libmuscle.outbox import Outbox -class PostOffice(RequestHandler): +class PostOffice: """A PostOffice is an object that holds messages to be retrieved. A PostOffice holds outboxes with messages for receivers. It also @@ -23,25 +20,14 @@ def __init__(self) -> None: self._outbox_lock = Lock() - def handle_request(self, request: bytes) -> bytes: - """Handle a request.
- - This receives an MCP request and handles it by blocking until - the requested message is available, then returning it. + def have_message(self, receiver: Reference) -> bool: + """Return whether there's a message for the given receiver. Args: - request: A received request - - Returns: - An encoded response + receiver: The receiver of the message. """ - req = msgpack.unpackb(request, raw=False) - if len(req) != 2 or req[0] != RequestType.GET_NEXT_MESSAGE.value: - raise RuntimeError( - 'Invalid request type. Did the streams get crossed?') - recv_port = Reference(req[1]) - self._ensure_outbox_exists(recv_port) - return self._outboxes[recv_port].retrieve() + self._ensure_outbox_exists(receiver) + return not self._outboxes[receiver].is_empty() def get_message(self, receiver: Reference) -> bytes: """Get a message from a receiver's outbox. diff --git a/libmuscle/python/libmuscle/test/conftest.py b/libmuscle/python/libmuscle/test/conftest.py index 2802061d..b64a51e3 100644 --- a/libmuscle/python/libmuscle/test/conftest.py +++ b/libmuscle/python/libmuscle/test/conftest.py @@ -1,5 +1,6 @@ from copy import copy import pytest +from typing import Dict, List, Set, Union from unittest.mock import patch from ymmsl import Operator, Reference, Settings @@ -8,6 +9,7 @@ from libmuscle.communicator import Message from libmuscle.mcp.transport_client import ProfileData from libmuscle.mmp_client import MMPClient +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources, Resources from libmuscle.port import Port from libmuscle.profiler import Profiler from libmuscle.timestamp import Timestamp @@ -99,3 +101,22 @@ def port_exists(name): port_manager.list_ports.return_value = declared_ports port_manager.port_exists = port_exists return port_manager + + +def core(hwthread_id: int) -> Core: + """Helper that defines a core with the given core and hwthread id.""" + return Core(hwthread_id, {hwthread_id}) + + +def on_node_resources(node_name: str, cores: Union[int, Set[int]]) -> OnNodeResources: + """Helper that defines resources on a node from the name and a CPU core.""" + if isinstance(cores, int): + cores = {cores} + return OnNodeResources(node_name, CoreSet([Core(core, {core}) for core in cores])) + + +def resources(node_resources: Dict[str, List[Core]]) -> Resources: + """Helper that defines a Resources from a dict.""" + return Resources([ + OnNodeResources(node_name, CoreSet(cores)) + for node_name, cores in node_resources.items()]) diff --git a/muscle3/muscle3.py b/muscle3/muscle3.py index 04a8c3a8..b1373bc9 100644 --- a/muscle3/muscle3.py +++ b/muscle3/muscle3.py @@ -8,8 +8,8 @@ from ymmsl import PartialConfiguration -from libmuscle.planner.planner import ( - Planner, Resources, InsufficientResourcesAvailable) +from libmuscle.planner.planner import Planner, InsufficientResourcesAvailable +from libmuscle.planner.resources import Core, CoreSet, OnNodeResources, Resources from libmuscle.snapshot_manager import SnapshotManager from muscle3.profiling import ( plot_instances, plot_resources, plot_timeline, show_plots) @@ -138,7 +138,10 @@ def resources( click.echo(_RESOURCES_INCOMPLETE_MODEL, err=True) sys.exit(1) - resources = Resources({'node000001': set(range(cores_per_node))}) + resources = Resources([ + OnNodeResources( + 'node000001', CoreSet([Core(i, {i}) for i in range(cores_per_node)]))]) + planner = Planner(resources) try: allocations = planner.allocate_all(config, True) diff --git a/scripts/gmake/check_tools.make b/scripts/gmake/check_tools.make index 0adc8ff0..51113dab 100644 --- 
a/scripts/gmake/check_tools.make +++ b/scripts/gmake/check_tools.make @@ -67,6 +67,8 @@ tool_command := mpi$(CXX) include $(TOOLDIR)/detect_tool.make tool_command := mpic++ include $(TOOLDIR)/detect_tool.make +tool_command := mpicxx +include $(TOOLDIR)/detect_tool.make ifndef MPICXX $(info - No MPI C++ compiler found! Maybe there's no MPI installed?) diff --git a/setup.py b/setup.py index a8d3fda7..d31fa790 100644 --- a/setup.py +++ b/setup.py @@ -57,6 +57,7 @@ 'matplotlib>=3,<4', 'msgpack>=1,<2', 'psutil>=5.0.0', + 'parsimonious', "numpy>=1.22", 'qcg-pilotjob==0.13.1', 'typing_extensions>=4.4.0,<5', diff --git a/tox.ini b/tox.ini index 3f627cf5..970daf2d 100644 --- a/tox.ini +++ b/tox.ini @@ -4,10 +4,13 @@ skip_missing_interpreters = true [testenv] deps = - mypy + cerulean # not actually used for these non-cluster tests flake8 + mypy pytest pytest-cov + requests # missing dependency in cerulean... + types-parsimonious types-psutil ymmsl @@ -22,6 +25,25 @@ commands = pytest {posargs} flake8 libmuscle/python/libmuscle integration_test scripts/ +[testenv:cluster] +deps = + cerulean + docker + pytest + pytest-cov + requests # missing dependency in cerulean... + types-parsimonious + types-psutil + ymmsl + +setenv = + MUSCLE_TEST_CLUSTER=1 + +commands = + pytest -k 'test_cluster' --log-disable=paramiko.transport {posargs} + # pytest --log-cli-level=DEBUG --log-disable=paramiko.transport --log-disable=paramiko.transport.sftp --log-disable=cerulean.copy_files -s -k 'test_cluster' {posargs} + + [gh-actions] python = 3.8: py38
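
The changes above replace the planner's flat Resources mapping with the Core / CoreSet / OnNodeResources / Resources classes from libmuscle.planner.resources, and have Planner.allocate_all() return a per-rank ResourceAssignment instead of a plain Resources object. The sketch below is distilled from the tests in this diff and shows how those pieces are meant to fit together; the model name, component name and the 2-thread requirement are invented for illustration, and the cores printed at the end reflect what the test assertions lead one to expect rather than a guaranteed output.

# Minimal usage sketch of the new resources/planner API, assuming only the
# constructors and methods exercised by the tests in this diff.
from typing import Dict

from ymmsl import (
        Component, Configuration, Implementation, Model, Ports, Reference,
        ResourceRequirements, ThreadedResReq)

from libmuscle.planner.planner import Planner
from libmuscle.planner.resources import Core, CoreSet, OnNodeResources, Resources


# One node with four single-hwthread cores, like the fixtures in the tests
res = Resources([
        OnNodeResources('node001', CoreSet([Core(i, {i}) for i in range(4)]))])

# A trivial model: one threaded component that wants two cores
model = Model('example', [Component('x', 'x', ports=Ports())])
impl = [Implementation(Reference('x'), script='x')]
reqs: Dict[Reference, ResourceRequirements] = {
        Reference('x'): ThreadedResReq(Reference('x'), 2)}
config = Configuration(model, None, impl, reqs)

# allocate_all() now yields a ResourceAssignment per instance, holding one
# OnNodeResources entry per MPI rank (a single entry for a threaded instance)
allocations = Planner(res).allocate_all(config)
asm = allocations[Reference('x')]

print(len(asm.by_rank))                        # 1 rank for a threaded instance
print(asm.by_rank[0].total_cores())            # 2, matching the requirement
print(sorted(asm.as_resources().nodes()))      # ['node001']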