Rearrange tests to enable multiple SLURM and MPI versions
LourensVeen committed Nov 29, 2024
1 parent d761702 commit edb63ac
Showing 3 changed files with 171 additions and 109 deletions.
217 changes: 139 additions & 78 deletions integration_test/cluster_test/conftest.py
@@ -11,8 +11,11 @@
logger_ = logging.getLogger(__name__)


IMAGE_NAME = 'muscle3_test_cluster'

REMOTE_SHARED = '/home/cerulean/shared'

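# (index, Slurm version) pairs to test against; the index is used below to give each
# per-version fake cluster its own host-side SSH port for its head node.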
IDX_SLURM_VERSIONS = list(enumerate(['23-11']))

# Shut down the containers after running the tests. Set to False to debug.
CLEAN_UP_CONTAINERS = True
@@ -41,16 +44,20 @@ def local_fs():
return cerulean.LocalFileSystem()


@pytest.fixture(scope='session')
def repo_root(local_fs):
root_dir = Path(__file__).parents[2]
return local_fs / str(root_dir)


@pytest.fixture(scope='session')
def fake_cluster_image(local_term):
IMAGE_NAME = 'muscle3_test_cluster'
run_cmd(local_term, 5400, (
f'docker buildx build -t {IMAGE_NAME}'
' -f integration_test/fake_cluster/Dockerfile .'))
return IMAGE_NAME


def ssh_term(timeout_msg):
def ssh_term(port, timeout_msg):
cred = cerulean.PasswordCredential('cerulean', 'kingfisher')
ready = False
start = time.monotonic()
@@ -59,7 +66,7 @@ def ssh_term(timeout_msg):
raise Exception(timeout_msg)

try:
term = cerulean.SshTerminal('localhost', 10022, cred)
term = cerulean.SshTerminal('localhost', port, cred)
ready = True
except Exception:
time.sleep(3.0)
@@ -78,93 +85,55 @@ def shared_dir():

@pytest.fixture(scope='session')
def cleanup_docker(local_term):
for i in range(5):
node_name = f'node-{i}'
run_cmd(local_term, 60, f'docker rm -f {node_name}')
for _, slurm_version in IDX_SLURM_VERSIONS:
_clean_up_base_cluster(local_term, slurm_version)

run_cmd(local_term, 60, 'docker rm -f headnode')
run_cmd(local_term, 60, 'docker network rm -f muscle3-net')


@pytest.fixture(scope='session')
def fake_cluster_network(local_term, cleanup_docker):
name = 'muscle3-net'
def _create_network(local_term, slurm_version):
name = f'muscle3-net-{slurm_version}'
run_cmd(local_term, 60, f'docker network create {name}')
yield name

if CLEAN_UP_CONTAINERS:
run_cmd(local_term, 60, 'docker network rm -f muscle3-net')
return name


@pytest.fixture(scope='session')
def fake_cluster_nodes(
local_term, fake_cluster_image, fake_cluster_network, shared_dir):

node_names = list()

def _start_nodes(local_term, slurm_version, net_name, shared_dir):
for i in range(5):
node_name = f'node-{i}'
ssh_port = 10030 + i

run_cmd(local_term, 60, (
f'docker run -d --name={node_name} --hostname={node_name}'
f' --network={fake_cluster_network} -p {ssh_port}:22'
f' --cap-add=CAP_SYS_NICE'
f'docker run -d --name={node_name}-{slurm_version} --hostname={node_name}'
f' --network={net_name} --cap-add=CAP_SYS_NICE'
f' --env SLURM_VERSION={slurm_version}'
f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}'
f' {fake_cluster_image}'))

node_names.append(node_name)
f' {IMAGE_NAME}'))

yield None

if CLEAN_UP_CONTAINERS:
run_cmd(local_term, 60, f'docker rm -f {" ".join(node_names)}')


@pytest.fixture(scope='session')
def fake_cluster_headnode(
local_term, fake_cluster_image, fake_cluster_network, fake_cluster_nodes,
shared_dir):

def _start_headnode(local_term, slurm_version, net_name, shared_dir, headnode_port):
run_cmd(local_term, 60, (
'docker run -d --name=headnode --hostname=headnode'
f' --network={fake_cluster_network} -p 10022:22'
f'docker run -d --name=headnode-{slurm_version} --hostname=headnode'
f' --network={net_name} -p {headnode_port}:22'
f' --env SLURM_VERSION={slurm_version}'
f' --mount type=bind,source={shared_dir},target={REMOTE_SHARED}'
f' {fake_cluster_image}'))
f' {IMAGE_NAME}'))

ssh_term('Virtual cluster container start timed out')
yield None
ssh_term(headnode_port, 'Virtual cluster container start timed out')

if CLEAN_UP_CONTAINERS:
run_cmd(local_term, 60, 'docker rm -f headnode')

def _start_base_cluster(local_term, idx_slurm_version, shared_dir):
slurm_index, slurm_version = idx_slurm_version

@pytest.fixture(scope='session')
def setup_connection(fake_cluster_headnode):
# Session-wide connection used for container setup actions only
# Tests each have their own connection, see fake_cluster() below
term = ssh_term('Connection to virtual cluster container timed out')
with cerulean.SftpFileSystem(term, True) as fs:
yield term, fs

# We abuse this to clean up the contents of the shared directory.
# Because it's been made inside of the container, it has a different owner
# than what we're running with on the host, and the host user cannot remove
# the files.
if CLEAN_UP_CONTAINERS:
run_cmd(term, 60, f'rm -rf {REMOTE_SHARED}/*')
headnode_port = 10022 + slurm_index

net_name = _create_network(local_term, slurm_version)
_start_nodes(local_term, slurm_version, net_name, shared_dir)
_start_headnode(local_term, slurm_version, net_name, shared_dir, headnode_port)

@pytest.fixture(scope='session')
def repo_root(local_fs):
root_dir = Path(__file__).parents[2]
return local_fs / str(root_dir)
term = ssh_term(headnode_port, 'Connection to virtual cluster container timed out')
fs = cerulean.SftpFileSystem(term, False)

return term, fs, headnode_port

@pytest.fixture(scope='session')
def remote_source(repo_root, setup_connection):
remote_term, remote_fs = setup_connection

def _install_remote_source(repo_root, remote_term, remote_fs):
muscle3_tgt = remote_fs / 'home' / 'cerulean' / 'muscle3'
muscle3_tgt.mkdir()
(muscle3_tgt / 'libmuscle').mkdir()
@@ -178,10 +147,7 @@ def remote_source(repo_root, setup_connection):
return muscle3_tgt


@pytest.fixture(scope='session')
def muscle3_venv(repo_root, remote_source, setup_connection):
remote_term, remote_fs = setup_connection

def _create_muscle3_venv(remote_term, remote_source):
run_cmd(remote_term, 10, f'python3 -m venv {REMOTE_SHARED}/venv')
in_venv = f'source {REMOTE_SHARED}/venv/bin/activate && '

@@ -192,21 +158,116 @@ def muscle3_venv(repo_root, remote_source, setup_connection):
return in_venv


@pytest.fixture(scope='session')
def muscle3_native_openmpi(remote_source, setup_connection):
remote_term, remote_fs = setup_connection

def _install_muscle3_native_openmpi(
remote_source, remote_term, remote_fs, slurm_version):
prefix = remote_fs / REMOTE_SHARED / 'muscle3-openmpi'
prefix.mkdir()

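# Find the hash of the OpenMPI package whose dependencies include this Slurm
# version, so that the matching module can be loaded below.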
openmpi_hash = run_cmd(remote_term, 600, (
'/bin/bash -c "'
'for phash in $(/opt/spack/bin/spack find --format \\"{hash}\\" openmpi'
' | tr \'\\n\' \' \') ; do'
' if /opt/spack/bin/spack find --deps /\\${phash} |'
f' grep -q slurm@{slurm_version} ; then'
' echo \\${phash} ;'
' fi ;'
'done'
'"'))

openmpi_version = run_cmd(remote_term, 600, (
'/bin/bash -c "'
f'/opt/spack/bin/spack find --format \\"{{version}}\\" /{openmpi_hash}'
'"')).strip()

module_name = f'openmpi/{openmpi_version}-gcc-11.4.0-{openmpi_hash[:7]}'

logger_.info(f'Slurm {slurm_version} and module {module_name}')

run_cmd(remote_term, 600, (
f'/bin/bash -l -c "'
f'module load openmpi && '
f'module load {module_name} && '
f'cd {remote_source} && '
f'make distclean && '
f'PREFIX={prefix} make install"'))

return prefix
return prefix, module_name


def _install_muscle3(repo_root, remote_term, remote_fs, slurm_version):
remote_source = _install_remote_source(repo_root, remote_term, remote_fs)
in_venv = _create_muscle3_venv(remote_term, remote_source)
return _install_muscle3_native_openmpi(
remote_source, remote_term, remote_fs, slurm_version)


def _install_tests(repo_root, remote_term, remote_fs, remote_m3_openmpi):
remote_home = remote_fs / REMOTE_SHARED
remote_m3, openmpi_module = remote_m3_openmpi

cerulean.copy(
repo_root / 'integration_test' / 'cluster_test', remote_home,
copy_permissions=True)

remote_source = remote_home / 'cluster_test'

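# Point the 'modules:' settings in the ymmsl files at the OpenMPI module that was
# built against this Slurm version.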
run_cmd(remote_term, 30, (
'/bin/bash -c "'
f'sed -i \\"s^modules: openmpi^modules: {openmpi_module}^\\"'
f' {remote_source}/implementations_openmpi.ymmsl'
'"'))

run_cmd(remote_term, 30, (
'/bin/bash -c "'
f'sed -i \\"s^modules: openmpi^modules: {openmpi_module}^\\"'
f' {remote_source}/implementations_srunmpi.ymmsl'
'"'))

run_cmd(remote_term, 30, (
f'/bin/bash -l -c "'
f'module load {openmpi_module} && '
f'. {remote_m3}/bin/muscle3.env && '
f'make -C {remote_source}"'))


def _clean_up_base_cluster(local_term, slurm_version):
node_names = [f'node-{i}-{slurm_version}' for i in range(5)]
run_cmd(local_term, 60, f'docker rm -f {" ".join(node_names)}')

run_cmd(local_term, 60, f'docker rm -f headnode-{slurm_version}')

net_name = f'muscle3-net-{slurm_version}'
run_cmd(local_term, 60, f'docker network rm -f {net_name}')


@pytest.fixture(scope='session', params=IDX_SLURM_VERSIONS)
def installed_cluster(
request, cleanup_docker, fake_cluster_image, shared_dir,
repo_root, local_term):

slurm_version = request.param[1]
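# Give each Slurm version its own shared directory; 0o1777 (world-writable with the
# sticky bit set, like /tmp) lets the container users create files in it.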
local_shared_dir = shared_dir / slurm_version
local_shared_dir.mkdir()
local_shared_dir.chmod(0o1777)

remote_term, remote_fs, headnode_port = _start_base_cluster(
local_term, request.param, local_shared_dir)
remote_m3_openmpi = _install_muscle3(
repo_root, remote_term, remote_fs, slurm_version)
_install_tests(repo_root, remote_term, remote_fs, remote_m3_openmpi)

yield headnode_port

# Because it's been made inside of the container, the shared directory has a
# different owner than what we're running with on the host, and the host user cannot
# remove the files. So we do it here from inside the container.
if CLEAN_UP_CONTAINERS:
run_cmd(remote_term, 60, f'rm -rf {REMOTE_SHARED}/*')

remote_fs.close()
remote_term.close()

if CLEAN_UP_CONTAINERS:
_clean_up_base_cluster(local_term, slurm_version)


@pytest.fixture(scope='session')
31 changes: 3 additions & 28 deletions integration_test/cluster_test/test_cluster.py
@@ -9,35 +9,10 @@
logger_ = logging.getLogger(__name__)


@pytest.fixture(scope='session')
def copy_test_files(repo_root, setup_connection):
remote_term, remote_fs = setup_connection
remote_home = remote_fs / REMOTE_SHARED

cerulean.copy(
repo_root / 'integration_test' / 'cluster_test', remote_home,
copy_permissions=True)

return remote_home / 'cluster_test'


@pytest.fixture(scope='session')
def build_native_components(
muscle3_native_openmpi, setup_connection, copy_test_files):
remote_term, remote_fs = setup_connection
remote_source = copy_test_files

run_cmd(remote_term, 30, (
f"/bin/bash -l -c '"
f"module load openmpi && "
f". {muscle3_native_openmpi}/bin/muscle3.env && "
f"make -C {remote_source}'"))


@pytest.fixture
def fake_cluster(
fake_cluster_headnode, muscle3_venv, build_native_components, copy_test_files):
term = ssh_term('Connection to virtual cluster container timed out')
def fake_cluster(installed_cluster):
headnode_port = installed_cluster
term = ssh_term(headnode_port, 'Connection to virtual cluster container timed out')
with cerulean.SftpFileSystem(term, True) as fs:
local_sched = cerulean.DirectGnuScheduler(term)
slurm_sched = cerulean.SlurmScheduler(term)
32 changes: 29 additions & 3 deletions integration_test/fake_cluster/Dockerfile
@@ -1,9 +1,35 @@
FROM ghcr.io/naturalhpc/cerulean-fake-slurm-23-11:latest
# FROM naturalhpc/cerulean-fake-slurm-23-11:latest
FROM ghcr.io/naturalhpc/cerulean-fake-slurm-base:latest
# FROM naturalhpc/cerulean-fake-slurm-base:latest

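# Build one OpenMPI per Slurm release, each linked against that release's slurm and
# pmix packages from the base image.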
RUN . /opt/spack/share/spack/setup-env.sh && \
. $(spack location -i lmod)/lmod/lmod/init/bash && \
spack install openmpi+legacylaunchers+pmi schedulers=slurm ^[email protected] ^slurm/dckfty
spack install [email protected] +legacylaunchers +pmi schedulers=slurm \
^$(spack find --deps slurm@20-11 | grep pmix | tr -d ' ') \
^$(spack find --format "slurm/{hash}" slurm@20-11)

RUN . /opt/spack/share/spack/setup-env.sh && \
. $(spack location -i lmod)/lmod/lmod/init/bash && \
spack install [email protected] +legacylaunchers +pmi schedulers=slurm \
^$(spack find --deps slurm@21-08 | grep pmix | tr -d ' ') \
^$(spack find --format "slurm/{hash}" slurm@21-08)

RUN . /opt/spack/share/spack/setup-env.sh && \
. $(spack location -i lmod)/lmod/lmod/init/bash && \
spack install [email protected] +legacylaunchers +pmi schedulers=slurm \
^$(spack find --deps slurm@22-05 | grep pmix | tr -d ' ') \
^$(spack find --format "slurm/{hash}" slurm@22-05)

RUN . /opt/spack/share/spack/setup-env.sh && \
. $(spack location -i lmod)/lmod/lmod/init/bash && \
spack install [email protected] +legacylaunchers +pmi schedulers=slurm \
^$(spack find --deps slurm@23-02 | grep pmix | tr -d ' ') \
^$(spack find --format "slurm/{hash}" slurm@23-02)

RUN . /opt/spack/share/spack/setup-env.sh && \
. $(spack location -i lmod)/lmod/lmod/init/bash && \
spack install [email protected] +legacylaunchers +pmi schedulers=slurm \
^$(spack find --deps slurm@23-11 | grep pmix | tr -d ' ') \
^$(spack find --format "slurm/{hash}" slurm@23-11)

# RUN . /opt/spack/share/spack/setup-env.sh && \
# . $(spack location -i lmod)/lmod/lmod/init/bash && \
