From dceff8f4154381db28065200c1ec8fdc32bba7be Mon Sep 17 00:00:00 2001 From: Nikhil Date: Sat, 12 Aug 2023 09:35:22 -0700 Subject: [PATCH] intel/ci: Add code changes to enable weekly job options for mpichsuite. Signed-off-by: Nikhil Nanal --- contrib/intel/jenkins/Jenkinsfile | 458 +++--------------------------- contrib/intel/jenkins/build.py | 41 ++- contrib/intel/jenkins/run.py | 11 +- contrib/intel/jenkins/runtests.py | 4 +- contrib/intel/jenkins/tests.py | 147 +++++++--- 5 files changed, 186 insertions(+), 475 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 47574c69929..55d9470a840 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -1,12 +1,12 @@ import groovy.transform.Field -properties([disableConcurrentBuilds(abortPrevious: true)]) +//properties([disableConcurrentBuilds(abortPrevious: true)]) @Field def DO_RUN=true @Field def TARGET="main" @Field def SCRIPT_LOCATION="py_scripts/contrib/intel/jenkins" @Field def RELEASE=false @Field def BUILD_MODES=["reg", "dbg", "dl"] -@Field def MPI_TYPES=["impi", "mpich", "ompi"] +@Field def MPI_TYPES=["mpich"] @Field def PYTHON_VERSION="3.9" def run_python(version, command, output=null) { @@ -31,7 +31,7 @@ def slurm_batch(partition, node_num, output, command) { def run_fabtests(stage_name, partition, node_num, prov, util=null, user_env=null) { - def command = "python3.9 ${RUN_LOCATION}/runtests.py" + def command = "python3.9 runtests.py" def opts = "--prov=${prov} --test=fabtests" if (util) opts = "${opts} --util=${util}" @@ -51,7 +51,7 @@ def run_fabtests(stage_name, partition, node_num, prov, util=null, def run_middleware(providers, stage_name, test, partition, node_num, mpi=null, imb_grp=null) { - def base_cmd = "python3.9 ${RUN_LOCATION}/runtests.py --test=${test}" + def base_cmd = "python3.9 runtests.py --test=${test}" def opts = "" def prefix = "${env.LOG_DIR}/${stage_name}_" def suffix = "_${test}_reg" @@ -62,6 +62,8 @@ def run_middleware(providers, stage_name, test, partition, node_num, mpi=null, if (imb_grp) base_cmd = "${base_cmd} --imb_grp=${imb_grp}" + if (env.WEEKLY.toBoolean()) + base_cmd = "${base_cmd} --weekly=${env.WEEKLY}" for (prov in providers) { if (prov[1]) { @@ -89,17 +91,14 @@ def gather_logs(cluster, key, dest, source) { } } -def summarize(item, verbose=false, release=false, send_mail=false) { - def cmd = "${RUN_LOCATION}/summary.py --summary_item=all" +def summarize(item, verbose=false, release=false) { + def cmd = "${env.WORKSPACE}/${SCRIPT_LOCATION}/summary.py --summary_item=all" if (verbose) { cmd = "${cmd} -v " } if (release) { cmd = "${cmd} --release " } - if (send_mail.toBoolean()) { - cmd = "${cmd} --send_mail " - } run_python(PYTHON_VERSION, cmd) } @@ -131,24 +130,18 @@ def checkout_py_scripts() { """ } -def build(item, mode=null, cluster=null, release=false, additional_args=null) { - def cmd = "${RUN_LOCATION}/build.py --build_item=${item}" +def build(item, mode=null, cluster=null, release=false) { + def cmd = "${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=${item}" if (mode) { cmd = "${cmd} --ofi_build_mode=${mode} " } - if (cluster) { cmd = "${cmd} --build_cluster=${cluster} " } - if (release) { cmd = "${cmd} --release " } - if (additional_args) { - cmd = "${cmd} ${additional_args} " - } - run_python(PYTHON_VERSION, cmd) } @@ -258,206 +251,59 @@ pipeline { } } } + stage ('prepare build') { + when { equals expected: true, actual: DO_RUN } + steps { + script { + echo "Copying build dirs." + build("builddir") + echo "Copying log dirs." + build("logdir", null, null, RELEASE) + build("mpich") + } + } + } stage ('parallel-builds') { when { equals expected: true, actual: DO_RUN } parallel { - stage ('build') { - steps { - script { - dir (CUSTOM_WORKSPACE) { - echo "Copying build dirs." - build("builddir") - echo "Copying log dirs." - build("logdir", null, null, RELEASE) - for (mode in BUILD_MODES) { - echo "Building Libfabric $mode" - build("libfabric", "$mode") - echo "Building Fabtests $mode" - build("fabtests", "$mode") - } - } - } - } - } - stage ('build-daos') { - agent { - node { - label 'daos_head' - customWorkspace CUSTOM_WORKSPACE - } - } + stage ('build-libfabric') { steps { script { - checkout_py_scripts() - dir (CUSTOM_WORKSPACE) { - build("logdir") - build("libfabric", "reg", "daos") - build("fabtests", "reg") + for (mode in BUILD_MODES) { + echo "Building Libfabric $mode" + build("libfabric", "$mode") + echo "Building Fabtests $mode" + build("fabtests", "$mode") } } } } - stage ('build-gpu') { - agent { - node { - label 'ze' - customWorkspace CUSTOM_WORKSPACE - } - } + stage ('buildmpich-libfabric') { steps { script { - checkout_py_scripts() - dir (CUSTOM_WORKSPACE) { - build("logdir") - build("builddir") - build("libfabric", "reg", "gpu") - build("fabtests", "reg") + dir("${CUSTOM_WORKSPACE}/mpich"){ + checkout scm + echo "Building Libfabric reg" + slurm_batch("squirtle,totodile", "1", + "${env.LOG_DIR}/libfabric_mpich_log", + """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ + --build_item=libfabric --build_cluster=mpich """ + ) } } } } } } + stage('parallel-tests') { when { equals expected: true, actual: DO_RUN } parallel { - stage('MPI_verbs-rxm_IMB') { + stage('mpichtestsuite_tcp') { steps { script { dir (RUN_LOCATION) { - def providers = [["verbs", "rxm"]] - for (mpi in MPI_TYPES) { - for (imb_grp = 1; imb_grp < 4; imb_grp++) { - run_middleware(providers, "MPI", "IMB", - "squirtle,totodile", "2", "${mpi}", - "${imb_grp}") - } - } - } - } - } - } - stage('MPI_verbs-rxm_OSU') { - steps { - script { - dir (RUN_LOCATION) { - def providers = [["verbs", "rxm"]] - for (mpi in MPI_TYPES) { - run_middleware(providers, "MPI", "osu", "squirtle,totodile", - "2", "${mpi}") - } - } - } - } - } - stage('MPI_tcp-rxm') { - steps { - script { - dir (RUN_LOCATION) { - def providers = [["tcp", "rxm"]] - for (mpi in MPI_TYPES) { - for (imb_grp = 1; imb_grp < 4; imb_grp++) { - run_middleware(providers, "MPI", "IMB", - "bulbasaur", "2", "${mpi}", "${imb_grp}") - } - run_middleware(providers, "MPI", "osu", "bulbasaur", "2", - "${mpi}") - } - } - } - } - } - stage('tcp') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("tcp", "bulbasaur", "2", "tcp") - } - } - } - } - stage('verbs-rxm') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("verbs-rxm", "squirtle,totodile", "2", "verbs", - "rxm") - } - } - } - } - stage('verbs-rxd') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("verbs-rxd", "squirtle", "2", "verbs", - "rxd") - } - } - } - } - stage('udp') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("udp", "bulbasaur", "2", "udp") - } - } - } - } - stage('shm') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("shm", "bulbasaur", "1", "shm") - run_fabtests("shm", "bulbasaur", "1", "shm", null, - "FI_SHM_DISABLE_CMA=1") - } - } - } - } - stage('sockets') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("sockets", "bulbasaur", "2", "sockets") - } - } - } - } - stage('ucx') { - steps { - script { - dir (CUSTOM_WORKSPACE) { - for (mode in BUILD_MODES) { - echo "Building Libfabric $mode" - build("libfabric", "${mode}", null, false, "--ucx") - echo "Building Fabtests $mode" - build("fabtests", "${mode}", null, false, "--ucx") - } - } - dir (RUN_LOCATION) { - run_fabtests("ucx", "totodile", "2", "ucx") - } - } - } - } - stage('psm3') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("psm3", "squirtle", "2", "psm3", null, - "PSM3_IDENTIFY=1") - } - } - } - } - stage('mpichtestsuite') { - steps { - script { - dir (RUN_LOCATION) { - def providers = [["verbs", "rxm"], ["tcp", null], - ["tcp", "rxm"], ["sockets", null]] + def providers = [["tcp", null],["verbs","rxm"]] for (mpi in MPI_TYPES) { run_middleware(providers, "mpichtestsuite", "mpichtestsuite", "squirtle,totodile", "2", "${mpi}") @@ -466,229 +312,7 @@ pipeline { } } } - stage('SHMEM') { - steps { - script { - dir (RUN_LOCATION) { - run_middleware([["verbs", null], ["tcp", null], - ["sockets", null]], "SHMEM", "shmem", - "squirtle,totodile", "2") - } - } - } - } - stage ('multinode_performance') { - steps { - script { - dir (RUN_LOCATION) { - run_middleware([["tcp", null]], "multinode_performance", - "multinode", "bulbasaur", "2") - } - } - } - } - stage ('oneCCL') { - steps { - script { - dir (RUN_LOCATION) { - run_middleware([["tcp", "rxm"]/*, ["psm3", null]*/], "oneCCL", - "oneccl", "bulbasaur", "2") - } - } - } - } - stage ('oneCCL-GPU') { - steps { - script { - dir (RUN_LOCATION) { - run_middleware([["verbs", "rxm"]], "oneCCL-GPU", "onecclgpu", - "charmander", "2") - } - } - } - } - stage ('oneCCL-GPU-v3') { - agent { node { label 'ze' } } - options { skipDefaultCheckout() } - steps { - script { - dir (RUN_LOCATION) { - run_middleware([["verbs", "rxm"]], "oneCCL-GPU-v3", "onecclgpu", - "fabrics-ci", "2") - } - } - } - } - stage('daos_tcp') { - agent { node { label 'daos_tcp' } } - options { skipDefaultCheckout() } - steps { - script { - dir (RUN_LOCATION) { - run_python(PYTHON_VERSION, - """runtests.py --prov='tcp' --util='rxm' \ - --test=daos \ - --log_file=${env.LOG_DIR}/daos_tcp-rxm_reg""") - } - } - } - } - stage('daos_verbs') { - agent { node { label 'daos_verbs' } } - options { skipDefaultCheckout() } - steps { - script { - dir (RUN_LOCATION) { - run_python(PYTHON_VERSION, - """runtests.py --prov='verbs' --util='rxm' \ - --test=daos \ - --log_file=${env.LOG_DIR}/daos_verbs-rxm_reg""") - } - } - } - } - stage ('DMABUF-Tests') { - agent { node { label 'ze' } } - options { skipDefaultCheckout() } - steps { - script { - dir ("${env.WORKSPACE}/${SCRIPT_LOCATION}/") { - output = "${LOG_DIR}/DMABUF-Tests_verbs-rxm_dmabuf_reg" - cmd = """ python3.9 runtests.py --test=dmabuf \ - --prov=verbs --util=rxm""" - slurm_batch("fabrics-ci", "1", "${output}", "${cmd}") - } - } - } - } - stage ('ze-shm') { - steps { - script { - dir (RUN_LOCATION) { - def providers = [["shm", null]] - def directions = ["h2d", "d2d", "xd2d"] - def base_cmd = "python3.9 runtests.py --device=ze" - def prefix = "${env.LOG_DIR}/ze_" - def suffix = "_reg" - for (prov in providers) { - for (way in directions) { - if (prov[1]) { - echo "Running ${prov[0]}-${prov[1]} ze" - slurm_batch("charmander", "1", - "${prefix}${prov[0]}-${prov[1]}_${way}${suffix}", - """${base_cmd} --prov=${prov[0]} \ - --util=${prov[1]} --way=${way}""") - } else { - echo "Running ${prov[0]} ze" - slurm_batch("charmander", "1", - "${prefix}${prov[0]}_${way}${suffix}", - "${base_cmd} --prov=${prov[0]} --way=${way}") - } - } - } - } - } - } - } - stage ('ze-shm-v3') { - agent { node { label 'ze' } } - options { skipDefaultCheckout() } - steps { - script { - dir (RUN_LOCATION) { - def providers = [["shm", null]] - def directions = ["h2d", "d2d", "xd2d"] - def base_cmd = "python3.9 runtests.py --device=ze" - def prefix = "${env.LOG_DIR}/ze_v3_" - def suffix = "_reg" - for (prov in providers) { - for (way in directions) { - if (prov[1]) { - echo "Running ${prov[0]}-${prov[1]} ze" - slurm_batch("fabrics-ci", "1", - "${prefix}${prov[0]}-${prov[1]}_${way}${suffix}", - """${base_cmd} --prov=${prov[0]} \ - --util=${prov[1]} --way=${way}""") - } else { - echo "Running ${prov[0]} ze" - slurm_batch("fabrics-ci", "1", - "${prefix}${prov[0]}_${way}${suffix}", - "${base_cmd} --prov=${prov[0]} --way=${way}") - } - } - } - } - } - } - } - stage('dsa') { - when { equals expected: true, actual: DO_RUN } - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("shm_dsa", "mudkip", "1", "shm", null, - """FI_SHM_DISABLE_CMA=1 FI_SHM_USE_DSA_SAR=1 \ - FI_LOG_LEVEL=warn""") - } - } - } - } - } - } - stage ('Summary') { - when { equals expected: true, actual: DO_RUN } - steps { - script { - gather_logs("${env.DAOS_ADDR}", "${env.DAOS_KEY}", "${env.LOG_DIR}", - "${env.LOG_DIR}") - gather_logs("${env.ZE_ADDR}", "${env.ZE_KEY}", "${env.LOG_DIR}", - "${env.LOG_DIR}") - - summarize("all", verbose=false, release=RELEASE, - send_mail=env.WEEKLY.toBoolean()) - if (RELEASE) { - save_summary() - } - } - } - } - } - - post { - always { - script { - summarize("all") - } - } - success { - script { - summarize("all", verbose=true, release=false, - send_mail=env.WEEKLY.toBoolean()) - } - } - aborted { - node ('daos_head') { - dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } - } - node ('ze') { - dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } - } - dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } - } - cleanup { - node ('daos_head') { - dir ("${DELETE_LOCATION}") { deleteDir() } - dir("${env.WORKSPACE}") { deleteDir() } - dir("${env.WORKSPACE}@tmp") { deleteDir() } - } - node ('ze') { - dir("${DELETE_LOCATION}") { deleteDir() } - dir("${env.WORKSPACE}") { deleteDir() } - dir("${env.WORKSPACE}@tmp") { deleteDir() } - } - dir("${DELETE_LOCATION}") { deleteDir() } - dir("${env.WORKSPACE}") { deleteDir() } - dir("${env.WORKSPACE}@tmp") { deleteDir() } + } } } } diff --git a/contrib/intel/jenkins/build.py b/contrib/intel/jenkins/build.py index f63d1f0eded..a600cff4894 100755 --- a/contrib/intel/jenkins/build.py +++ b/contrib/intel/jenkins/build.py @@ -69,6 +69,30 @@ def build_fabtests(libfab_install_path, mode): common.run_command(['make', '-j32']) common.run_command(['make', 'install']) +def extract_mpich(mpitype): + + if (mpitype == 'mpich'): + src_dir = 'mpich' + dest = 'mpich_temp' + mpich_tar = 'mpich-4.1.tar.gz' + elif (mpitype == 'impi'): + src_dir = 'impi_mpichtest' + dest = 'impi_mpichtest' + mpich_tar = 'mpich-test.tar.gz' + else: + print(f"Invalid mpi type {mpitype}") + exit + + cwd = os.getcwd() + if (os.path.exists(f'{cloudbees_config.build_dir}/{dest}/') == True): + common.run_command(['rm','-rf', f'{cloudbees_config.build_dir}/{dest}/']) + os.makedirs(f'{cloudbees_config.build_dir}/{dest}') + shutil.copy(f'{cloudbees_config.scm_dir}/{src_dir}/{mpich_tar}', + f'{cloudbees_config.build_dir}/{dest}/') + os.chdir(f'{cloudbees_config.build_dir}/{dest}/') + common.run_command(['tar','-xvf', f'{cloudbees_config.build_dir}/{dest}/{mpich_tar}']) + os.chdir(cwd) + def copy_build_dir(install_path): middlewares_path = f'{install_path}/middlewares' if (os.path.exists(middlewares_path) != True): @@ -78,9 +102,6 @@ def copy_build_dir(install_path): f'{middlewares_path}/shmem') shutil.copytree(f'{cloudbees_config.build_dir}/oneccl', f'{middlewares_path}/oneccl') - - os.symlink(f'{cloudbees_config.build_dir}/mpich', - f'{middlewares_path}/mpich') os.symlink(f'{cloudbees_config.build_dir}/impi', f'{middlewares_path}/impi') os.symlink(f'{cloudbees_config.build_dir}/ompi', @@ -112,13 +133,13 @@ def log_dir(install_path, release=False): parser = argparse.ArgumentParser() parser.add_argument('--build_item', help="build libfabric or fabtests", - choices=['libfabric', 'fabtests', 'builddir', 'logdir']) + choices=['libfabric', 'fabtests', 'builddir', 'logdir','mpich', 'impi_mpich']) parser.add_argument('--ofi_build_mode', help="select buildmode libfabric "\ "build mode", choices=['reg', 'dbg', 'dl']) parser.add_argument('--build_cluster', help="build libfabric on specified cluster", \ - choices=['daos', 'gpu'], default='default') + choices=['daos', 'dsa', 'gpu','mpich'], default='default') parser.add_argument('--release', help="This job is likely testing a "\ "release and will be checked into a git tree.", action='store_true') @@ -145,11 +166,19 @@ def log_dir(install_path, release=False): p = re.compile('mpi*') if (build_item == 'libfabric'): - build_libfabric(libfab_install_path, ofi_build_mode, cluster, ucx) + if (cluster == 'mpich'): + libfab_install_path += "/libfabric_mpich" + build_libfabric(libfab_install_path, ofi_build_mode, cluster) + else: + build_libfabric(libfab_install_path, ofi_build_mode, cluster, ucx) elif (build_item == 'fabtests'): build_fabtests(libfab_install_path, ofi_build_mode) + elif (build_item == 'mpich'): + extract_mpich('mpich') + elif (build_item == 'impi_mpich'): + extract_mpich('impi') elif (build_item == 'builddir'): copy_build_dir(install_path) diff --git a/contrib/intel/jenkins/run.py b/contrib/intel/jenkins/run.py index 70fc7dfdc9f..a53ab755727 100755 --- a/contrib/intel/jenkins/run.py +++ b/contrib/intel/jenkins/run.py @@ -126,18 +126,21 @@ def intel_mpi_benchmark(core, hosts, mpi, mode, group, user_env, log_file, util) print(f"Skipping {mpi.upper} {imb.testname} as execute condition fails") print('-------------------------------------------------------------------') -def mpich_test_suite(core, hosts, mpi, mode, user_env, log_file, util): +def mpich_test_suite(core, hosts, mpi, mode, user_env, log_file, util, weekly=None): mpich_tests = tests.MpichTestSuite(jobname=jbname,buildno=bno, testname="MpichTestSuite",core_prov=core, fabric=fab, mpitype=mpi, hosts=hosts, ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) + log_file=log_file, util_prov=util, weekly=weekly) print('-------------------------------------------------------------------') if (mpich_tests.execute_condn == True): - print(f"Running mpichtestsuite: Spawn Tests for {core}-{util}-{fab}-{mpi}") - mpich_tests.execute_cmd("spawn") + print(f"Running mpichtestsuitefor {core}-{util}-{fab}-{mpi}") + if (mpi == "mpich"): + print("Building mpich") + mpich_tests.build_mpich() + mpich_tests.execute_cmd() else: print(f"Skipping {mpi.upper()} {mpich_tests.testname} as exec condn fails") print('-------------------------------------------------------------------') diff --git a/contrib/intel/jenkins/runtests.py b/contrib/intel/jenkins/runtests.py index a12e1ae0a2c..02faf6aad69 100755 --- a/contrib/intel/jenkins/runtests.py +++ b/contrib/intel/jenkins/runtests.py @@ -37,6 +37,7 @@ def __call__(self, parser, namespace, values, option_string=None): choices=['impi', 'mpich', 'ompi'], default='impi') parser.add_argument('--log_file', help="Full path to log file", default=os.environ['DEFAULT_LOG_LOCATION'], type=str) +parser.add_argument('--weekly', help="run weekly", default=False, type=bool) args = parser.parse_args() args_core = args.prov @@ -45,6 +46,7 @@ def __call__(self, parser, namespace, values, option_string=None): args_device = args.device user_env = args.user_env log_file = args.log_file +weekly = args.weekly if (args.ofi_build_mode): ofi_build_mode = args.ofi_build_mode @@ -131,7 +133,7 @@ def __call__(self, parser, namespace, values, option_string=None): if (run_test == 'all' or run_test == 'mpichtestsuite'): run.mpich_test_suite(args_core, hosts, mpi, ofi_build_mode, user_env, log_file, - args_util) + args_util, weekly) if (run_test == 'all' or run_test == 'IMB'): run.intel_mpi_benchmark(args_core, hosts, mpi, diff --git a/contrib/intel/jenkins/tests.py b/contrib/intel/jenkins/tests.py index 040ce449f7a..59b2acf467f 100755 --- a/contrib/intel/jenkins/tests.py +++ b/contrib/intel/jenkins/tests.py @@ -1,6 +1,6 @@ import sys import os - +import io sys.path.append(os.environ['CLOUDBEES_CONFIG']) import subprocess @@ -451,11 +451,11 @@ class MPICH: def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, server, client, environ, middlewares_path, util_prov=None): - self.mpich_src = f'{middlewares_path}/mpich' + self.mpich_src = f'{cloudbees_config.build_dir}/mpich_temp' self.core_prov = core_prov self.hosts = hosts self.util_prov = util_prov - self.libfab_installpath = libfab_installpath + self.libfab_installpath = f'{libfab_installpath}/libfabric_mpich' self.nw_interface = nw_interface self.server = server self.client = client @@ -472,7 +472,7 @@ def env(self): cmd += f"export FI_PROVIDER={self.core_prov}; " cmd += "export I_MPI_FABRICS=ofi; " cmd += "export MPIR_CVAR_CH4_OFI_ENABLE_ATOMICS=0; " - cmd += "export MPIR_CVAR_CH4_OFI_CAPABILITY_SETS_DEBUG=1; " + cmd += "export MPIR_CVAR_CH4_OFI_CAPABILITY_SETS_DEBUG=0; " cmd += f"export LD_LIBRARY_PATH={self.mpich_src}/lib:$LD_LIBRARY_PATH; " cmd += f"export LD_LIBRARY_PATH={self.libfab_installpath}/lib/:"\ "$LD_LIBRARY_PATH; " @@ -518,10 +518,12 @@ def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, def env(self): cmd = f"bash -c \'source {self.impi_src}/env/vars.sh "\ "-i_mpi_ofi_internal=0; " + cmd += f"source {cloudbees_config.intel_compiler_root}/env/vars.sh; " if (self.util_prov): cmd += f"export FI_PROVIDER={self.core_prov}\\;{self.util_prov}; " else: cmd += f"export FI_PROVIDER={self.core_prov}; " + cmd += "export FI_IFACE=ib0; " cmd += "export I_MPI_FABRICS=ofi; " cmd += f"export LD_LIBRARY_PATH={self.impi_src}/lib:$LD_LIBRARY_PATH; " cmd += f"export LD_LIBRARY_PATH={self.impi_src}/lib/release:"\ @@ -688,58 +690,109 @@ def execute_cmd(self): class MpichTestSuite(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, mpitype, ofi_build_mode, user_env, log_file, util_prov=None): + hosts, mpitype, ofi_build_mode, user_env, log_file, util_prov=None, weekly=None): super().__init__(jobname, buildno, testname, core_prov, fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, util_prov) - - self.mpichsuitepath = f'{self.middlewares_path}/{mpitype}/'\ - 'mpichsuite/test/mpi/' + self.mpichpath = f'{cloudbees_config.build_dir}/mpich_temp/mpich-4.1/' + self.mpichsuitepath = f'{self.mpichpath}/test/mpi/' + self.impi_mpichtestpath = f'{cloudbees_config.build_dir}/impi_mpichtest/mpich-test' self.pwd = os.getcwd() self.mpi_type = mpitype - - def testgroup(self, testgroupname): - testpath = f'{self.mpichsuitepath}/{testgroupname}' - tests = [] - with open(f'{testpath}/testlist') as file: - for line in file: - if(line[0] != '#' and line[0] != '\n'): - tests.append((line.rstrip('\n')).split(' ')) - - return tests - - def set_options(self, nprocs, timeout=None): - self.mpi.n = nprocs - if (timeout != None): - os.environ['MPIEXEC_TIMEOUT']=timeout - + self.weekly = weekly + + def create_hostfile(self, file, hostlist): + with open(file, "w") as f: + for host in hostlist: + f.write(f"{host}\n") + + def update_testlists(self, filename, category): + with open(filename, 'r') as file: + lines = file.read().splitlines() + for line in lines: + #print("line is:" + line) + if (line == category): + print("commenting out:" + line) + lines[lines.index(line)] = f'#{line}' + else: + continue + with open(filename, 'w') as file: + file.write('\n'.join(lines)) + + def exclude_tests(self, test_root, provider, mpi): + categories = cloudbees_config.mpichtests_exclude[f'{provider}_{mpi}'] + for path,items in categories.items(): + filename = f'{test_root}/{path}/testlist' + print(filename) + for item in items: + self.update_testlists(filename, item) + + def build_mpich(self): + if (os.path.exists(f'{self.mpichpath}/config.log') !=True): + print("configure mpich") + os.chdir(self.mpichpath) + configure_cmd = f'./configure \ + --prefix=/home/cstbuild/cloudbees_middlewares/mpich_temp \ + --with-libfabric={self.mpi.libfab_installpath} \ + --disable-oshmem --disable-fortran --without-ch4-shmmods \ + --with-device=ch4:ofi --without-ze' + print(configure_cmd) + common.run_command(['./autogen.sh']) + common.run_command(shlex.split(configure_cmd)) + common.run_command(['make','-j']) + common.run_command(['make','install']) + os.chdir(self.pwd) @property def execute_condn(self): - return (self.mpi_type == 'impi' or \ - (self.mpi_type == 'mpich' and self.core_prov == 'verbs')) - - def execute_cmd(self, testgroupname): - print("Running Tests: " + testgroupname) - tests = [] - time = None - os.chdir(f'{self.mpichsuitepath}/{testgroupname}') - tests = self.testgroup(testgroupname) - for test in tests: - testname = test[0] - nprocs = test[1] - args = test[2:] - for item in args: - itemlist = item.split('=') - if (itemlist[0] == 'timelimit'): - time = itemlist[1] - self.set_options(nprocs, timeout=time) - testcmd = f'./{testname}' - outputcmd = shlex.split(self.mpi.env + self.mpi.cmd + testcmd + '\'') - common.run_command(outputcmd) - os.chdir(self.pwd) - + return ((self.mpi_type == 'impi' or \ + self.mpi_type == 'mpich') and \ + (self.core_prov == 'verbs' or self.core_prov == 'tcp')) + def execute_cmd(self): + if (self.mpi_type == 'mpich'): + configure_cmd = './configure' + if (self.weekly): + print(f'Weekly {self.mpi_type} mpichsuite tests') + os.chdir(self.mpichsuitepath) + common.run_command(shlex.split(self.mpi.env + configure_cmd +'\'')) + self.exclude_tests(self.mpichsuitepath, self.core_prov, self.mpi_type) + testcmd = 'make testing' + outputcmd = shlex.split(self.mpi.env +testcmd + '\'') + common.run_command(outputcmd) + os.chdir(self.pwd) + else: + #PR Tests + print(f"PR {self.mpi_type} mpichsuite tests") + os.chdir(self.mpichsuitepath) + configure_cmd += f' --with-mpi={cloudbees_config.build_dir}/mpich_temp' + common.run_command(shlex.split(configure_cmd)) + common.run_command(['make', '-j']) + self.exclude_tests(self.mpichsuitepath, self.core_prov, self.mpi_type) + testcmd = "./runtests -tests=testlist -debug -verbose" + common.run_command(shlex.split(self.mpi.env +testcmd + '\'')) + os.chdir(self.pwd) + if (self.mpi_type == 'impi' and self.weekly == True): + print (f'Weekly {self.mpi_type} mpichsuite tests') + os.chdir(self.impi_mpichtestpath) + print(self.hosts) + self.create_hostfile(f'{self.impi_mpichtestpath}/hostfile', + self.hosts) + os.environ["I_MPI_HYDRA_HOST_FILE"] = \ + f'{self.impi_mpichtestpath}/hostfile' + test_root = f'{self.impi_mpichtestpath}/test/mpi' + self.exclude_tests(test_root, self.core_prov, self.mpi_type) + # only append to this file, do not write as this will override the contents + with open(f'{self.impi_mpichtestpath}/intel/test/mpich-test/known-fail-lin',"a") as f: + f.write('# ofi testing excludes\n') + f.write(f'lin {self.core_prov} * * * * rma/[A-Za-z].*\n') + prov = self.core_prov + if(self.util_prov): + prov += f";{self.util_prov}" + test_cmd = f'export I_MPI_HYDRA_HOST_FILE={self.impi_mpichtestpath}/hostfile; ' + test_cmd += f'./test.sh --exclude lin,{self.core_prov},*,*,*,*' + common.run_command(shlex.split(self.mpi.env + test_cmd +'\'')) + os.chdir(self.pwd) class OneCCLTests(Test):