From f4aa232a64ac4c7ab689dfcb7b34d2ff493006ea Mon Sep 17 00:00:00 2001 From: Nikhil Date: Mon, 18 Sep 2023 14:31:23 -0700 Subject: [PATCH] intel/ci: debug mpich osu failing in CI Signed-off-by: Nikhil Nanal --- contrib/intel/jenkins/Jenkinsfile | 333 ++---------------------------- contrib/intel/jenkins/build.py | 29 ++- contrib/intel/jenkins/run.py | 6 +- contrib/intel/jenkins/tests.py | 58 ++---- 4 files changed, 65 insertions(+), 361 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 9d268ec16b5..56eb45b96b8 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -3,7 +3,7 @@ import groovy.transform.Field properties([disableConcurrentBuilds(abortPrevious: true)]) @Field def DO_RUN=true @Field def TARGET="main" -@Field def SCRIPT_LOCATION="py_scripts/contrib/intel/jenkins" +@Field def SCRIPT_LOCATION="contrib/intel/jenkins" @Field def RELEASE=false @Field def BUILD_MODES=["reg", "dbg", "dl"] @Field def MPI_TYPES=["impi", "mpich", "ompi"] @@ -305,44 +305,12 @@ pipeline { "${env.LOG_DIR}/libfabric_mpich_log", """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ --build_item=libfabric_mpich """ - ) - } - } - } - } - stage ('build-daos') { - agent { - node { - label 'daos_head' - customWorkspace CUSTOM_WORKSPACE - } - } - steps { - script { - checkout_py_scripts() - dir (CUSTOM_WORKSPACE) { - build("logdir") - build("libfabric", "reg", "daos") - build("fabtests", "reg") - } - } - } - } - stage ('build-gpu') { - agent { - node { - label 'ze' - customWorkspace CUSTOM_WORKSPACE - } - } - steps { - script { - checkout_py_scripts() - dir (CUSTOM_WORKSPACE) { - build("logdir") - build("builddir") - build("libfabric", "reg", "gpu") - build("fabtests", "reg") + ) + slurm_batch("squirtle,totodile", "1", + "${env.LOG_DIR}/build_mpich_log", + """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ + --build_item=mpich """ + ) } } } @@ -352,22 +320,6 @@ pipeline { stage('parallel-tests') { when { equals expected: true, actual: DO_RUN } parallel { - stage('MPI_verbs-rxm_IMB') { - steps { - script { - dir (RUN_LOCATION) { - def providers = [["verbs", "rxm"]] - for (mpi in MPI_TYPES) { - for (imb_grp = 1; imb_grp < 4; imb_grp++) { - run_middleware(providers, "MPI", "IMB", - "squirtle,totodile", "2", "${mpi}", - "${imb_grp}") - } - } - } - } - } - } stage('MPI_verbs-rxm_OSU') { steps { script { @@ -389,105 +341,12 @@ pipeline { for (mpi in MPI_TYPES) { for (imb_grp = 1; imb_grp < 4; imb_grp++) { run_middleware(providers, "MPI", "IMB", - "bulbasaur", "2", "${mpi}", "${imb_grp}") + "bulbasaur", "2", "${mpi}", "${imb_grp}") } run_middleware(providers, "MPI", "osu", "bulbasaur", "2", - "${mpi}") - } - } - } - } - } - stage('tcp') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("tcp", "bulbasaur", "2", "tcp") - } - } - } - } - stage('verbs-rxm') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("verbs-rxm", "squirtle,totodile", "2", "verbs", - "rxm") - run_fabtests("verbs-rxm", "squirtle,totodile", "2", "verbs", - "rxm", "FI_MR_CACHE_MAX_COUNT=0") - run_fabtests("verbs-rxm", "squirtle,totodile", "2", "verbs", - "rxm", "FI_MR_CACHE_MONITOR=userfaultfd") - } - } - } - } - stage('verbs-rxd') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("verbs-rxd", "squirtle", "2", "verbs", - "rxd") - run_fabtests("verbs-rxd", "squirtle", "2", "verbs", - "rxd", "FI_MR_CACHE_MAX_COUNT=0") - run_fabtests("verbs-rxd", "squirtle", "2", "verbs", - "rxd", "FI_MR_CACHE_MONITOR=userfaultfd") - } - } - } - } - stage('udp') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("udp", "bulbasaur", "2", "udp") - } - } - } - } - stage('shm') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("shm", "bulbasaur", "1", "shm") - run_fabtests("shm", "bulbasaur", "1", "shm", null, - "FI_SHM_DISABLE_CMA=1") - } - } - } - } - stage('sockets') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("sockets", "bulbasaur", "2", "sockets") - } - } - } - } - stage('ucx') { - steps { - script { - dir (CUSTOM_WORKSPACE) { - for (mode in BUILD_MODES) { - echo "Building Libfabric $mode" - build("libfabric", "${mode}", null, false, "--ucx") - echo "Building Fabtests $mode" - build("fabtests", "${mode}", null, false, "--ucx") + "${mpi}") } } - dir (RUN_LOCATION) { - run_fabtests("ucx", "totodile", "2", "ucx") - } - } - } - } - stage('psm3') { - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("psm3", "squirtle", "2", "psm3", null, - "PSM3_IDENTIFY=1") - } } } } @@ -498,180 +357,12 @@ pipeline { def providers = [['tcp', null], ["verbs","rxm"]] for (mpi in MPI_TYPES) { run_middleware(providers, "mpichtestsuite", "mpichtestsuite", - "squirtle,totodile", "2", "${mpi}") - } - } - } - } - } - stage('SHMEM') { - steps { - script { - dir (RUN_LOCATION) { - run_middleware([["verbs", null], ["tcp", null], - ["sockets", null]], "SHMEM", "shmem", - "squirtle,totodile", "2") - } - } - } - } - stage ('multinode_performance') { - steps { - script { - dir (RUN_LOCATION) { - run_middleware([["tcp", null]], "multinode_performance", - "multinode", "bulbasaur", "2") - } - } - } - } - stage ('oneCCL') { - steps { - script { - dir (RUN_LOCATION) { - run_middleware([["tcp", "rxm"]/*, ["psm3", null]*/], "oneCCL", - "oneccl", "bulbasaur", "2") - } - } - } - } - stage ('oneCCL-GPU') { - steps { - script { - dir (RUN_LOCATION) { - run_middleware([["verbs", "rxm"]], "oneCCL-GPU", "onecclgpu", - "charmander", "2") - } - } - } - } - stage ('oneCCL-GPU-v3') { - agent { node { label 'ze' } } - options { skipDefaultCheckout() } - steps { - script { - dir (RUN_LOCATION) { - run_middleware([["verbs", "rxm"]], "oneCCL-GPU-v3", "onecclgpu", - "fabrics-ci", "2") - } - } - } - } - stage('daos_tcp') { - agent { node { label 'daos_tcp' } } - options { skipDefaultCheckout() } - steps { - script { - dir (RUN_LOCATION) { - run_python(PYTHON_VERSION, - """runtests.py --prov='tcp' --util='rxm' \ - --test=daos \ - --log_file=${env.LOG_DIR}/daos_tcp-rxm_reg""") - } - } - } - } - stage('daos_verbs') { - agent { node { label 'daos_verbs' } } - options { skipDefaultCheckout() } - steps { - script { - dir (RUN_LOCATION) { - run_python(PYTHON_VERSION, - """runtests.py --prov='verbs' --util='rxm' \ - --test=daos \ - --log_file=${env.LOG_DIR}/daos_verbs-rxm_reg""") - } - } - } - } - stage ('DMABUF-Tests') { - agent { node { label 'ze' } } - options { skipDefaultCheckout() } - steps { - script { - dir ("${env.WORKSPACE}/${SCRIPT_LOCATION}/") { - dmabuf_output = "${LOG_DIR}/DMABUF-Tests_verbs-rxm_dmabuf_reg" - cmd = """ python3.9 runtests.py --test=dmabuf \ - --prov=verbs --util=rxm""" - slurm_batch("fabrics-ci", "1", "${dmabuf_output}", "${cmd}") - slurm_batch("fabrics-ci", "2", "${dmabuf_output}", "${cmd}") - } - } - } - } - stage ('ze-shm') { - steps { - script { - dir (RUN_LOCATION) { - def providers = [["shm", null]] - def directions = ["h2d", "d2d", "xd2d"] - def base_cmd = "python3.9 runtests.py --device=ze" - def prefix = "${env.LOG_DIR}/ze_" - def suffix = "_reg" - for (prov in providers) { - for (way in directions) { - if (prov[1]) { - echo "Running ${prov[0]}-${prov[1]} ze" - slurm_batch("charmander", "1", - "${prefix}${prov[0]}-${prov[1]}_${way}${suffix}", - """${base_cmd} --prov=${prov[0]} \ - --util=${prov[1]} --way=${way}""") - } else { - echo "Running ${prov[0]} ze" - slurm_batch("charmander", "1", - "${prefix}${prov[0]}_${way}${suffix}", - "${base_cmd} --prov=${prov[0]} --way=${way}") - } - } + "squirtle,totodile", "2", "${mpi}") } } } } } - stage ('ze-shm-v3') { - agent { node { label 'ze' } } - options { skipDefaultCheckout() } - steps { - script { - dir (RUN_LOCATION) { - def providers = [["shm", null]] - def directions = ["h2d", "d2d", "xd2d"] - def base_cmd = "python3.9 runtests.py --device=ze" - def prefix = "${env.LOG_DIR}/ze_v3_" - def suffix = "_reg" - for (prov in providers) { - for (way in directions) { - if (prov[1]) { - echo "Running ${prov[0]}-${prov[1]} ze" - slurm_batch("fabrics-ci", "1", - "${prefix}${prov[0]}-${prov[1]}_${way}${suffix}", - """${base_cmd} --prov=${prov[0]} \ - --util=${prov[1]} --way=${way}""") - } else { - echo "Running ${prov[0]} ze" - slurm_batch("fabrics-ci", "1", - "${prefix}${prov[0]}_${way}${suffix}", - "${base_cmd} --prov=${prov[0]} --way=${way}") - } - } - } - } - } - } - } - stage('dsa') { - when { equals expected: true, actual: DO_RUN } - steps { - script { - dir (RUN_LOCATION) { - run_fabtests("shm_dsa", "mudkip", "1", "shm", null, - """FI_SHM_DISABLE_CMA=1 FI_SHM_USE_DSA_SAR=1 \ - FI_LOG_LEVEL=warn""") - } - } - } - } } } stage ('Summary') { @@ -712,7 +403,7 @@ pipeline { node ('ze') { dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } } - dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } + /*dir ("${DELETE_LOCATION}/middlewares") { deleteDir() }*/ } cleanup { node ('daos_head') { @@ -725,7 +416,7 @@ pipeline { dir("${env.WORKSPACE}") { deleteDir() } dir("${env.WORKSPACE}@tmp") { deleteDir() } } - dir("${DELETE_LOCATION}") { deleteDir() } + /*dir("${DELETE_LOCATION}") { deleteDir() }*/ dir("${env.WORKSPACE}") { deleteDir() } dir("${env.WORKSPACE}@tmp") { deleteDir() } } diff --git a/contrib/intel/jenkins/build.py b/contrib/intel/jenkins/build.py index b29e1a65c17..9c360978087 100755 --- a/contrib/intel/jenkins/build.py +++ b/contrib/intel/jenkins/build.py @@ -93,6 +93,28 @@ def extract_mpich(mpitype): '--strip-components', '1']) os.chdir(cwd) +def build_mpich(libfab_installpath_mpich): + mpich_build_dir = f'{install_path}/middlewares/mpich_mpichtest' + mpich_path = f"{mpich_build_dir}/mpich_mpichsuite" + cwd = os.getcwd() + if (os.path.exists(f"{mpich_build_dir}/bin") !=True): + print("configure mpich") + os.chdir(mpich_path) + configure_cmd = f"./configure " + configure_cmd += f"--prefix={mpich_build_dir} " + configure_cmd += f"--with-libfabric={libfab_installpath_mpich} " + configure_cmd += "--disable-oshmem " + configure_cmd += "--disable-fortran " + configure_cmd += "--without-ch4-shmmods " + configure_cmd += "--with-device=ch4:ofi " + configure_cmd += "--without-ze " + print(configure_cmd) + common.run_command(['./autogen.sh']) + common.run_command(shlex.split(configure_cmd)) + common.run_command(['make','-j']) + common.run_command(['make','install']) + os.chdir(cwd) + def copy_build_dir(install_path): middlewares_path = f'{install_path}/middlewares' if (os.path.exists(middlewares_path) != True): @@ -102,6 +124,9 @@ def copy_build_dir(install_path): f'{middlewares_path}/shmem') shutil.copytree(f'{cloudbees_config.build_dir}/oneccl', f'{middlewares_path}/oneccl') + + os.symlink(f'{cloudbees_config.build_dir}/mpich', + f'{middlewares_path}/mpich') os.symlink(f'{cloudbees_config.build_dir}/impi', f'{middlewares_path}/impi') os.symlink(f'{cloudbees_config.build_dir}/ompi', @@ -135,7 +160,7 @@ def log_dir(install_path, release=False): parser.add_argument('--build_item', help="build libfabric or fabtests", \ choices=['libfabric', 'libfabric_mpich', 'fabtests', \ 'builddir', 'logdir', 'extract_mpich', \ - 'extract_impi_mpich']) + 'extract_impi_mpich', 'mpich']) parser.add_argument('--ofi_build_mode', help="select buildmode libfabric "\ "build mode", choices=['reg', 'dbg', 'dl']) parser.add_argument('--build_cluster', help="build libfabric on specified cluster", \ @@ -170,6 +195,8 @@ def log_dir(install_path, release=False): elif (build_item == 'libfabric_mpich'): build_libfabric(f'{libfab_install_path}/libfabric_mpich', ofi_build_mode, cluster) + elif (build_item == 'mpich'): + build_mpich(f'{libfab_install_path}/libfabric_mpich') elif (build_item == 'fabtests'): build_fabtests(libfab_install_path, ofi_build_mode) elif (build_item == 'extract_mpich'): diff --git a/contrib/intel/jenkins/run.py b/contrib/intel/jenkins/run.py index c0d05d258f3..751e989b73f 100755 --- a/contrib/intel/jenkins/run.py +++ b/contrib/intel/jenkins/run.py @@ -137,9 +137,9 @@ def mpich_test_suite(core, hosts, mpi, mode, user_env, log_file, util, weekly=No print('-------------------------------------------------------------------') if (mpich_tests.execute_condn == True): - if (mpi == "mpich"): - print("Building mpich") - mpich_tests.build_mpich() + # if (mpi == "mpich"): + # print("Building mpich") + # mpich_tests.build_mpich() print(f"Running mpichtestsuite for {core}-{util}-{fab}-{mpi}") mpich_tests.execute_cmd() else: diff --git a/contrib/intel/jenkins/tests.py b/contrib/intel/jenkins/tests.py index 5c96fccc9e0..32fd4612e3b 100755 --- a/contrib/intel/jenkins/tests.py +++ b/contrib/intel/jenkins/tests.py @@ -52,7 +52,8 @@ def __init__ (self, jobname, buildno, testname, core_prov, fabric, if (self.mpi_type == 'impi'): self.mpi = IMPI(self.core_prov, self.hosts, self.libfab_installpath, self.nw_interface, - self.server, self.client, self.env, self.util_prov) + self.server, self.client, self.env, + self.middlewares_path, self.util_prov) elif (self.mpi_type == 'ompi'): self.mpi = OMPI(self.core_prov, self.hosts, self.libfab_installpath, self.nw_interface, @@ -451,8 +452,8 @@ class MPICH: def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, server, client, environ, middlewares_path, util_prov=None): - self.mpich_dir = f'{middlewares_path}/mpich_mpichtests' - self.mpich_src = f'{self.mpich_dir}/mpich_mpichsuite' + self.mpich_dir = f'{middlewares_path}/mpich_mpichtest' + self.mpichpath = f'{self.mpich_dir}/mpich_mpichsuite' self.core_prov = core_prov self.hosts = hosts self.util_prov = util_prov @@ -496,14 +497,15 @@ def options(self): @property def cmd(self): - return f"{self.mpich_src}/bin/mpirun {self.options}" - + return f"{self.mpich_dir}/bin/mpirun {self.options}" class IMPI: def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, - server, client, environ, util_prov=None): + server, client, environ, middlewares_path, util_prov=None): self.impi_src = f'{cloudbees_config.impi_root}' + self.mpichpath = f"{middlewares_path}/impi_mpichtest/" \ + f"impi_mpichsuite/" self.core_prov = core_prov self.hosts = hosts self.util_prov = util_prov @@ -669,12 +671,14 @@ def osu_cmd(self, test_type, test): print(f"Running OSU-{test_type}-{test}") cmd = f'{self.osu_src}/{test_type}/{test} ' return cmd - + def execute_cmd(self): assert(self.osu_src) + print(self.osu_src) p = re.compile('osu_put*') for root, dirs, tests in os.walk(self.osu_src): for test in tests: + print(test) self.mpi.n = self.n_ppn[os.path.basename(root)][0] self.mpi.ppn = self.n_ppn[os.path.basename(root)][1] @@ -685,7 +689,10 @@ def execute_cmd(self): osu_command = self.osu_cmd(os.path.basename(root), test) outputcmd = shlex.split(self.mpi.env + self.mpi.cmd + \ osu_command + '\'') + print(outputcmd) common.run_command(outputcmd) + else: + print("skipped condition") if (test == 'osu_latency_mp' and self.core_prov == 'verbs'): self.env.pop('IBV_FORK_SAFE') @@ -699,10 +706,8 @@ def __init__(self, jobname, buildno, testname, core_prov, fabric, super().__init__(jobname, buildno, testname, core_prov, fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, util_prov) - self.mpi_type = mpitype - self.mpichpath = f"{self.middlewares_path}/{self.mpi_type}_mpichtest/" \ - f"{self.mpi_type}_mpichsuite/" - self.mpichsuitepath = f'{self.mpichpath}/test/mpi/' + self.mpi_type = mpitype + self.mpichsuitepath = f'{self.mpi.mpichpath}/test/mpi/' self.pwd = os.getcwd() self.weekly = weekly self.mpichtests_exclude = { @@ -752,34 +757,15 @@ def exclude_tests(self, test_root, provider): else: #item[1]=test print(f'excluding:{path}/{item[0]}') - def build_mpich(self): - if (os.path.exists(f'{self.mpichpath}/config.log') !=True): - print("configure mpich") - os.chdir(self.mpichpath) - configure_cmd = f"./configure " \ - f"--prefix={self.middlewares_path}/{self.mpi_type}_mpichtest " - configure_cmd += f"--with-libfabric={self.mpi.libfab_installpath} " - configure_cmd += "--disable-oshmem " - configure_cmd += "--disable-fortran " - configure_cmd += "--without-ch4-shmmods " - configure_cmd += "--with-device=ch4:ofi " - configure_cmd += "--without-ze " - print(configure_cmd) - common.run_command(['./autogen.sh']) - common.run_command(shlex.split(configure_cmd)) - common.run_command(['make','-j']) - common.run_command(['make','install']) - os.chdir(self.pwd) - @property def execute_condn(self): return ((self.mpi_type == 'impi' or \ self.mpi_type == 'mpich') and \ (self.core_prov == 'verbs' or self.core_prov == 'tcp')) + def execute_cmd(self): if (self.mpi_type == 'mpich'): - configure_cmd = f"./configure --with-mpi={self.middlewares_path}/" \ - f"{self.mpi_type}_mpichtest " + configure_cmd = f"./configure --with-mpi={self.mpi.mpich_dir} " if (self.weekly): print(f'Weekly {self.mpi_type} mpichsuite tests') os.chdir(self.mpichsuitepath) @@ -808,14 +794,14 @@ def execute_cmd(self): os.chdir(self.pwd) if (self.mpi_type == 'impi' and self.weekly == True): print (f'Weekly {self.mpi_type} mpichsuite tests') - os.chdir(self.mpichpath) + os.chdir(self.mpichsuitepath) print(self.hosts) - self.create_hostfile(f'{self.mpichpath}/hostfile', + self.create_hostfile(f'{self.mpi.mpichpath}/hostfile', self.hosts) os.environ["I_MPI_HYDRA_HOST_FILE"] = \ - f'{self.mpichpath}/hostfile' + f'{self.mpi.mpichpath}/hostfile' test_cmd = f"export I_MPI_HYDRA_HOST_FILE=" \ - f"{self.mpichpath}/hostfile; " + f"{self.mpi.mpichpath}/hostfile; " test_cmd += f"./test.sh --exclude lin,{self.core_prov},*,*,*,*; " common.run_command(shlex.split(self.mpi.env + test_cmd + '\'')) common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \