diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index c2b9ba61ad0..9d268ec16b5 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -8,6 +8,7 @@ properties([disableConcurrentBuilds(abortPrevious: true)]) @Field def BUILD_MODES=["reg", "dbg", "dl"] @Field def MPI_TYPES=["impi", "mpich", "ompi"] @Field def PYTHON_VERSION="3.9" +@Field def TIMEOUT="3600" def run_python(version, command, output=null) { if (output != null) @@ -17,8 +18,9 @@ def run_python(version, command, output=null) { } def slurm_batch(partition, node_num, output, command) { + try { - sh """timeout 3600 sbatch --partition=${partition} -N ${node_num} \ + sh """timeout $TIMEOUT sbatch --partition=${partition} -N ${node_num} \ --wait -o ${output} --open-mode=append --wrap=\'env; ${command}\' """ } catch (Exception e) { @@ -63,6 +65,9 @@ def run_middleware(providers, stage_name, test, partition, node_num, mpi=null, if (imb_grp) base_cmd = "${base_cmd} --imb_grp=${imb_grp}" + if (env.WEEKLY.toBoolean()) + base_cmd = "${base_cmd} --weekly=${env.WEEKLY}" + for (prov in providers) { if (prov[1]) { echo "Running ${prov[0]}-${prov[1]} ${stage_name}" @@ -225,7 +230,7 @@ pipeline { } options { timestamps() - timeout(activity: true, time: 1, unit: 'HOURS') + timeout(activity: true, time: 6, unit: 'HOURS') } environment { JOB_CADENCE = 'PR' @@ -235,7 +240,6 @@ pipeline { RUN_LOCATION="${env.WORKSPACE}/${SCRIPT_LOCATION}/" CUSTOM_WORKSPACE="${CB_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" } - stages { stage ('opt-out') { steps { @@ -250,6 +254,9 @@ pipeline { } else { weekly = env.WEEKLY.toBoolean() } + if (weekly) { + TIMEOUT="21600" + } skip = skip() RELEASE = release() if (skip && !weekly) { @@ -258,6 +265,19 @@ pipeline { } } } + stage ('prepare build') { + when { equals expected: true, actual: DO_RUN } + steps { + script { + echo "Copying build dirs." + build("builddir") + echo "Copying log dirs." + build("logdir", null, null, RELEASE) + build("extract_mpich") + build("extract_impi_mpich") + } + } + } stage ('parallel-builds') { when { equals expected: true, actual: DO_RUN } parallel { @@ -265,10 +285,6 @@ pipeline { steps { script { dir (CUSTOM_WORKSPACE) { - echo "Copying build dirs." - build("builddir") - echo "Copying log dirs." - build("logdir", null, null, RELEASE) for (mode in BUILD_MODES) { echo "Building Libfabric $mode" build("libfabric", "$mode") @@ -279,6 +295,21 @@ pipeline { } } } + stage ('buildmpich-libfabric') { + steps { + script { + dir("${CUSTOM_WORKSPACE}/mpich"){ + checkout scm + echo "Building Libfabric reg" + slurm_batch("squirtle,totodile", "1", + "${env.LOG_DIR}/libfabric_mpich_log", + """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ + --build_item=libfabric_mpich """ + ) + } + } + } + } stage ('build-daos') { agent { node { @@ -464,8 +495,7 @@ pipeline { steps { script { dir (RUN_LOCATION) { - def providers = [["verbs", "rxm"], ["tcp", null], - ["tcp", "rxm"], ["sockets", null]] + def providers = [['tcp', null], ["verbs","rxm"]] for (mpi in MPI_TYPES) { run_middleware(providers, "mpichtestsuite", "mpichtestsuite", "squirtle,totodile", "2", "${mpi}") @@ -700,4 +730,4 @@ pipeline { dir("${env.WORKSPACE}@tmp") { deleteDir() } } } -} +} \ No newline at end of file diff --git a/contrib/intel/jenkins/build.py b/contrib/intel/jenkins/build.py index f63d1f0eded..b29e1a65c17 100755 --- a/contrib/intel/jenkins/build.py +++ b/contrib/intel/jenkins/build.py @@ -41,7 +41,7 @@ def build_libfabric(libfab_install_path, mode, cluster=None, ucx=None): for op in common.common_disable_list: config_cmd.append(f'--enable-{op}=no') - if (cluster == 'default'): + if (cluster == 'default' and build_item != 'libfabric_mpich'): for op in common.default_enable_list: config_cmd.append(f'--enable-{op}') @@ -69,6 +69,30 @@ def build_fabtests(libfab_install_path, mode): common.run_command(['make', '-j32']) common.run_command(['make', 'install']) +def extract_mpich(mpitype): + + dest = f'{install_path}/middlewares/{mpitype}_mpichtest' + if (mpitype == 'mpich'): + src_dir = 'mpich' + mpich_tar = cloudbees_config.mpich_tar + elif (mpitype == 'impi'): + src_dir = 'impi_mpichtest' + mpich_tar = cloudbees_config.impi_mpichtest_tar + else: + print(f"Invalid mpi type {mpitype}") + sys.exit(-1) + + cwd = os.getcwd() + if (os.path.exists(dest)): + shutil.rmtree(dest) + os.makedirs(f'{dest}/{mpitype}_mpichsuite') + os.chdir(f'{cloudbees_config.scm_dir}/{src_dir}/') + common.run_command(['tar', '-xvf', + f"{cloudbees_config.scm_dir}/{src_dir}/{mpich_tar}", + '-C', f'{dest}/{mpitype}_mpichsuite', + '--strip-components', '1']) + os.chdir(cwd) + def copy_build_dir(install_path): middlewares_path = f'{install_path}/middlewares' if (os.path.exists(middlewares_path) != True): @@ -78,9 +102,6 @@ def copy_build_dir(install_path): f'{middlewares_path}/shmem') shutil.copytree(f'{cloudbees_config.build_dir}/oneccl', f'{middlewares_path}/oneccl') - - os.symlink(f'{cloudbees_config.build_dir}/mpich', - f'{middlewares_path}/mpich') os.symlink(f'{cloudbees_config.build_dir}/impi', f'{middlewares_path}/impi') os.symlink(f'{cloudbees_config.build_dir}/ompi', @@ -111,12 +132,12 @@ def log_dir(install_path, release=False): workspace = os.environ['WORKSPACE'] parser = argparse.ArgumentParser() - parser.add_argument('--build_item', help="build libfabric or fabtests", - choices=['libfabric', 'fabtests', 'builddir', 'logdir']) - + parser.add_argument('--build_item', help="build libfabric or fabtests", \ + choices=['libfabric', 'libfabric_mpich', 'fabtests', \ + 'builddir', 'logdir', 'extract_mpich', \ + 'extract_impi_mpich']) parser.add_argument('--ofi_build_mode', help="select buildmode libfabric "\ "build mode", choices=['reg', 'dbg', 'dl']) - parser.add_argument('--build_cluster', help="build libfabric on specified cluster", \ choices=['daos', 'gpu'], default='default') parser.add_argument('--release', help="This job is likely testing a "\ @@ -145,11 +166,16 @@ def log_dir(install_path, release=False): p = re.compile('mpi*') if (build_item == 'libfabric'): - build_libfabric(libfab_install_path, ofi_build_mode, cluster, ucx) - + build_libfabric(libfab_install_path, ofi_build_mode, cluster, ucx) + elif (build_item == 'libfabric_mpich'): + build_libfabric(f'{libfab_install_path}/libfabric_mpich', + ofi_build_mode, cluster) elif (build_item == 'fabtests'): build_fabtests(libfab_install_path, ofi_build_mode) - + elif (build_item == 'extract_mpich'): + extract_mpich('mpich') + elif (build_item == 'extract_impi_mpich'): + extract_mpich('impi') elif (build_item == 'builddir'): copy_build_dir(install_path) diff --git a/contrib/intel/jenkins/run.py b/contrib/intel/jenkins/run.py index 70fc7dfdc9f..c0d05d258f3 100755 --- a/contrib/intel/jenkins/run.py +++ b/contrib/intel/jenkins/run.py @@ -126,18 +126,22 @@ def intel_mpi_benchmark(core, hosts, mpi, mode, group, user_env, log_file, util) print(f"Skipping {mpi.upper} {imb.testname} as execute condition fails") print('-------------------------------------------------------------------') -def mpich_test_suite(core, hosts, mpi, mode, user_env, log_file, util): +def mpich_test_suite(core, hosts, mpi, mode, user_env, log_file, util, weekly=None): mpich_tests = tests.MpichTestSuite(jobname=jbname,buildno=bno, testname="MpichTestSuite",core_prov=core, fabric=fab, mpitype=mpi, hosts=hosts, ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) + log_file=log_file, util_prov=util, + weekly=weekly) print('-------------------------------------------------------------------') if (mpich_tests.execute_condn == True): - print(f"Running mpichtestsuite: Spawn Tests for {core}-{util}-{fab}-{mpi}") - mpich_tests.execute_cmd("spawn") + if (mpi == "mpich"): + print("Building mpich") + mpich_tests.build_mpich() + print(f"Running mpichtestsuite for {core}-{util}-{fab}-{mpi}") + mpich_tests.execute_cmd() else: print(f"Skipping {mpi.upper()} {mpich_tests.testname} as exec condn fails") print('-------------------------------------------------------------------') diff --git a/contrib/intel/jenkins/runtests.py b/contrib/intel/jenkins/runtests.py index a12e1ae0a2c..02faf6aad69 100755 --- a/contrib/intel/jenkins/runtests.py +++ b/contrib/intel/jenkins/runtests.py @@ -37,6 +37,7 @@ def __call__(self, parser, namespace, values, option_string=None): choices=['impi', 'mpich', 'ompi'], default='impi') parser.add_argument('--log_file', help="Full path to log file", default=os.environ['DEFAULT_LOG_LOCATION'], type=str) +parser.add_argument('--weekly', help="run weekly", default=False, type=bool) args = parser.parse_args() args_core = args.prov @@ -45,6 +46,7 @@ def __call__(self, parser, namespace, values, option_string=None): args_device = args.device user_env = args.user_env log_file = args.log_file +weekly = args.weekly if (args.ofi_build_mode): ofi_build_mode = args.ofi_build_mode @@ -131,7 +133,7 @@ def __call__(self, parser, namespace, values, option_string=None): if (run_test == 'all' or run_test == 'mpichtestsuite'): run.mpich_test_suite(args_core, hosts, mpi, ofi_build_mode, user_env, log_file, - args_util) + args_util, weekly) if (run_test == 'all' or run_test == 'IMB'): run.intel_mpi_benchmark(args_core, hosts, mpi, diff --git a/contrib/intel/jenkins/summary.py b/contrib/intel/jenkins/summary.py index 62d0bc51c87..5ead404a46f 100755 --- a/contrib/intel/jenkins/summary.py +++ b/contrib/intel/jenkins/summary.py @@ -530,27 +530,41 @@ def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name): super().__init__(logger, log_dir, prov, file_name, stage_name) self.mpi = mpi - if self.mpi == 'impi': - self.run = '/mpiexec' - else: - self.run = '/mpirun' + self.run = 'mpiexec' + + def read_file(self): + previous = "" + with open(self.file_path,'r') as log_file: + for line in log_file: + line = line.lower().strip() + super().check_features(previous, line) + super().check_node(line) + super().check_line(line) + previous = line + + def check_exclude(self, line): + if line.startswith('excluding:'): + test = line.split(':')[-1] + self.excludes += 1 + self.excluded_tests.append(test) def check_name(self, line): - if self.run in line: - self.name = line.split()[len(line.split()) - 1].split('/')[1] - #assume pass + if (line.startswith('ok') or + line.startswith('not ok')): + self.name = line.split('-')[1].split('#')[0].strip() + + def check_pass(self, line): + if (line.startswith('ok') and not + line.split('#')[1].strip().startswith('skip')): self.passes += 1 self.passed_tests.append(self.name) def check_fail(self, line): - # Fail cases take away assumed pass - if "exiting with" in line: + if (line.startswith('not ok') and not + line.split('#')[1].strip().startswith('skip')): self.fails += 1 - self.passes -= 1 - self.failed_tests.append(f'{self.name}') - #skip to next test - while self.run not in line: - line = self.log.readline().lower() + self.failed_tests.append(self.name) + class ImbSummarizer(Summarizer): def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name): @@ -806,7 +820,7 @@ def summarize_items(summary_item, logger, log_dir, mode): if summary_item == 'mpichtestsuite' or summary_item == 'all': for mpi in mpi_list: - for item in ['tcp-rxm', 'verbs-rxm', 'sockets', 'tcp']: + for item in ['tcp', 'verbs-rxm']: ret = MpichTestSuiteSummarizer( logger, log_dir, item, mpi, f'mpichtestsuite_{item}_{mpi}_'\ diff --git a/contrib/intel/jenkins/tests.py b/contrib/intel/jenkins/tests.py index 040ce449f7a..5c96fccc9e0 100755 --- a/contrib/intel/jenkins/tests.py +++ b/contrib/intel/jenkins/tests.py @@ -1,6 +1,6 @@ import sys import os - +import io sys.path.append(os.environ['CLOUDBEES_CONFIG']) import subprocess @@ -451,11 +451,12 @@ class MPICH: def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, server, client, environ, middlewares_path, util_prov=None): - self.mpich_src = f'{middlewares_path}/mpich' + self.mpich_dir = f'{middlewares_path}/mpich_mpichtests' + self.mpich_src = f'{self.mpich_dir}/mpich_mpichsuite' self.core_prov = core_prov self.hosts = hosts self.util_prov = util_prov - self.libfab_installpath = libfab_installpath + self.libfab_installpath = f'{libfab_installpath}/libfabric_mpich' self.nw_interface = nw_interface self.server = server self.client = client @@ -472,11 +473,11 @@ def env(self): cmd += f"export FI_PROVIDER={self.core_prov}; " cmd += "export I_MPI_FABRICS=ofi; " cmd += "export MPIR_CVAR_CH4_OFI_ENABLE_ATOMICS=0; " - cmd += "export MPIR_CVAR_CH4_OFI_CAPABILITY_SETS_DEBUG=1; " - cmd += f"export LD_LIBRARY_PATH={self.mpich_src}/lib:$LD_LIBRARY_PATH; " + cmd += "export MPIR_CVAR_CH4_OFI_CAPABILITY_SETS_DEBUG=0; " + cmd += f"export LD_LIBRARY_PATH={self.mpich_dir}/lib:$LD_LIBRARY_PATH; " cmd += f"export LD_LIBRARY_PATH={self.libfab_installpath}/lib/:"\ "$LD_LIBRARY_PATH; " - cmd += f"export PATH={self.mpich_src}/bin:$PATH; " + cmd += f"export PATH={self.mpich_dir}/bin:$PATH; " cmd += f"export PATH={self.libfab_installpath}/bin:$PATH; " return cmd @@ -502,7 +503,7 @@ class IMPI: def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, server, client, environ, util_prov=None): - self.impi_src = cloudbees_config.impi_root + self.impi_src = f'{cloudbees_config.impi_root}' self.core_prov = core_prov self.hosts = hosts self.util_prov = util_prov @@ -518,10 +519,15 @@ def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, def env(self): cmd = f"bash -c \'source {self.impi_src}/env/vars.sh "\ "-i_mpi_ofi_internal=0; " + cmd += f"source {cloudbees_config.intel_compiler_root}/env/vars.sh; " if (self.util_prov): cmd += f"export FI_PROVIDER={self.core_prov}\\;{self.util_prov}; " else: cmd += f"export FI_PROVIDER={self.core_prov}; " + if (self.core_prov == 'tcp'): + cmd += "export FI_IFACE=eth0; " + elif (self.core_prov == 'verbs'): + cmd += "export FI_IFACE=ib0; " cmd += "export I_MPI_FABRICS=ofi; " cmd += f"export LD_LIBRARY_PATH={self.impi_src}/lib:$LD_LIBRARY_PATH; " cmd += f"export LD_LIBRARY_PATH={self.impi_src}/lib/release:"\ @@ -688,58 +694,133 @@ def execute_cmd(self): class MpichTestSuite(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, mpitype, ofi_build_mode, user_env, log_file, util_prov=None): + hosts, mpitype, ofi_build_mode, user_env, log_file, util_prov=None, weekly=None): super().__init__(jobname, buildno, testname, core_prov, fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, util_prov) - - self.mpichsuitepath = f'{self.middlewares_path}/{mpitype}/'\ - 'mpichsuite/test/mpi/' - self.pwd = os.getcwd() self.mpi_type = mpitype + self.mpichpath = f"{self.middlewares_path}/{self.mpi_type}_mpichtest/" \ + f"{self.mpi_type}_mpichsuite/" + self.mpichsuitepath = f'{self.mpichpath}/test/mpi/' + self.pwd = os.getcwd() + self.weekly = weekly + self.mpichtests_exclude = { + 'tcp' : { '.' : [('spawn','dir'), ('rma','dir')], + 'threads' : [('spawn','dir'), ('rma','dir')], + 'errors' : [('spawn','dir'),('rma','dir')] + }, + 'verbs' : { '.' : [('spawn','dir')], + 'threads/comm' : [('idup_nb 4','test')], + 'threads' : [('spawn','dir'), ('rma','dir')], + 'pt2pt' : [('sendrecv3 2','test'), + ('sendrecv3 2 arg=-isendrecv','test')], + 'threads/pt2pt': [(f"mt_improbe_sendrecv_huge 2 " + f"arg=-iter=64 arg=-count=4194304 " + f"env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE" + f"=16384", 'test')] + } + } - def testgroup(self, testgroupname): - testpath = f'{self.mpichsuitepath}/{testgroupname}' - tests = [] - with open(f'{testpath}/testlist') as file: - for line in file: - if(line[0] != '#' and line[0] != '\n'): - tests.append((line.rstrip('\n')).split(' ')) - - return tests - - def set_options(self, nprocs, timeout=None): - self.mpi.n = nprocs - if (timeout != None): - os.environ['MPIEXEC_TIMEOUT']=timeout - + def create_hostfile(self, file, hostlist): + with open(file, "w") as f: + for host in hostlist: + f.write(f"{host}\n") + + def update_testlists(self, filename, category): + with open(filename, 'r') as file: + lines = file.read().splitlines() + for line in lines: + if (line == category): + lines[lines.index(line)] = f'#{line}' + else: + continue + with open(filename, 'w') as file: + file.write('\n'.join(lines)) + + def exclude_tests(self, test_root, provider): + for path,exclude_list in self.mpichtests_exclude[f'{provider}'].items(): + for item in exclude_list: + self.update_testlists(f'{test_root}/{path}/testlist', item[0]) + if (item[1] == 'dir'): + filename = f'{test_root}/{path}/{item[0]}/testlist' + with open(filename,'r') as file: + for line in file: + line = line.strip() + if (not line.startswith('#')): + print(f'excluding:{path}/{item[0]}:{line}') + else: #item[1]=test + print(f'excluding:{path}/{item[0]}') + + def build_mpich(self): + if (os.path.exists(f'{self.mpichpath}/config.log') !=True): + print("configure mpich") + os.chdir(self.mpichpath) + configure_cmd = f"./configure " \ + f"--prefix={self.middlewares_path}/{self.mpi_type}_mpichtest " + configure_cmd += f"--with-libfabric={self.mpi.libfab_installpath} " + configure_cmd += "--disable-oshmem " + configure_cmd += "--disable-fortran " + configure_cmd += "--without-ch4-shmmods " + configure_cmd += "--with-device=ch4:ofi " + configure_cmd += "--without-ze " + print(configure_cmd) + common.run_command(['./autogen.sh']) + common.run_command(shlex.split(configure_cmd)) + common.run_command(['make','-j']) + common.run_command(['make','install']) + os.chdir(self.pwd) @property def execute_condn(self): - return (self.mpi_type == 'impi' or \ - (self.mpi_type == 'mpich' and self.core_prov == 'verbs')) - - def execute_cmd(self, testgroupname): - print("Running Tests: " + testgroupname) - tests = [] - time = None - os.chdir(f'{self.mpichsuitepath}/{testgroupname}') - tests = self.testgroup(testgroupname) - for test in tests: - testname = test[0] - nprocs = test[1] - args = test[2:] - for item in args: - itemlist = item.split('=') - if (itemlist[0] == 'timelimit'): - time = itemlist[1] - self.set_options(nprocs, timeout=time) - testcmd = f'./{testname}' - outputcmd = shlex.split(self.mpi.env + self.mpi.cmd + testcmd + '\'') - common.run_command(outputcmd) - os.chdir(self.pwd) - + return ((self.mpi_type == 'impi' or \ + self.mpi_type == 'mpich') and \ + (self.core_prov == 'verbs' or self.core_prov == 'tcp')) + def execute_cmd(self): + if (self.mpi_type == 'mpich'): + configure_cmd = f"./configure --with-mpi={self.middlewares_path}/" \ + f"{self.mpi_type}_mpichtest " + if (self.weekly): + print(f'Weekly {self.mpi_type} mpichsuite tests') + os.chdir(self.mpichsuitepath) + common.run_command(shlex.split(self.mpi.env + + configure_cmd + '\'')) + self.exclude_tests(self.mpichsuitepath, self.core_prov) + testcmd = 'make testing' + outputcmd = shlex.split(self.mpi.env + testcmd + '\'') + common.run_command(outputcmd) + common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ + f"summary.tap")) + os.chdir(self.pwd) + else: + print(f"PR {self.mpi_type} mpichsuite tests") + os.chdir(self.mpichsuitepath) + common.run_command(shlex.split(self.mpi.env + + configure_cmd + '\'')) + common.run_command(['make', '-j']) + self.exclude_tests(self.mpichsuitepath, self.core_prov) + testcmd = "./runtests -tests=testlist " + testcmd += f" -xmlfile=summary.xml -tapfile=summary.tap " \ + f"-junitfile=summary.junit.xml " + common.run_command(shlex.split(self.mpi.env + testcmd + '\'')) + common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ + f"summary.tap")) + os.chdir(self.pwd) + if (self.mpi_type == 'impi' and self.weekly == True): + print (f'Weekly {self.mpi_type} mpichsuite tests') + os.chdir(self.mpichpath) + print(self.hosts) + self.create_hostfile(f'{self.mpichpath}/hostfile', + self.hosts) + os.environ["I_MPI_HYDRA_HOST_FILE"] = \ + f'{self.mpichpath}/hostfile' + test_cmd = f"export I_MPI_HYDRA_HOST_FILE=" \ + f"{self.mpichpath}/hostfile; " + test_cmd += f"./test.sh --exclude lin,{self.core_prov},*,*,*,*; " + common.run_command(shlex.split(self.mpi.env + test_cmd + '\'')) + common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ + f"summary.tap")) + os.chdir(self.pwd) class OneCCLTests(Test):