Skip to content

Commit

Permalink
intel/ci: debug mpich osu failing in CI
Browse files: browse the repository at this point in the history
Signed-off-by: Nikhil Nanal
  • Loading branch information
nikhilnanal committed Sep 19, 2023
1 parent 13f28f7 commit cfe5218
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 42 deletions.
11 changes: 8 additions & 3 deletions contrib/intel/jenkins/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import groovy.transform.Field
properties([disableConcurrentBuilds(abortPrevious: true)])
@Field def DO_RUN=true
@Field def TARGET="main"
@Field def SCRIPT_LOCATION="py_scripts/contrib/intel/jenkins"
@Field def SCRIPT_LOCATION="contrib/intel/jenkins"
@Field def RELEASE=false
@Field def BUILD_MODES=["reg", "dbg", "dl"]
@Field def MPI_TYPES=["impi", "mpich", "ompi"]
Expand Down Expand Up @@ -305,7 +305,12 @@ pipeline {
"${env.LOG_DIR}/libfabric_mpich_log",
"""python$PYTHON_VERSION ${RUN_LOCATION}/build.py \
--build_item=libfabric_mpich """
)
)
slurm_batch("squirtle,totodile", "1",
"${env.LOG_DIR}/build_mpich_log",
"""python$PYTHON_VERSION ${RUN_LOCATION}/build.py \
--build_item=mpich """
)
}
}
}
Expand Down Expand Up @@ -730,4 +735,4 @@ pipeline {
dir("${env.WORKSPACE}@tmp") { deleteDir() }
}
}
}
}
29 changes: 28 additions & 1 deletion contrib/intel/jenkins/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,28 @@ def extract_mpich(mpitype):
'--strip-components', '1'])
os.chdir(cwd)

def build_mpich(libfab_installpath_mpich):
    """Configure, build and install MPICH against a libfabric install.

    The MPICH sources are taken from
    ``{install_path}/middlewares/mpich_mpichtest/mpich_mpichsuite`` and the
    result is installed into ``{install_path}/middlewares/mpich_mpichtest``.
    Nothing is done when that prefix already contains a ``bin`` directory.

    NOTE(review): ``install_path`` is not a parameter; it is read from the
    enclosing module scope, which is not visible in this hunk -- confirm it
    is defined before this function is called.

    Args:
        libfab_installpath_mpich: libfabric install prefix handed to
            ``./configure --with-libfabric``.
    """
    mpich_build_dir = f'{install_path}/middlewares/mpich_mpichtest'
    mpich_path = f"{mpich_build_dir}/mpich_mpichsuite"
    cwd = os.getcwd()

    # An existing bin/ directory under the prefix means a previous run
    # already installed MPICH there.
    if os.path.exists(f"{mpich_build_dir}/bin"):
        return

    # Build the argument vector directly rather than splitting a string, so
    # that paths containing whitespace stay a single argument.
    configure_cmd = [
        './configure',
        f'--prefix={mpich_build_dir}',
        f'--with-libfabric={libfab_installpath_mpich}',
        '--disable-oshmem',
        '--disable-fortran',
        '--without-ch4-shmmods',
        '--with-device=ch4:ofi',
        '--without-ze',
    ]

    print("configure mpich")
    os.chdir(mpich_path)
    try:
        print(' '.join(configure_cmd))
        common.run_command(['./autogen.sh'])
        common.run_command(configure_cmd)
        common.run_command(['make', '-j'])
        common.run_command(['make', 'install'])
    finally:
        # Restore the caller's working directory even if a build step raises.
        os.chdir(cwd)

def copy_build_dir(install_path):
middlewares_path = f'{install_path}/middlewares'
if (os.path.exists(middlewares_path) != True):
Expand All @@ -102,6 +124,9 @@ def copy_build_dir(install_path):
f'{middlewares_path}/shmem')
shutil.copytree(f'{cloudbees_config.build_dir}/oneccl',
f'{middlewares_path}/oneccl')

os.symlink(f'{cloudbees_config.build_dir}/mpich',
f'{middlewares_path}/mpich')
os.symlink(f'{cloudbees_config.build_dir}/impi',
f'{middlewares_path}/impi')
os.symlink(f'{cloudbees_config.build_dir}/ompi',
Expand Down Expand Up @@ -135,7 +160,7 @@ def log_dir(install_path, release=False):
parser.add_argument('--build_item', help="build libfabric or fabtests", \
choices=['libfabric', 'libfabric_mpich', 'fabtests', \
'builddir', 'logdir', 'extract_mpich', \
'extract_impi_mpich'])
'extract_impi_mpich', 'mpich'])
parser.add_argument('--ofi_build_mode', help="select buildmode libfabric "\
"build mode", choices=['reg', 'dbg', 'dl'])
parser.add_argument('--build_cluster', help="build libfabric on specified cluster", \
Expand Down Expand Up @@ -170,6 +195,8 @@ def log_dir(install_path, release=False):
elif (build_item == 'libfabric_mpich'):
build_libfabric(f'{libfab_install_path}/libfabric_mpich',
ofi_build_mode, cluster)
elif (build_item == 'mpich'):
build_mpich(f'{libfab_install_path}/libfabric_mpich')
elif (build_item == 'fabtests'):
build_fabtests(libfab_install_path, ofi_build_mode)
elif (build_item == 'extract_mpich'):
Expand Down
6 changes: 3 additions & 3 deletions contrib/intel/jenkins/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,9 @@ def mpich_test_suite(core, hosts, mpi, mode, user_env, log_file, util, weekly=No

print('-------------------------------------------------------------------')
if (mpich_tests.execute_condn == True):
if (mpi == "mpich"):
print("Building mpich")
mpich_tests.build_mpich()
# if (mpi == "mpich"):
# print("Building mpich")
# mpich_tests.build_mpich()
print(f"Running mpichtestsuite for {core}-{util}-{fab}-{mpi}")
mpich_tests.execute_cmd()
else:
Expand Down
58 changes: 23 additions & 35 deletions contrib/intel/jenkins/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ def __init__ (self, jobname, buildno, testname, core_prov, fabric,
if (self.mpi_type == 'impi'):
self.mpi = IMPI(self.core_prov, self.hosts,
self.libfab_installpath, self.nw_interface,
self.server, self.client, self.env, self.util_prov)
self.server, self.client, self.env,
self.middlewares_path, self.util_prov)
elif (self.mpi_type == 'ompi'):
self.mpi = OMPI(self.core_prov, self.hosts,
self.libfab_installpath, self.nw_interface,
Expand Down Expand Up @@ -390,6 +391,7 @@ def __init__(self, core_prov, hosts, libfab_installpath, nw_interface,
server, client, environ, middlewares_path, util_prov=None):

self.ompi_src = f'{middlewares_path}/ompi'
self.mpichpath = None
self.core_prov = core_prov
self.hosts = hosts
self.util_prov = util_prov
Expand Down Expand Up @@ -451,8 +453,8 @@ class MPICH:
def __init__(self, core_prov, hosts, libfab_installpath, nw_interface,
server, client, environ, middlewares_path, util_prov=None):

self.mpich_dir = f'{middlewares_path}/mpich_mpichtests'
self.mpich_src = f'{self.mpich_dir}/mpich_mpichsuite'
self.mpich_dir = f'{middlewares_path}/mpich_mpichtest'
self.mpichpath = f'{self.mpich_dir}/mpich_mpichsuite'
self.core_prov = core_prov
self.hosts = hosts
self.util_prov = util_prov
Expand Down Expand Up @@ -496,14 +498,15 @@ def options(self):

@property
def cmd(self):
return f"{self.mpich_src}/bin/mpirun {self.options}"

return f"{self.mpich_dir}/bin/mpirun {self.options}"

class IMPI:
def __init__(self, core_prov, hosts, libfab_installpath, nw_interface,
server, client, environ, util_prov=None):
server, client, environ, middlewares_path, util_prov=None):

self.impi_src = f'{cloudbees_config.impi_root}'
self.mpichpath = f"{middlewares_path}/impi_mpichtest/" \
f"impi_mpichsuite/"
self.core_prov = core_prov
self.hosts = hosts
self.util_prov = util_prov
Expand Down Expand Up @@ -669,12 +672,14 @@ def osu_cmd(self, test_type, test):
print(f"Running OSU-{test_type}-{test}")
cmd = f'{self.osu_src}/{test_type}/{test} '
return cmd

def execute_cmd(self):
assert(self.osu_src)
print(self.osu_src)
p = re.compile('osu_put*')
for root, dirs, tests in os.walk(self.osu_src):
for test in tests:
print(test)
self.mpi.n = self.n_ppn[os.path.basename(root)][0]
self.mpi.ppn = self.n_ppn[os.path.basename(root)][1]

Expand All @@ -685,7 +690,10 @@ def execute_cmd(self):
osu_command = self.osu_cmd(os.path.basename(root), test)
outputcmd = shlex.split(self.mpi.env + self.mpi.cmd + \
osu_command + '\'')
print(outputcmd)
common.run_command(outputcmd)
else:
print("skipped condition")

if (test == 'osu_latency_mp' and self.core_prov == 'verbs'):
self.env.pop('IBV_FORK_SAFE')
Expand All @@ -700,9 +708,8 @@ def __init__(self, jobname, buildno, testname, core_prov, fabric,
fabric, hosts, ofi_build_mode, user_env, log_file, mpitype,
util_prov)
self.mpi_type = mpitype
self.mpichpath = f"{self.middlewares_path}/{self.mpi_type}_mpichtest/" \
f"{self.mpi_type}_mpichsuite/"
self.mpichsuitepath = f'{self.mpichpath}/test/mpi/'
if (mpitype != 'ompi'):
self.mpichsuitepath = f'{self.mpi.mpichpath}/test/mpi/'
self.pwd = os.getcwd()
self.weekly = weekly
self.mpichtests_exclude = {
Expand Down Expand Up @@ -752,34 +759,15 @@ def exclude_tests(self, test_root, provider):
else: #item[1]=test
print(f'excluding:{path}/{item[0]}')

# Configure, build and install MPICH from the sources in self.mpichpath.
# The work is skipped when {self.mpichpath}/config.log already exists.
# NOTE(review): indentation was lost in this view, so it cannot be told
# from here whether the final os.chdir(self.pwd) belongs inside the `if`.
def build_mpich(self):
# config.log is used as the marker that configure has already been run.
if (os.path.exists(f'{self.mpichpath}/config.log') !=True):
print("configure mpich")
os.chdir(self.mpichpath)
# Install prefix is {middlewares_path}/{mpi_type}_mpichtest.
configure_cmd = f"./configure " \
f"--prefix={self.middlewares_path}/{self.mpi_type}_mpichtest "
# Link against the libfabric install recorded on the MPI helper object.
configure_cmd += f"--with-libfabric={self.mpi.libfab_installpath} "
configure_cmd += "--disable-oshmem "
configure_cmd += "--disable-fortran "
configure_cmd += "--without-ch4-shmmods "
configure_cmd += "--with-device=ch4:ofi "
configure_cmd += "--without-ze "
print(configure_cmd)
# Run autogen, configure, make and make install in that order.
common.run_command(['./autogen.sh'])
common.run_command(shlex.split(configure_cmd))
common.run_command(['make','-j'])
common.run_command(['make','install'])
# Change back to the directory stored in self.pwd.
os.chdir(self.pwd)

@property
def execute_condn(self):
    """Tell whether the mpich test suite applies to this configuration.

    Returns True only when the MPI flavour is 'impi' or 'mpich' and the
    core provider is 'verbs' or 'tcp'; otherwise False.
    """
    # The provider is only consulted once the MPI flavour qualifies.
    if self.mpi_type not in ('impi', 'mpich'):
        return False
    return self.core_prov in ('verbs', 'tcp')

def execute_cmd(self):
if (self.mpi_type == 'mpich'):
configure_cmd = f"./configure --with-mpi={self.middlewares_path}/" \
f"{self.mpi_type}_mpichtest "
configure_cmd = f"./configure --with-mpi={self.mpi.mpich_dir} "
if (self.weekly):
print(f'Weekly {self.mpi_type} mpichsuite tests')
os.chdir(self.mpichsuitepath)
Expand Down Expand Up @@ -808,14 +796,14 @@ def execute_cmd(self):
os.chdir(self.pwd)
if (self.mpi_type == 'impi' and self.weekly == True):
print (f'Weekly {self.mpi_type} mpichsuite tests')
os.chdir(self.mpichpath)
os.chdir(self.mpichsuitepath)
print(self.hosts)
self.create_hostfile(f'{self.mpichpath}/hostfile',
self.create_hostfile(f'{self.mpi.mpichpath}/hostfile',
self.hosts)
os.environ["I_MPI_HYDRA_HOST_FILE"] = \
f'{self.mpichpath}/hostfile'
f'{self.mpi.mpichpath}/hostfile'
test_cmd = f"export I_MPI_HYDRA_HOST_FILE=" \
f"{self.mpichpath}/hostfile; "
f"{self.mpi.mpichpath}/hostfile; "
test_cmd += f"./test.sh --exclude lin,{self.core_prov},*,*,*,*; "
common.run_command(shlex.split(self.mpi.env + test_cmd + '\''))
common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \
Expand Down

0 comments on commit cfe5218

Please sign in to comment.