Skip to content

Commit

Permalink
intel/ci: Add code changes to enable the entire mpich test suite
Browse files Browse the repository at this point in the history
with impi and mpich.
- test.py: class MpichTestSuite modified to build and run
	   PR tests as well as weekly tests (impi & mpich)
- build.py: options and functions added for extracting
	    mpich tar files. Also added libfabric_mpich
	    option to build libfabric with options exclusive
	    for mpich (-without-ze)
- run.py: changes to calls made to build and execute tests based
	  on MpichTestSuite class.
- summary.py: summary functions modified to create summary based
	      on new log file.
- Jenkinsfile: added a prepare build stage; added a parallel build stage
	       for libfabric_mpich; increased the Jenkins timeout limit
	       for weekly tests. Running tests for tcp and verbs-rxm.

Signed-off-by: Nikhil Nanal <[email protected]>
  • Loading branch information
nikhilnanal committed Sep 15, 2023
1 parent 5cc4fc7 commit 4e535a6
Show file tree
Hide file tree
Showing 6 changed files with 248 additions and 91 deletions.
50 changes: 40 additions & 10 deletions contrib/intel/jenkins/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ properties([disableConcurrentBuilds(abortPrevious: true)])
@Field def BUILD_MODES=["reg", "dbg", "dl"]
@Field def MPI_TYPES=["impi", "mpich", "ompi"]
@Field def PYTHON_VERSION="3.9"
@Field def TIMEOUT="3600"

def run_python(version, command, output=null) {
if (output != null)
Expand All @@ -17,8 +18,9 @@ def run_python(version, command, output=null) {
}

def slurm_batch(partition, node_num, output, command) {

try {
sh """timeout 3600 sbatch --partition=${partition} -N ${node_num} \
sh """timeout $TIMEOUT sbatch --partition=${partition} -N ${node_num} \
--wait -o ${output} --open-mode=append --wrap=\'env; ${command}\'
"""
} catch (Exception e) {
Expand Down Expand Up @@ -63,6 +65,9 @@ def run_middleware(providers, stage_name, test, partition, node_num, mpi=null,
if (imb_grp)
base_cmd = "${base_cmd} --imb_grp=${imb_grp}"

if (env.WEEKLY.toBoolean())
base_cmd = "${base_cmd} --weekly=${env.WEEKLY}"

for (prov in providers) {
if (prov[1]) {
echo "Running ${prov[0]}-${prov[1]} ${stage_name}"
Expand Down Expand Up @@ -225,7 +230,7 @@ pipeline {
}
options {
timestamps()
timeout(activity: true, time: 1, unit: 'HOURS')
timeout(activity: true, time: 6, unit: 'HOURS')
}
environment {
JOB_CADENCE = 'PR'
Expand All @@ -235,7 +240,6 @@ pipeline {
RUN_LOCATION="${env.WORKSPACE}/${SCRIPT_LOCATION}/"
CUSTOM_WORKSPACE="${CB_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}"
}

stages {
stage ('opt-out') {
steps {
Expand All @@ -250,6 +254,9 @@ pipeline {
} else {
weekly = env.WEEKLY.toBoolean()
}
if (weekly) {
TIMEOUT="21600"
}
skip = skip()
RELEASE = release()
if (skip && !weekly) {
Expand All @@ -258,17 +265,26 @@ pipeline {
}
}
}
// Stage 'prepare build': sets up the build and log directories and extracts the
// MPICH test-suite sources. It is declared ahead of the 'parallel-builds' stage.
stage ('prepare build') {
// Only runs when DO_RUN is true.
when { equals expected: true, actual: DO_RUN }
steps {
script {
echo "Copying build dirs."
// build() is defined elsewhere in this Jenkinsfile; presumably it forwards the
// item name to build.py as --build_item -- TODO confirm.
build("builddir")
echo "Copying log dirs."
build("logdir", null, null, RELEASE)
// One extraction per MPI flavour whose MPICH test suite is exercised.
build("extract_mpich")
build("extract_impi_mpich")
}
}
}
stage ('parallel-builds') {
when { equals expected: true, actual: DO_RUN }
parallel {
stage ('build') {
steps {
script {
dir (CUSTOM_WORKSPACE) {
echo "Copying build dirs."
build("builddir")
echo "Copying log dirs."
build("logdir", null, null, RELEASE)
for (mode in BUILD_MODES) {
echo "Building Libfabric $mode"
build("libfabric", "$mode")
Expand All @@ -279,6 +295,21 @@ pipeline {
}
}
}
// Stage 'buildmpich-libfabric': builds the libfabric variant used by the MPICH
// tests by submitting build.py --build_item=libfabric_mpich through slurm_batch.
stage ('buildmpich-libfabric') {
steps {
script {
// Work in a dedicated 'mpich' sub-directory of the custom workspace and
// check the sources out there.
dir("${CUSTOM_WORKSPACE}/mpich"){
checkout scm
echo "Building Libfabric reg"
// slurm_batch arguments: partition list, node count, log file, command.
// NOTE(review): no --ofi_build_mode is passed even though the message above
// says "reg" -- confirm build.py falls back to the intended mode.
slurm_batch("squirtle,totodile", "1",
"${env.LOG_DIR}/libfabric_mpich_log",
"""python$PYTHON_VERSION ${RUN_LOCATION}/build.py \
--build_item=libfabric_mpich """
)
}
}
}
}
stage ('build-daos') {
agent {
node {
Expand Down Expand Up @@ -464,8 +495,7 @@ pipeline {
steps {
script {
dir (RUN_LOCATION) {
def providers = [["verbs", "rxm"], ["tcp", null],
["tcp", "rxm"], ["sockets", null]]
def providers = [['tcp', null], ["verbs","rxm"]]
for (mpi in MPI_TYPES) {
run_middleware(providers, "mpichtestsuite", "mpichtestsuite",
"squirtle,totodile", "2", "${mpi}")
Expand Down Expand Up @@ -700,4 +730,4 @@ pipeline {
dir("${env.WORKSPACE}@tmp") { deleteDir() }
}
}
}
}
48 changes: 37 additions & 11 deletions contrib/intel/jenkins/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def build_libfabric(libfab_install_path, mode, cluster=None, ucx=None):
for op in common.common_disable_list:
config_cmd.append(f'--enable-{op}=no')

if (cluster == 'default'):
if (cluster == 'default' and build_item != 'libfabric_mpich'):
for op in common.default_enable_list:
config_cmd.append(f'--enable-{op}')

Expand Down Expand Up @@ -69,6 +69,30 @@ def build_fabtests(libfab_install_path, mode):
common.run_command(['make', '-j32'])
common.run_command(['make', 'install'])

def extract_mpich(mpitype):
    """Unpack the MPICH test-suite tarball for ``mpitype`` into the install tree.

    The archive is extracted (with its top-level directory stripped) into
    ``<install_path>/middlewares/<mpitype>_mpichtest/<mpitype>_mpichsuite``.
    Any previous extraction at that destination is removed first.

    Args:
        mpitype: ``'mpich'`` or ``'impi'``; selects the source directory and
            tarball name from ``cloudbees_config``.

    Exits the process with status -1 when ``mpitype`` is not recognised.
    """
    dest = f'{install_path}/middlewares/{mpitype}_mpichtest'
    if (mpitype == 'mpich'):
        src_dir = 'mpich'
        mpich_tar = cloudbees_config.mpich_tar
    elif (mpitype == 'impi'):
        src_dir = 'impi_mpichtest'
        mpich_tar = cloudbees_config.impi_mpichtest_tar
    else:
        print(f"Invalid mpi type {mpitype}")
        sys.exit(-1)

    suite_dir = f'{dest}/{mpitype}_mpichsuite'
    tar_dir = f'{cloudbees_config.scm_dir}/{src_dir}'

    cwd = os.getcwd()
    # Start from a clean destination so stale files from an earlier run
    # cannot leak into this extraction.
    if (os.path.exists(dest)):
        shutil.rmtree(dest)
    os.makedirs(suite_dir)
    os.chdir(f'{tar_dir}/')
    try:
        common.run_command(['tar', '-xvf',
                            f"{tar_dir}/{mpich_tar}",
                            '-C', suite_dir,
                            '--strip-components', '1'])
    finally:
        # Restore the caller's working directory even if extraction fails,
        # so later build steps do not run from the tarball directory.
        os.chdir(cwd)

def copy_build_dir(install_path):
middlewares_path = f'{install_path}/middlewares'
if (os.path.exists(middlewares_path) != True):
Expand All @@ -78,9 +102,6 @@ def copy_build_dir(install_path):
f'{middlewares_path}/shmem')
shutil.copytree(f'{cloudbees_config.build_dir}/oneccl',
f'{middlewares_path}/oneccl')

os.symlink(f'{cloudbees_config.build_dir}/mpich',
f'{middlewares_path}/mpich')
os.symlink(f'{cloudbees_config.build_dir}/impi',
f'{middlewares_path}/impi')
os.symlink(f'{cloudbees_config.build_dir}/ompi',
Expand Down Expand Up @@ -111,12 +132,12 @@ def log_dir(install_path, release=False):
workspace = os.environ['WORKSPACE']

parser = argparse.ArgumentParser()
parser.add_argument('--build_item', help="build libfabric or fabtests",
choices=['libfabric', 'fabtests', 'builddir', 'logdir'])

parser.add_argument('--build_item', help="build libfabric or fabtests", \
choices=['libfabric', 'libfabric_mpich', 'fabtests', \
'builddir', 'logdir', 'extract_mpich', \
'extract_impi_mpich'])
parser.add_argument('--ofi_build_mode', help="select buildmode libfabric "\
"build mode", choices=['reg', 'dbg', 'dl'])

parser.add_argument('--build_cluster', help="build libfabric on specified cluster", \
choices=['daos', 'gpu'], default='default')
parser.add_argument('--release', help="This job is likely testing a "\
Expand Down Expand Up @@ -145,11 +166,16 @@ def log_dir(install_path, release=False):
p = re.compile('mpi*')

if (build_item == 'libfabric'):
build_libfabric(libfab_install_path, ofi_build_mode, cluster, ucx)

build_libfabric(libfab_install_path, ofi_build_mode, cluster, ucx)
elif (build_item == 'libfabric_mpich'):
build_libfabric(f'{libfab_install_path}/libfabric_mpich',
ofi_build_mode, cluster)
elif (build_item == 'fabtests'):
build_fabtests(libfab_install_path, ofi_build_mode)

elif (build_item == 'extract_mpich'):
extract_mpich('mpich')
elif (build_item == 'extract_impi_mpich'):
extract_mpich('impi')
elif (build_item == 'builddir'):
copy_build_dir(install_path)

Expand Down
12 changes: 8 additions & 4 deletions contrib/intel/jenkins/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,18 +126,22 @@ def intel_mpi_benchmark(core, hosts, mpi, mode, group, user_env, log_file, util)
print(f"Skipping {mpi.upper} {imb.testname} as execute condition fails")
print('-------------------------------------------------------------------')

def mpich_test_suite(core, hosts, mpi, mode, user_env, log_file, util):
def mpich_test_suite(core, hosts, mpi, mode, user_env, log_file, util, weekly=None):

mpich_tests = tests.MpichTestSuite(jobname=jbname,buildno=bno,
testname="MpichTestSuite",core_prov=core,
fabric=fab, mpitype=mpi, hosts=hosts,
ofi_build_mode=mode, user_env=user_env,
log_file=log_file, util_prov=util)
log_file=log_file, util_prov=util,
weekly=weekly)

print('-------------------------------------------------------------------')
if (mpich_tests.execute_condn == True):
print(f"Running mpichtestsuite: Spawn Tests for {core}-{util}-{fab}-{mpi}")
mpich_tests.execute_cmd("spawn")
if (mpi == "mpich"):
print("Building mpich")
mpich_tests.build_mpich()
print(f"Running mpichtestsuite for {core}-{util}-{fab}-{mpi}")
mpich_tests.execute_cmd()
else:
print(f"Skipping {mpi.upper()} {mpich_tests.testname} as exec condn fails")
print('-------------------------------------------------------------------')
Expand Down
4 changes: 3 additions & 1 deletion contrib/intel/jenkins/runtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def __call__(self, parser, namespace, values, option_string=None):
choices=['impi', 'mpich', 'ompi'], default='impi')
parser.add_argument('--log_file', help="Full path to log file",
default=os.environ['DEFAULT_LOG_LOCATION'], type=str)
# argparse's type=bool calls bool() on the raw string, so every non-empty value
# (including "False" and "0") would enable weekly mode. Parse the text instead.
parser.add_argument('--weekly', help="run weekly", default=False,
                    type=lambda value: str(value).strip().lower()
                    in ('1', 'true', 'yes', 'y', 'on'))

args = parser.parse_args()
args_core = args.prov
Expand All @@ -45,6 +46,7 @@ def __call__(self, parser, namespace, values, option_string=None):
args_device = args.device
user_env = args.user_env
log_file = args.log_file
weekly = args.weekly

if (args.ofi_build_mode):
ofi_build_mode = args.ofi_build_mode
Expand Down Expand Up @@ -131,7 +133,7 @@ def __call__(self, parser, namespace, values, option_string=None):
if (run_test == 'all' or run_test == 'mpichtestsuite'):
run.mpich_test_suite(args_core, hosts, mpi,
ofi_build_mode, user_env, log_file,
args_util)
args_util, weekly)

if (run_test == 'all' or run_test == 'IMB'):
run.intel_mpi_benchmark(args_core, hosts, mpi,
Expand Down
44 changes: 29 additions & 15 deletions contrib/intel/jenkins/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,27 +530,41 @@ def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name):
super().__init__(logger, log_dir, prov, file_name, stage_name)

self.mpi = mpi
if self.mpi == 'impi':
self.run = '/mpiexec'
else:
self.run = '/mpirun'
self.run = 'mpiexec'

# Walk the log at self.file_path line by line and feed each line to the
# parent-class checks (check_features, check_node, check_line).
def read_file(self):
# The line from the prior iteration; passed to check_features with the current one.
previous = ""
with open(self.file_path,'r') as log_file:
for line in log_file:
# Normalise case and surrounding whitespace before any check sees the line.
line = line.lower().strip()
super().check_features(previous, line)
super().check_node(line)
super().check_line(line)
previous = line

def check_exclude(self, line):
    """Record a test reported as excluded.

    A line that starts with ``excluding:`` increments ``self.excludes`` and
    appends the text after the last colon to ``self.excluded_tests``. That
    text is stripped so the stored name carries no surrounding whitespace.
    Any other line is ignored.
    """
    if line.startswith('excluding:'):
        test = line.split(':')[-1].strip()
        self.excludes += 1
        self.excluded_tests.append(test)

def check_name(self, line):
if self.run in line:
self.name = line.split()[len(line.split()) - 1].split('/')[1]
#assume pass
if (line.startswith('ok') or
line.startswith('not ok')):
self.name = line.split('-')[1].split('#')[0].strip()

def check_pass(self, line):
    """Count a passing result line.

    A line counts as a pass when it starts with ``ok`` and the text after
    its first ``#`` does not start with ``skip``. A line with no ``#`` at
    all is treated as having no skip marker, instead of raising
    ``IndexError`` when indexing the split result.

    On a pass, ``self.passes`` is incremented and the current ``self.name``
    is appended to ``self.passed_tests``.
    """
    if not line.startswith('ok'):
        return
    # partition() returns an empty trailer when there is no '#', which
    # correctly reads as "not skipped".
    _, _, directive = line.partition('#')
    if directive.strip().startswith('skip'):
        return
    self.passes += 1
    self.passed_tests.append(self.name)

def check_fail(self, line):
# Fail cases take away assumed pass
if "exiting with" in line:
if (line.startswith('not ok') and not
line.split('#')[1].strip().startswith('skip')):
self.fails += 1
self.passes -= 1
self.failed_tests.append(f'{self.name}')
#skip to next test
while self.run not in line:
line = self.log.readline().lower()
self.failed_tests.append(self.name)


class ImbSummarizer(Summarizer):
def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name):
Expand Down Expand Up @@ -806,7 +820,7 @@ def summarize_items(summary_item, logger, log_dir, mode):

if summary_item == 'mpichtestsuite' or summary_item == 'all':
for mpi in mpi_list:
for item in ['tcp-rxm', 'verbs-rxm', 'sockets', 'tcp']:
for item in ['tcp', 'verbs-rxm']:
ret = MpichTestSuiteSummarizer(
logger, log_dir, item, mpi,
f'mpichtestsuite_{item}_{mpi}_'\
Expand Down
Loading

0 comments on commit 4e535a6

Please sign in to comment.