From a58ce1446315f38d352f2171a59ef72262e15cbd Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Wed, 15 Nov 2023 15:50:25 -0800 Subject: [PATCH] contrib/intel: synchronize with main New CI uses slurm instead of Jenkins node allocation. Update branch to new scripts but remove invalid tests: - DSA (not supported in 1.16) - shm ZE testing (shm does not support full test suite in 1.16) - OneCCL tcp (function not supported in 1.16) - mpich test suite failing tests (were failing before but not caught properly because no summary stage). This is a small regression in testing but reduces testing and failures on 1.16.x which is not regularly maintained Also changes name of dmabuf test which was changed after 1.16 from fi-rdmabw-xe Signed-off-by: Alexia Ingerson --- contrib/intel/jenkins/Jenkinsfile | 950 +++++++++++++++--------------- contrib/intel/jenkins/build.py | 279 +++++---- contrib/intel/jenkins/common.py | 105 +++- contrib/intel/jenkins/run.py | 126 ++-- contrib/intel/jenkins/runtests.py | 156 +++-- contrib/intel/jenkins/summary.py | 468 ++++++++++++--- contrib/intel/jenkins/tests.py | 840 ++++++++++++++++---------- 7 files changed, 1793 insertions(+), 1131 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index df3a7cb7bf2..b0a66ad4936 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -1,9 +1,208 @@ +import groovy.transform.Field properties([disableConcurrentBuilds(abortPrevious: true)]) -def DO_RUN=1 -def TARGET="main" -def SCRIPT_LOCATION="py_scripts/contrib/intel/jenkins" -def RELEASE=0 +@Field def DO_RUN=true +@Field def TARGET="main" +@Field def SCRIPT_LOCATION="py_scripts/contrib/intel/jenkins" +@Field def RELEASE=false +@Field def BUILD_MODES=["reg", "dbg", "dl"] +@Field def PYTHON_VERSION="3.9" +@Field def TIMEOUT="3600" + +def run_python(version, command, output=null) { + if (output != null) + sh "python$version $command >> $output" + else + sh "python$version $command" +} + +def slurm_batch(partition, node_num, output, command) { + + try { + sh """timeout $TIMEOUT sbatch --partition=${partition} -N ${node_num} \ + --wait -o ${output} --open-mode=append --wrap=\'env; ${command}\' + """ + } catch (Exception e) { + sh "scancel \$(cat ${output} | grep SLURM_JOBID | cut -d \"=\" -f 2)" + sh "cat ${output}" + error("Build failed ${e}") + } + sh "cat ${output}" +} + +def run_fabtests(stage_name, partition, node_num, prov, util=null, + user_env=null, way=null) { + def command = "python3.9 ${RUN_LOCATION}/runtests.py" + def opts = "--prov=${prov} --test=fabtests" + def modes = BUILD_MODES + if (util) + opts = "${opts} --util=${util}" + + if (user_env) + opts = "${opts} --user_env ${user_env}" + + if (way) { + opts = "${opts} --way ${way}" + modes = ["reg"] + } + + for (mode in modes) { + echo "Running $stage_name fabtests $mode" + slurm_batch("${partition}", "${node_num}", + "${env.LOG_DIR}/${stage_name}_fabtests_${mode}", + "${command} ${opts} --ofi_build_mode=${mode}") + } + + echo "${stage_name} completed." 
+} + +def run_middleware(providers, stage_name, test, partition, node_num, mpi=null, + imb_grp=null) { + def base_cmd = "python3.9 ${RUN_LOCATION}/runtests.py --test=${test}" + def opts = "" + def prefix = "${env.LOG_DIR}/${stage_name}_" + def suffix = "_${test}_reg" + if (mpi) { + base_cmd = "${base_cmd} --mpi=${mpi}" + suffix = "_${mpi}${suffix}" + } + + if (imb_grp) + base_cmd = "${base_cmd} --imb_grp=${imb_grp}" + + if (env.WEEKLY.toBoolean()) + base_cmd = "${base_cmd} --weekly=${env.WEEKLY}" + + for (prov in providers) { + if (prov[1]) { + echo "Running ${prov[0]}-${prov[1]} ${stage_name}" + opts = "--prov=${prov[0]} --util=${prov[1]}" + output = "${prefix}${prov[0]}-${prov[1]}${suffix}" + } else { + echo "Running ${prov[0]} ${stage_name}" + opts = "--prov=${prov[0]}" + output = "${prefix}${prov[0]}${suffix}" + } + + slurm_batch("${partition}", "${node_num}", "${output}", + "${base_cmd} ${opts}") + } +} + +def gather_logs(cluster, key, dest, source) { + def address = "${env.USER}@${cluster}" + + try { + sh "scp -i ${key} ${address}:${source}/* ${dest}/" + } catch (Exception e) { + echo "Caught exception ${e} when transfering files from ${cluster}" + } +} + +def summarize(item, verbose=false, release=false, send_mail=false) { + def cmd = "${RUN_LOCATION}/summary.py --summary_item=all" + if (verbose) { + cmd = "${cmd} -v " + } + if (release) { + cmd = "${cmd} --release " + } + if (send_mail.toBoolean()) { + cmd = "${cmd} --send_mail " + } + + run_python(PYTHON_VERSION, cmd) +} + +def save_summary() { + sh """ + mkdir -p ${env.WORKSPACE}/internal + rm -rf ${env.WORKSPACE}/internal/* + git clone https://${env.PAT}@github.com/${env.INTERNAL} ${env.WORKSPACE}/internal + cd ${env.WORKSPACE}/internal + mkdir -p ${env.WORKSPACE}/internal/summaries + cp ${env.WORKSPACE}/summary_*.log ${env.WORKSPACE}/internal/summaries/ + git add ${env.WORKSPACE}/internal/summaries/ + git commit -am \"add ${env.JOB_NAME}'s summary\" + git pull -r origin master + git push origin master + """ +} + +def checkout_py_scripts() { + sh """ + if [[ ! -d ${env.WORKSPACE}/py_scripts ]]; then + mkdir ${env.WORKSPACE}/py_scripts + else + rm -rf ${env.WORKSPACE}/py_scripts && mkdir ${env.WORKSPACE}/py_scripts + fi + + git clone --branch ${TARGET} ${env.UPSTREAM} ${env.WORKSPACE}/py_scripts + """ +} + +def checkout_ci_resources() { + sh """ + if [[ ! 
-d ${env.WORKSPACE}/py_scripts ]]; then + mkdir ${env.WORKSPACE}/ci_resources + else + rm -rf ${env.WORKSPACE}/ci_resources && mkdir ${env.WORKSPACE}/ci_resources + fi + + git clone ${env.CI_RESOURCES} ${env.WORKSPACE}/ci_resources + + """ +} + +def checkout_external_resources() { + checkout_ci_resources() + checkout_py_scripts() +} + +def generate_diff(def branch_name, def output_loc) { + sh """ + git remote add mainRepo ${env.UPSTREAM} + git fetch mainRepo + git diff --name-only HEAD..mainRepo/${branch_name} > ${output_loc}/commit_id + git remote remove mainRepo + """ +} + +def generate_release_num(def branch_name, def output_loc) { + sh """ + git remote add mainRepo ${env.UPSTREAM} + git fetch mainRepo + git diff mainRepo/${branch_name}:Makefile.am Makefile.am > \ + ${output_loc}/Makefile.am.diff + git diff mainRepo/${branch_name}:configure.ac configure.ac > \ + ${output_loc}/configure.ac.diff + cat configure.ac | grep AC_INIT | cut -d ' ' -f 2 | \ + cut -d '[' -f 2 | cut -d ']' -f 1 > ${output_loc}/release_num.txt + git remote remove mainRepo + """ +} + +def build(item, mode=null, cluster=null, release=false, additional_args=null) { + def cmd = "${RUN_LOCATION}/build.py --build_item=${item}" + if (mode) { + cmd = "${cmd} --ofi_build_mode=${mode} " + } + + if (cluster) { + cmd = "${cmd} --build_cluster=${cluster} " + } + + if (release) { + cmd = "${cmd} --release " + } + + if (additional_args) { + cmd = "${cmd} ${additional_args} " + } + + run_python(PYTHON_VERSION, cmd) +} + def check_target() { echo "CHANGE_TARGET = ${env.CHANGE_TARGET}" if (changeRequest()) { @@ -34,10 +233,10 @@ def release() { if ((changeStrings.toArray().any { it =~ /(Makefile\.am)\b/ }) || (changeStrings.toArray().any { it =~ /(configure\.ac)\b/ })) { echo "This is probably a release" - return 1 + return true } - return 0 + return false } def skip() { @@ -57,568 +256,369 @@ def skip() { echo "Changeset is: ${changeStrings.toArray()}" if (changeStrings.toArray().every { it =~ /(?:fabtests\/pytests|man|prov\/efa|prov\/opx).*$/ }) { echo "DONT RUN!" - return 0 + return true } if (changeStrings.isEmpty()) { echo "DONT RUN!" 
- return 0 + return true } - return 1 + return false } pipeline { agent { node { - label 'master' - customWorkspace "${JENKINS_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" + label 'main' + customWorkspace "workspace/${JOB_NAME}/${env.BUILD_NUMBER}" } } options { timestamps() - timeout(activity: true, time: 1, unit: 'HOURS') + timeout(activity: true, time: 6, unit: 'HOURS') } environment { JOB_CADENCE = 'PR' + LOG_DIR = "${env.JOB_INSTALL_DIR}/${env.JOB_NAME}/${env.BUILD_NUMBER}/log_dir" + WITH_ENV="'PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH'" + DELETE_LOCATION="${env.JOB_INSTALL_DIR}/${env.JOB_NAME}/${env.BUILD_NUMBER}" + RUN_LOCATION="${env.WORKSPACE}/${SCRIPT_LOCATION}/" + CUSTOM_WORKSPACE="${CB_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" } - stages { stage ('opt-out') { steps { script { TARGET=check_target() + checkout_external_resources() + generate_diff("${TARGET}", "${env.WORKSPACE}") + generate_release_num("${TARGET}", "${env.WORKSPACE}") + + if (env.WEEKLY == null) { + weekly = false + } else { + weekly = env.WEEKLY.toBoolean() + } + if (weekly) { + TIMEOUT="21600" + } + skip = skip() + RELEASE = release() + if (skip && !weekly) { + DO_RUN=false + } } - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - mkdir ${env.WORKSPACE}/py_scripts - git clone --branch ${TARGET} ${env.UPSTREAM} ${env.WORKSPACE}/py_scripts - ${env.SKIP_PATH}/skip.sh ${env.WORKSPACE} ${TARGET} - ${env.SKIP_PATH}/release.sh ${env.WORKSPACE} ${TARGET} - """ - } + } + } + stage ('prepare build') { + when { equals expected: true, actual: DO_RUN } + steps { script { - DO_RUN=skip() - RELEASE=release() + echo "Copying build dirs." + build("builddir") + echo "Copying log dirs." + build("logdir", null, null, RELEASE) + build("extract_mpich") + build("extract_impi_mpich") } } } stage ('parallel-builds') { - when { equals expected: 1, actual: DO_RUN } + when { equals expected: true, actual: DO_RUN } parallel { stage ('build') { steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - echo "-----------------------------------------------------" - echo "Copy build dirs." - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=builddir - echo "Copy build dirs completed." - echo "-----------------------------------------------------" - echo "Copy log dirs." - if [[ $RELEASE -eq 1 ]]; then - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=logdir --release - else - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=logdir - fi - echo "Copy log dirs completed." - echo "-----------------------------------------------------" - echo "Building libfabric reg." - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=libfabric - echo "-----------------------------------------------------" - echo "Building libfabric dbg." - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=libfabric --ofi_build_mode=dbg - echo "-----------------------------------------------------" - echo "Building libfabric dl." - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=libfabric --ofi_build_mode=dl - echo "Libfabric builds completed." - echo "-----------------------------------------------------" - echo "Building fabtests reg." - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=fabtests - echo "-----------------------------------------------------" - echo "Building fabtests dbg." 
- python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=fabtests --ofi_build_mode=dbg - echo "-----------------------------------------------------" - echo "Building fabtests dl." - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=fabtests --ofi_build_mode=dl - echo 'Fabtests builds completed.' - """ + script { + dir (CUSTOM_WORKSPACE) { + for (mode in BUILD_MODES) { + echo "Building Libfabric $mode" + build("libfabric", "$mode") + echo "Building Fabtests $mode" + build("fabtests", "$mode") + } + } + } + } + } + stage ('buildmpich-libfabric') { + steps { + script { + dir("${CUSTOM_WORKSPACE}/mpich"){ + checkout scm + echo "Building Libfabric reg" + slurm_batch("squirtle,totodile", "1", + "${env.LOG_DIR}/libfabric_mpich_log", + """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ + --build_item=libfabric_mpich """ + ) + slurm_batch("squirtle,totodile", "1", + "${env.LOG_DIR}/build_mpich_log", + """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ + --build_item=mpich """ + ) + } } } } stage ('build-daos') { agent { node { - label 'daos' - customWorkspace "${JENKINS_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" + label 'daos_head' + customWorkspace CUSTOM_WORKSPACE } } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - if [[ ! -d ${env.WORKSPACE}/py_scripts ]]; then - mkdir ${env.WORKSPACE}/py_scripts - else - rm -rf ${env.WORKSPACE}/py_scripts && mkdir ${env.WORKSPACE}/py_scripts - fi - - git clone --branch ${TARGET} ${env.UPSTREAM} ${env.WORKSPACE}/py_scripts - - echo "Copy log dirs." - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=logdir - echo "Copy log dirs completed." - echo "-----------------------------------------------------" - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=libfabric --build_cluster='daos' - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=fabtests - ) - """ + script { + checkout_external_resources() + dir (CUSTOM_WORKSPACE) { + build("logdir") + build("libfabric", "reg", "daos") + build("fabtests", "reg") + } } } } - /*stage ('build-dsa') { + stage ('build-gpu') { agent { node { - label 'dsa' - customWorkspace "${JENKINS_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" + label 'ze' + customWorkspace CUSTOM_WORKSPACE } } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - if [[ ! 
-d ${env.WORKSPACE}/py_scripts ]]; then - mkdir ${env.WORKSPACE}/py_scripts - else - rm -rf ${env.WORKSPACE}/py_scripts && mkdir ${env.WORKSPACE}/py_scripts - fi - git clone --branch ${TARGET} ${env.UPSTREAM} ${env.WORKSPACE}/py_scripts - python3.9 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=logdir - python3.9 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=libfabric --build_cluster='dsa' - python3.9 ${env.WORKSPACE}/${SCRIPT_LOCATION}/build.py --build_item=fabtests - ) - """ + script { + checkout_external_resources() + dir (CUSTOM_WORKSPACE) { + build("logdir") + build("builddir") + build("libfabric", "reg", "gpu") + build("fabtests", "reg") + } } } - }*/ + } } } stage('parallel-tests') { - when { equals expected: 1, actual: DO_RUN } + when { equals expected: true, actual: DO_RUN } parallel { - stage('MPI_verbs-rxm') { - agent { node { label 'mlx5' } } - options { skipDefaultCheckout() } + stage('MPI_verbs-rxm_IMB') { steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=verbs --util=rxm --test=IMB --imb_grp=1 - echo "IMB verbs-rxm Group 1 completed." - python3.7 runtests.py --prov=verbs --util=rxm --test=IMB --imb_grp=2 - echo "IMB verbs-rxm Group 2 completed." - python3.7 runtests.py --prov=verbs --util=rxm --test=IMB --imb_grp=3 - echo "IMB verbs-rxm Group 3 completed." - python3.7 runtests.py --prov=verbs --util=rxm --test=osu - echo "OSU verbs-rxm completed." - echo "MPI-verbs-rxm completed." - ) - """ + script { + dir (RUN_LOCATION) { + def providers = [["verbs", "rxm"]] + for (def mpi in ["impi"]) { + for (imb_grp = 1; imb_grp < 4; imb_grp++) { + run_middleware(providers, "MPI", "IMB", + "squirtle,totodile", "2", "${mpi}", + "${imb_grp}") + } + } + } } } } - stage('MPI_tcp-rxm-2') { - agent { node { label 'cvl' } } - options { skipDefaultCheckout() } + stage('MPI_verbs-rxm_OSU') { + steps { + script { + dir (RUN_LOCATION) { + def providers = [["verbs", "rxm"]] + for (def mpi in ["impi", "mpich"]) { + run_middleware(providers, "MPI", "osu", "squirtle,totodile", + "2", "${mpi}") + } + } + } + } + } + stage('MPI_tcp') { steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=tcp --util=rxm --test=IMB --imb_grp=2 - echo "MPI-tcp-rxm-2 completed." - ) - """ + script { + dir (RUN_LOCATION) { + def providers = [["tcp", null]] + for (imb_grp = 1; imb_grp < 4; imb_grp++) { + run_middleware(providers, "MPI", "IMB", + "bulbasaur", "2", "impi", "${imb_grp}") + } + for (def mpi in ["impi", "mpich"]) { + run_middleware(providers, "MPI", "osu", "bulbasaur", "2", + "${mpi}") + } + } } } } stage('tcp') { - agent { node { label 'cvl' } } - options { skipDefaultCheckout() } - steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=tcp --test=fabtests - python3.7 runtests.py --prov=tcp --test=fabtests --ofi_build_mode=dbg - python3.7 runtests.py --prov=tcp --test=fabtests --ofi_build_mode=dl - echo "tcp completed." 
- ) - """ + steps { + script { + dir (RUN_LOCATION) { + run_fabtests("tcp", "bulbasaur", "2", "tcp") + } } } } stage('verbs-rxm') { - agent { node { label 'mlx5' } } - options { skipDefaultCheckout() } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=verbs --util=rxm --test=fabtests - python3.7 runtests.py --prov=verbs --util=rxm --test=fabtests --ofi_build_mode=dbg - python3.7 runtests.py --prov=verbs --util=rxm --test=fabtests --ofi_build_mode=dl - echo "verbs-rxm completed." - ) - """ + script { + dir (RUN_LOCATION) { + run_fabtests("verbs-rxm", "squirtle,totodile", "2", "verbs", + "rxm") + run_fabtests("verbs-rxm", "squirtle,totodile", "2", "verbs", + "rxm", "FI_MR_CACHE_MAX_COUNT=0") + run_fabtests("verbs-rxm", "squirtle,totodile", "2", "verbs", + "rxm", "FI_MR_CACHE_MONITOR=userfaultfd") + } } } } stage('verbs-rxd') { - agent { node { label 'mlx5 && edr' } } - options { skipDefaultCheckout() } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=verbs --util=rxd --test=fabtests - python3.7 runtests.py --prov=verbs --util=rxd --test=fabtests --ofi_build_mode=dbg - python3.7 runtests.py --prov=verbs --util=rxd --test=fabtests --ofi_build_mode=dl - echo "verbs-rxd completed." - ) - """ + script { + dir (RUN_LOCATION) { + run_fabtests("verbs-rxd", "squirtle", "2", "verbs", + "rxd") + run_fabtests("verbs-rxd", "squirtle", "2", "verbs", + "rxd", "FI_MR_CACHE_MAX_COUNT=0") + run_fabtests("verbs-rxd", "squirtle", "2", "verbs", + "rxd", "FI_MR_CACHE_MONITOR=userfaultfd") + } } } } stage('udp') { - agent { node { label 'cvl' } } - options { skipDefaultCheckout() } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=udp --test=fabtests - python3.7 runtests.py --prov=udp --test=fabtests --ofi_build_mode=dbg - python3.7 runtests.py --prov=udp --test=fabtests --ofi_build_mode=dl - echo "udp completed." - ) - """ + script { + dir (RUN_LOCATION) { + run_fabtests("udp", "bulbasaur", "2", "udp") + } } } } stage('shm') { - agent { node { label 'cvl || mlx5' } } - options { skipDefaultCheckout() } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=shm --test=fabtests - python3.7 runtests.py --prov=shm --test=fabtests --ofi_build_mode=dbg - python3.7 runtests.py --prov=shm --test=fabtests --ofi_build_mode=dl - python3.7 runtests.py --prov=shm --test=fabtests --user_env="{'FI_SHM_DISABLE_CMA':1}" - python3.7 runtests.py --prov=shm --test=fabtests --ofi_build_mode=dbg --user_env="{'FI_SHM_DISABLE_CMA':1}" - python3.7 runtests.py --prov=shm --test=fabtests --ofi_build_mode=dl --user_env="{'FI_SHM_DISABLE_CMA':1}" - echo "shm completed." 
- ) - """ + script { + dir (RUN_LOCATION) { + run_fabtests("shm", "bulbasaur", "1", "shm") + run_fabtests("shm", "bulbasaur", "1", "shm", null, + "FI_SHM_DISABLE_CMA=1") + } } } } stage('sockets') { - agent { node { label 'cvl || mlx5' } } - options { skipDefaultCheckout() } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=sockets --test=fabtests - python3.7 runtests.py --prov=sockets --test=fabtests --ofi_build_mode=dbg - python3.7 runtests.py --prov=sockets --test=fabtests --ofi_build_mode=dl - echo "sockets completed." - ) - """ + script { + dir (RUN_LOCATION) { + run_fabtests("sockets", "bulbasaur", "2", "sockets") + } } } } stage('psm3') { - agent { node { label 'mlx5 && edr' } } - options { skipDefaultCheckout() } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - export PSM3_IDENTIFY=1 - export FI_LOG_LEVEL=info - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=psm3 --test=fabtests - python3.7 runtests.py --prov=psm3 --test=fabtests --ofi_build_mode=dbg - python3.7 runtests.py --prov=psm3 --test=fabtests --ofi_build_mode=dl - echo "psm3 completed." - ) - """ - } - } - } - stage('MPI_tcp-rxm-1') { - agent { node { label 'cvl' } } - options { skipDefaultCheckout() } - steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=tcp --util=rxm --test=IMB --imb_grp=1 - echo "MPI-tcp-rxm-1 completed." - ) - """ - } - } - } - stage('MPI_tcp-rxm-3') { - agent { node { label 'cvl' } } - options { skipDefaultCheckout() } - steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=tcp --util=rxm --test=IMB --imb_grp=3 - echo "MPI-tcp-rxm-3 completed." - python3.7 runtests.py --prov=tcp --util=rxm --test=osu - echo "OSU verbs-rxm completed." - ) - """ - } - } - } - stage('MPICH testsuite') { - agent { node { label 'mlx5' } } - options { skipDefaultCheckout() } - steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=verbs --util=rxm --test=mpichtestsuite - echo "verbs-rxm MPICH testsuite completed." - python3.7 runtests.py --prov=tcp --test=mpichtestsuite - echo "tcp MPICH testsuite completed." - python3.7 runtests.py --prov=tcp --util=rxm --test=mpichtestsuite - echo "tcp-rxm MPICH testsuite completed." - python3.7 runtests.py --prov=sockets --test=mpichtestsuite - echo "sockets MPICH testsuite completed." - echo "MPICH testsuite completed." - ) - """ + script { + dir (RUN_LOCATION) { + run_fabtests("psm3", "squirtle", "2", "psm3", null, + "PSM3_IDENTIFY=1") + } } } } stage('SHMEM') { - agent { node { label 'mlx5' } } - options { skipDefaultCheckout() } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=tcp --test=shmem - echo "SHMEM tcp completed." - python3.7 runtests.py --prov=verbs --test=shmem - echo "SHMEM verbs completed." - python3.7 runtests.py --prov=sockets --test=shmem - echo "SHMEM sockets completed." - echo "SHMEM completed." 
- ) - """ + script { + dir (RUN_LOCATION) { + run_middleware([["verbs", null], ["tcp", null], + ["sockets", null]], "SHMEM", "shmem", + "squirtle,totodile", "2") + } } } } - stage('multinode_performance') { - agent { node { label 'cvl' } } - options { skipDefaultCheckout() } + stage ('multinode_performance') { steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) - { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=tcp --test=multinode - echo "multinode tcp performance completed." - echo "multinode performance completed." - ) - """ + script { + dir (RUN_LOCATION) { + run_middleware([["tcp", null]], "multinode_performance", + "multinode", "bulbasaur", "2") } + } } } - stage('oneCCL') { - agent { node { label 'cvl || mlx5' } } + stage ('oneCCL-GPU-v3') { + agent { node { label 'ze' } } options { skipDefaultCheckout() } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=tcp --util=rxm --test=oneccl - echo "oneCCL tcp-rxm completed." - python3.7 runtests.py --prov=tcp --test=oneccl - echo "oneCCL tcp completed." - python3.7 runtests.py --prov=psm3 --test=oneccl - echo "oneCCL psm3 completed." - echo "OneCCL completed." - ) - """ + script { + dir (RUN_LOCATION) { + run_middleware([["verbs", "rxm"]], "oneCCL-GPU-v3", "onecclgpu", + "fabrics-ci", "2") + } } } } stage('daos_tcp') { - agent { - node { - label 'daos' - customWorkspace "${JENKINS_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" - } - } + agent { node { label 'daos_tcp' } } options { skipDefaultCheckout() } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - echo `hostname` - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov='tcp' --util='rxm' --test=daos - echo "daos-tcp test completed." - ) - """ + script { + dir (RUN_LOCATION) { + run_python(PYTHON_VERSION, + """runtests.py --prov='tcp' --util='rxm' \ + --test=daos \ + --log_file=${env.LOG_DIR}/daos_tcp-rxm_reg""") + } } } } - stage('daos_verbs') { - agent { - node { - label 'daos' - customWorkspace "${JENKINS_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" - } - } + stage('daos_verbs') { + agent { node { label 'daos_verbs' } } options { skipDefaultCheckout() } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - echo `hostname` - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov='verbs' --util='rxm' --test=daos - echo "daos-verbs test completed." - ) - """ + script { + dir (RUN_LOCATION) { + run_python(PYTHON_VERSION, + """runtests.py --prov='verbs' --util='rxm' \ + --test=daos \ + --log_file=${env.LOG_DIR}/daos_verbs-rxm_reg""") + } } } } - stage('ze-shm') { - agent {node {label 'ats'}} + stage ('DMABUF-Tests') { + agent { node { label 'ze' } } options { skipDefaultCheckout() } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.7 runtests.py --prov=shm --device='ze' - echo "ze-shm completed." 
- ) - """ + script { + dir ("${env.WORKSPACE}/${SCRIPT_LOCATION}/") { + dmabuf_output = "${LOG_DIR}/DMABUF-Tests_verbs-rxm_dmabuf" + cmd = """ python3.9 runtests.py --test=dmabuf \ + --prov=verbs --util=rxm""" + slurm_batch("fabrics-ci", "1", "${dmabuf_output}_1_reg", + "${cmd}") + slurm_batch("fabrics-ci", "2", "${dmabuf_output}_2_reg", + "${cmd}") + } } } } - /*stage('dsa') { - agent { - node { - label 'dsa' - customWorkspace "${JENKINS_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" - } - } - when { equals expected: 1, actual: DO_RUN } - steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - echo `hostname` - cd ${env.WORKSPACE}/${SCRIPT_LOCATION}/ - python3.9 runtests.py --prov=shm --test=fabtests --user_env="{'FI_SHM_DISABLE_CMA':1, 'FI_SHM_USE_DSA_SAR':1}" - ) - """ - } - } - }*/ } } stage ('Summary') { - when { equals expected: 1, actual: DO_RUN } - steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - if [[ $RELEASE -eq 1 ]]; then - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/summary.py --summary_item=all --release - else - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/summary.py --summary_item=all - fi - echo "------------" - if [[ ${RELEASE} -eq 1 ]]; then - mkdir -p ${env.WORKSPACE}/internal - rm -rf ${env.WORKSPACE}/internal/* - git clone https://${env.PAT}@github.com/${env.INTERNAL} ${env.WORKSPACE}/internal - cd ${env.WORKSPACE}/internal - mkdir -p ${env.WORKSPACE}/internal/summaries - cp ${env.WORKSPACE}/summary_*.log ${env.WORKSPACE}/internal/summaries/ - git add ${env.WORKSPACE}/internal/summaries/ - git commit -am \"add ${env.JOB_NAME}'s summary\" - git pull -r origin master - git push origin master - fi - ) - """ - } - } - } - stage ('Summary-daos') { - agent {node {label 'daos'}} - when { equals expected: 1, actual: DO_RUN } + when { equals expected: true, actual: DO_RUN } steps { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) { - sh """ - env - ( - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/summary.py --summary_item=daos - ) - """ + script { + gather_logs("${env.DAOS_ADDR}", "${env.DAOS_KEY}", "${env.LOG_DIR}", + "${env.LOG_DIR}") + gather_logs("${env.ZE_ADDR}", "${env.ZE_KEY}", "${env.LOG_DIR}", + "${env.LOG_DIR}") + + summarize("all", verbose=false, release=RELEASE, + send_mail=env.WEEKLY.toBoolean()) + if (RELEASE) { + save_summary() + } } } } @@ -626,67 +626,39 @@ pipeline { post { always { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { - sh "python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/summary.py --summary_item=all" + script { + summarize("all") } } success { - node ('daos') { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { - sh """ - if [[ ${DO_RUN} -eq 1 ]]; then - python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/summary.py -v --summary_item=daos - fi - """ - dir ("${env.DAOS_CLUSTER_HOME}/avocado") { - deleteDir() - } - } + script { + summarize("all", verbose=true, release=false, + send_mail=env.WEEKLY.toBoolean()) } - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { - dir("${env.WORKSPACE}") { - sh "python3.7 ${env.WORKSPACE}/${SCRIPT_LOCATION}/summary.py -v --summary_item=all" - } + } + aborted { + node ('daos_head') { + dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } + } + node ('ze') { + dir ("${DELETE_LOCATION}/middlewares") { deleteDir() } } + dir ("${DELETE_LOCATION}/middlewares") 
{ deleteDir() } } cleanup { - node ('daos') { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { - dir ("${env.CI_INSTALL_DIR}/${env.JOB_NAME}/${env.BUILD_NUMBER}") { - deleteDir() - } - dir("${env.WORKSPACE}") { - deleteDir() - } - dir("${env.WORKSPACE}@tmp") { - deleteDir() - } - } + node ('daos_head') { + dir ("${DELETE_LOCATION}") { deleteDir() } + dir("${env.WORKSPACE}") { deleteDir() } + dir("${env.WORKSPACE}@tmp") { deleteDir() } } - /*node ('dsa') { - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { - dir ("${env.CI_INSTALL_DIR}/${env.JOB_NAME}/${env.BUILD_NUMBER}") { - deleteDir() - } - dir("${env.WORKSPACE}") { - deleteDir() - } - dir("${env.WORKSPACE}@tmp") { - deleteDir() - } - } - }*/ - withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { - dir ("${env.CI_INSTALL_DIR}/${env.JOB_NAME}/${env.BUILD_NUMBER}") { - deleteDir() - } - dir("${env.WORKSPACE}") { - deleteDir() - } - dir("${env.WORKSPACE}@tmp") { - deleteDir() - } + node ('ze') { + dir("${DELETE_LOCATION}") { deleteDir() } + dir("${env.WORKSPACE}") { deleteDir() } + dir("${env.WORKSPACE}@tmp") { deleteDir() } } + dir("${DELETE_LOCATION}") { deleteDir() } + dir("${env.WORKSPACE}") { deleteDir() } + dir("${env.WORKSPACE}@tmp") { deleteDir() } } } -} +} \ No newline at end of file diff --git a/contrib/intel/jenkins/build.py b/contrib/intel/jenkins/build.py index 9271e9a6bcd..e9e62caab05 100755 --- a/contrib/intel/jenkins/build.py +++ b/contrib/intel/jenkins/build.py @@ -2,9 +2,9 @@ import sys # add jenkins config location to PATH -sys.path.append(os.environ['CI_SITE_CONFIG']) +sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") +import cloudbees_config -import ci_site_config import argparse import subprocess import shlex @@ -12,122 +12,201 @@ import re import shutil -def build_libfabric(libfab_install_path, mode, cluster=None): +def build_libfabric(libfab_install_path, mode, cluster=None, ucx=None): - if (os.path.exists(libfab_install_path) != True): - os.makedirs(libfab_install_path) + if (os.path.exists(libfab_install_path) != True): + os.makedirs(libfab_install_path) - config_cmd = ['./configure', f'--prefix={libfab_install_path}'] - enable_prov_val = 'yes' + config_cmd = ['./configure', f'--prefix={libfab_install_path}'] + enable_prov_val = 'yes' - if (mode == 'dbg'): - config_cmd.append('--enable-debug') - elif (mode == 'dl'): - enable_prov_val='dl' + if (mode == 'dbg'): + config_cmd.append('--enable-debug') + elif (mode == 'dl'): + enable_prov_val='dl' - if (cluster == 'daos'): - prov_list = common.daos_prov_list - elif (cluster == 'dsa'): - prov_list = common.dsa_prov_list - else: - prov_list = common.default_prov_list + if (cluster == 'daos'): + prov_list = common.daos_prov_list + elif (cluster == 'gpu'): + prov_list = common.gpu_prov_list + else: + prov_list = common.default_prov_list - for prov in prov_list: - config_cmd.append(f'--enable-{prov}={enable_prov_val}') + for prov in prov_list: + if (ucx): + config_cmd.append('--enable-ucx=yes') + break + else: + config_cmd.append(f'--enable-{prov}={enable_prov_val}') - for op in common.common_disable_list: - config_cmd.append(f'--enable-{op}=no') + for op in common.common_disable_list: + config_cmd.append(f'--enable-{op}=no') - if (cluster == 'default'): - for op in common.default_enable_list: - config_cmd.append(f'--enable-{op}') + if (cluster == 'default' and build_item != 'libfabric_mpich' and not ucx): + for op in 
common.default_enable_list: + config_cmd.append(f'--enable-{op}') - common.run_command(['./autogen.sh']) - common.run_command(shlex.split(" ".join(config_cmd))) - common.run_command(['make','clean']) - common.run_command(['make', '-j32']) - common.run_command(['make','install']) + common.run_command(['./autogen.sh']) + common.run_command(shlex.split(" ".join(config_cmd))) + common.run_command(['make','clean']) + common.run_command(['make', '-j32']) + common.run_command(['make','install']) def build_fabtests(libfab_install_path, mode): - os.chdir(f'{workspace}/fabtests') - if (mode == 'dbg'): - config_cmd = ['./configure', '--enable-debug', - f'--prefix={libfab_install_path}', - f'--with-libfabric={libfab_install_path}'] - else: - config_cmd = ['./configure', f'--prefix={libfab_install_path}', - f'--with-libfabric={libfab_install_path}'] - - common.run_command(['./autogen.sh']) - common.run_command(config_cmd) - common.run_command(['make','clean']) - common.run_command(['make', '-j32']) - common.run_command(['make', 'install']) + os.chdir(f'{workspace}/fabtests') + if (mode == 'dbg'): + config_cmd = ['./configure', '--enable-debug', + f'--prefix={libfab_install_path}', + f'--with-libfabric={libfab_install_path}'] + else: + config_cmd = ['./configure', f'--prefix={libfab_install_path}', + f'--with-libfabric={libfab_install_path}'] + + common.run_command(['./autogen.sh']) + common.run_command(config_cmd) + common.run_command(['make','clean']) + common.run_command(['make', '-j32']) + common.run_command(['make', 'install']) + +def extract_mpich(mpitype): + + dest = f'{install_path}/middlewares/{mpitype}_mpichtest' + if (mpitype == 'mpich'): + src_dir = 'mpich' + mpich_tar = cloudbees_config.mpich_tar + elif (mpitype == 'impi'): + src_dir = 'impi_mpichtest' + mpich_tar = cloudbees_config.impi_mpichtest_tar + else: + print(f"Invalid mpi type {mpitype}") + sys.exit(-1) + + cwd = os.getcwd() + if (os.path.exists(dest)): + shutil.rmtree(dest) + os.makedirs(f'{dest}/{mpitype}_mpichsuite') + os.chdir(f'{cloudbees_config.scm_dir}/{src_dir}/') + common.run_command(['tar', '-xvf', + f"{cloudbees_config.scm_dir}/{src_dir}/{mpich_tar}", + '-C', f'{dest}/{mpitype}_mpichsuite', + '--strip-components', '1']) + os.chdir(cwd) + +def build_mpich(libfab_installpath_mpich): + mpich_build_dir = f'{install_path}/middlewares/mpich_mpichtest' + mpich_path = f"{mpich_build_dir}/mpich_mpichsuite" + cwd = os.getcwd() + if (os.path.exists(f"{mpich_build_dir}/bin") !=True): + print("configure mpich") + os.chdir(mpich_path) + configure_cmd = f"./configure " + configure_cmd += f"--prefix={mpich_build_dir} " + configure_cmd += f"--with-libfabric={libfab_installpath_mpich} " + configure_cmd += "--disable-oshmem " + configure_cmd += "--disable-fortran " + configure_cmd += "--without-ch4-shmmods " + configure_cmd += "--with-device=ch4:ofi " + configure_cmd += "--without-ze " + print(configure_cmd) + common.run_command(['./autogen.sh']) + common.run_command(shlex.split(configure_cmd)) + common.run_command(['make','-j']) + common.run_command(['make','install']) + os.chdir(cwd) def copy_build_dir(install_path): - shutil.copytree(ci_site_config.build_dir, - f'{install_path}/ci_middlewares') + middlewares_path = f'{install_path}/middlewares' + if (os.path.exists(middlewares_path) != True): + os.makedirs(f'{install_path}/middlewares') + + shutil.copytree(f'{cloudbees_config.build_dir}/shmem', + f'{middlewares_path}/shmem') + shutil.copytree(f'{cloudbees_config.build_dir}/oneccl', + f'{middlewares_path}/oneccl') + + 
os.symlink(f'{cloudbees_config.build_dir}/mpich', + f'{middlewares_path}/mpich') + os.symlink(f'{cloudbees_config.build_dir}/impi', + f'{middlewares_path}/impi') + os.symlink(f'{cloudbees_config.build_dir}/ompi', + f'{middlewares_path}/ompi') + os.symlink(f'{cloudbees_config.build_dir}/oneccl_gpu', + f'{middlewares_path}/oneccl_gpu') def copy_file(file_name): - if (os.path.exists(f'{workspace}/{file_name}')): - shutil.copyfile(f'{workspace}/{file_name}', - f'{install_path}/log_dir/{file_name}') + if (os.path.exists(f'{workspace}/{file_name}')): + shutil.copyfile(f'{workspace}/{file_name}', + f'{install_path}/log_dir/{file_name}') def log_dir(install_path, release=False): - if (os.path.exists(f'{install_path}/log_dir') != True): - os.makedirs(f'{install_path}/log_dir') - if (release): - copy_file('Makefile.am.diff') - copy_file('configure.ac.diff') - copy_file('release_num.txt') + if (os.path.exists(f'{install_path}/log_dir') != True): + os.makedirs(f'{install_path}/log_dir') + + if (release): + copy_file('Makefile.am.diff') + copy_file('configure.ac.diff') + copy_file('release_num.txt') if __name__ == "__main__": #read Jenkins environment variables - # In Jenkins, JOB_NAME = 'ofi_libfabric/master' vs BRANCH_NAME = 'master' - # job name is better to use to distinguish between builds of different - # jobs but with same branch name. - jobname = os.environ['JOB_NAME'] - buildno = os.environ['BUILD_NUMBER'] - workspace = os.environ['WORKSPACE'] - - parser = argparse.ArgumentParser() - parser.add_argument('--build_item', help="build libfabric or fabtests", - choices=['libfabric', 'fabtests', 'builddir', 'logdir']) - - parser.add_argument('--ofi_build_mode', help="select buildmode debug or dl", \ - choices=['dbg', 'dl']) - - parser.add_argument('--build_cluster', help="build libfabric on specified cluster", \ - choices=['daos', 'dsa'], default='default') - parser.add_argument('--release', help="This job is likely testing a "\ - "release and will be checked into a git tree.", - action='store_true') - - args = parser.parse_args() - build_item = args.build_item - cluster = args.build_cluster - release = args.release - - if (args.ofi_build_mode): - ofi_build_mode = args.ofi_build_mode - else: - ofi_build_mode = 'reg' - - install_path = f'{ci_site_config.install_dir}/{jobname}/{buildno}' - libfab_install_path = f'{ci_site_config.install_dir}/{jobname}/{buildno}/{ofi_build_mode}' - - p = re.compile('mpi*') - - if (build_item == 'libfabric'): - build_libfabric(libfab_install_path, ofi_build_mode, cluster) - - elif (build_item == 'fabtests'): - build_fabtests(libfab_install_path, ofi_build_mode) - - elif (build_item == 'builddir'): - copy_build_dir(install_path) - - elif (build_item == 'logdir'): - log_dir(install_path, release) + # In Jenkins, JOB_NAME = 'ofi_libfabric/master' vs BRANCH_NAME = 'master' + # job name is better to use to distinguish between builds of different + # jobs but with same branch name. 
+ jobname = os.environ['JOB_NAME'] + buildno = os.environ['BUILD_NUMBER'] + workspace = os.environ['WORKSPACE'] + + parser = argparse.ArgumentParser() + parser.add_argument('--build_item', help="build libfabric or fabtests", \ + choices=['libfabric', 'libfabric_mpich', 'fabtests', \ + 'builddir', 'logdir', 'extract_mpich', \ + 'extract_impi_mpich', 'mpich']) + parser.add_argument('--ofi_build_mode', help="select buildmode libfabric "\ + "build mode", choices=['reg', 'dbg', 'dl']) + parser.add_argument('--build_cluster', help="build libfabric on specified cluster", \ + choices=['daos', 'gpu'], default='default') + parser.add_argument('--release', help="This job is likely testing a "\ + "release and will be checked into a git tree.", + action='store_true') + parser.add_argument('--ucx', help="build with ucx", default=False, \ + action='store_true') + + args = parser.parse_args() + build_item = args.build_item + cluster = args.build_cluster + release = args.release + ucx = args.ucx + + if (args.ofi_build_mode): + ofi_build_mode = args.ofi_build_mode + else: + ofi_build_mode = 'reg' + + install_path = f'{cloudbees_config.install_dir}/{jobname}/{buildno}' + libfab_install_path = f'{cloudbees_config.install_dir}/{jobname}/{buildno}/{ofi_build_mode}' + + if (ucx): + libfab_install_path += '/ucx' + workspace += '/ucx' + + p = re.compile('mpi*') + + if (build_item == 'libfabric'): + build_libfabric(libfab_install_path, ofi_build_mode, cluster, ucx) + elif (build_item == 'libfabric_mpich'): + build_libfabric(f'{libfab_install_path}/libfabric_mpich', + ofi_build_mode, cluster) + elif (build_item == 'mpich'): + build_mpich(f'{libfab_install_path}/libfabric_mpich') + elif (build_item == 'fabtests'): + build_fabtests(libfab_install_path, ofi_build_mode) + elif (build_item == 'extract_mpich'): + extract_mpich('mpich') + elif (build_item == 'extract_impi_mpich'): + extract_mpich('impi') + elif (build_item == 'builddir'): + copy_build_dir(install_path) + elif (build_item == 'logdir'): + log_dir(install_path, release) diff --git a/contrib/intel/jenkins/common.py b/contrib/intel/jenkins/common.py index e5da5d74039..88732504691 100755 --- a/contrib/intel/jenkins/common.py +++ b/contrib/intel/jenkins/common.py @@ -1,47 +1,100 @@ import collections -import ci_site_config import subprocess import sys import os +from subprocess import Popen, TimeoutExpired +from time import sleep def get_node_name(host, interface): return '%s-%s' % (host, interface) -def run_command(command, logdir=None, test_type=None, ofi_build_mode=None): - stage_name = os.environ['STAGE_NAME'] - if (test_type and ('tcp-rxm' in stage_name)): - filename = f'{logdir}/MPI_tcp-rxm_{test_type}_{ofi_build_mode}' - elif (test_type and ('MPI_net' in stage_name)): - filename = f'{logdir}/MPI_net_{test_type}_{ofi_build_mode}' - elif (test_type and ofi_build_mode): - filename = f'{logdir}/{stage_name}_{test_type}_{ofi_build_mode}' - else: - filename = f'{logdir}/{stage_name}' - print("filename: ".format(filename)) - if (logdir): - f = open(filename, 'a') +def run_command(command): print(" ".join(command)) p = subprocess.Popen(command, stdout=subprocess.PIPE, text=True) print(p.returncode) - if (logdir): - f.write(" ".join(command) + '\n') while True: out = p.stdout.read(1) - if (logdir): - f.write(out) if (out == '' and p.poll() != None): break if (out != ''): sys.stdout.write(out) sys.stdout.flush() + + print(f"Return code is {p.returncode}") if (p.returncode != 0): print("exiting with " + str(p.poll())) - if (logdir): - f.close() sys.exit(p.returncode) 
- if (logdir): + +def run_logging_command(command, log_file): + print("filename: ".format(log_file)) + f = open(log_file, 'a') + print(" ".join(command)) + p = subprocess.Popen(command, stdout=subprocess.PIPE, text=True) + print(p.returncode) + f.write(" ".join(command) + '\n') + while True: + out = p.stdout.read(1) + f.write(out) + if (out == '' and p.poll() != None): + break + if (out != ''): + sys.stdout.write(out) + sys.stdout.flush() + + print(f"Return code is {p.returncode}") + if (p.returncode != 0): + print("exiting with " + str(p.poll())) f.close() + sys.exit(p.returncode) + f.close() + +def read_file(file_name): + with open(file_name) as file_out: + output = file_out.read() + return output + +class ClientServerTest: + def __init__(self, server_cmd, client_cmd, server_log, client_log, + timeout=None): + self.server_cmd = server_cmd + self.client_cmd = client_cmd + self.server_log = server_log + self.client_log = client_log + self._timeout = timeout + + def run(self): + server_process = Popen( + f"{self.server_cmd} > {self.server_log} 2>&1", + shell=True, close_fds=True + ) + sleep(1) + client_process = Popen( + f"{self.client_cmd} > {self.client_log} 2>&1", + shell=True, close_fds=True + ) + + try: + server_process.wait(timeout=self._timeout) + except TimeoutExpired: + server_process.terminate() + + try: + client_process.wait(timeout=self._timeout) + except TimeoutExpired: + client_process.terminate() + server_output = read_file(self.server_log) + client_output = read_file(self.client_log) + + print("") + print(f"server_command: {self.server_cmd}") + print('server_stdout:') + print(server_output) + print(f"client_command: {self.client_cmd}") + print('client_stdout:') + print(client_output) + + return (server_process.returncode, client_process.returncode) Prov = collections.namedtuple('Prov', 'core util') prov_list = [ @@ -54,6 +107,7 @@ def run_command(command, logdir=None, test_type=None, ofi_build_mode=None): Prov('udp', None), Prov('udp', 'rxd'), Prov('shm', None), + Prov('ucx', None) ] default_prov_list = [ 'verbs', @@ -70,9 +124,12 @@ def run_command(command, logdir=None, test_type=None, ofi_build_mode=None): dsa_prov_list = [ 'shm' ] +gpu_prov_list = [ + 'verbs', + 'shm' +] common_disable_list = [ 'usnic', - 'psm', 'efa', 'perf', 'rstream', @@ -82,5 +139,7 @@ def run_command(command, logdir=None, test_type=None, ofi_build_mode=None): 'opx' ] default_enable_list = [ - 'ze_dlopen' + 'ze-dlopen' ] + +cloudbees_log_start_string = "Begin Cloudbees Test Output" diff --git a/contrib/intel/jenkins/run.py b/contrib/intel/jenkins/run.py index c681e13c96c..88472c954da 100755 --- a/contrib/intel/jenkins/run.py +++ b/contrib/intel/jenkins/run.py @@ -5,34 +5,39 @@ import os import common -sys.path.append(os.environ['CI_SITE_CONFIG']) -import ci_site_config +sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") +import cloudbees_config # read Jenkins environment variables # In Jenkins, JOB_NAME = 'ofi_libfabric/master' vs BRANCH_NAME = 'master' # job name is better to use to distinguish between builds of different # jobs but with the same branch name. 
-fab = os.environ['FABRIC']#args.fabric +fab = os.environ['FABRIC'] +if 'slurm' in fab: + fab = cloudbees_config.fabric_map[f"{os.environ['SLURM_JOB_PARTITION']}"] + jbname = os.environ['JOB_NAME']#args.jobname bno = os.environ['BUILD_NUMBER']#args.buildno -def fi_info_test(core, hosts, mode, user_env, run_test, util): +def fi_info_test(core, hosts, mode, user_env, log_file, util): fi_info_test = tests.FiInfoTest(jobname=jbname,buildno=bno, testname='fi_info', core_prov=core, - fabric=fab, hosts=hosts, ofi_build_mode=mode, - user_env=user_env, run_test=run_test, util_prov=util) + fabric=fab, hosts=hosts, + ofi_build_mode=mode, user_env=user_env, + log_file=log_file, util_prov=util) print('-------------------------------------------------------------------') print(f"Running fi_info test for {core}-{util}-{fab}") fi_info_test.execute_cmd() print('-------------------------------------------------------------------') -def fabtests(core, hosts, mode, user_env, run_test, util): +def fabtests(core, hosts, mode, user_env, log_file, util, way): runfabtest = tests.Fabtest(jobname=jbname,buildno=bno, testname='runfabtests', core_prov=core, fabric=fab, hosts=hosts, ofi_build_mode=mode, - user_env=user_env, run_test=run_test, util_prov=util) + user_env=user_env, log_file=log_file, + util_prov=util, way=way) print('-------------------------------------------------------------------') if (runfabtest.execute_condn): @@ -42,12 +47,13 @@ def fabtests(core, hosts, mode, user_env, run_test, util): print(f"Skipping {core} {runfabtest.testname} as execute condition fails") print('-------------------------------------------------------------------') -def shmemtest(core, hosts, mode, user_env, run_test, util): +def shmemtest(core, hosts, mode, user_env, log_file, util): runshmemtest = tests.ShmemTest(jobname=jbname,buildno=bno, testname="shmem test", core_prov=core, fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, run_test=run_test, util_prov=util) + ofi_build_mode=mode, user_env=user_env, + log_file=log_file, util_prov=util) print('-------------------------------------------------------------------') if (runshmemtest.execute_condn): @@ -68,13 +74,13 @@ def shmemtest(core, hosts, mode, user_env, run_test, util): print(f"Skipping {core} {runshmemtest.testname} as execute condition fails") print('-------------------------------------------------------------------') -def multinodetest(core, hosts, mode, user_env, run_test, util): +def multinodetest(core, hosts, mode, user_env, log_file, util): runmultinodetest = tests.MultinodeTests(jobname=jbname,buildno=bno, testname="multinode performance test", core_prov=core, fabric=fab, hosts=hosts, ofi_build_mode=mode, user_env=user_env, - run_test=run_test, util_prov=util) + log_file=log_file, util_prov=util) print("-------------------------------------------------------------------") if (runmultinodetest.execute_condn): @@ -88,33 +94,13 @@ def multinodetest(core, hosts, mode, user_env, run_test, util): .format(runmultinodetest.testname)) print("-------------------------------------------------------------------") -def ze_fabtests(core, hosts, mode, user_env, run_test, util): - - runzefabtests = tests.ZeFabtests(jobname=jbname,buildno=bno, - testname="ze test", core_prov=core, - fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - run_test=run_test, util_prov=util) - - print('-------------------------------------------------------------------') - if (runzefabtests.execute_condn): - print(f"Running ze h2d tests for {core}-{util}-{fab}") - 
runzefabtests.execute_cmd('h2d') - print(f"Running ze d2d tests for {core}-{util}-{fab}") - runzefabtests.execute_cmd('d2d') - print(f"Running ze xd2d tests for {core}-{util}-{fab}") - runzefabtests.execute_cmd('xd2d') - else: - print(f"Skipping {core} {runzefabtests.testname} as execute condition fails") - print('-------------------------------------------------------------------') - -def intel_mpi_benchmark(core, hosts, mpi, mode, group, user_env, run_test, util): +def intel_mpi_benchmark(core, hosts, mpi, mode, group, user_env, log_file, util): imb = tests.IMBtests(jobname=jbname, buildno=bno, testname='IntelMPIbenchmark', core_prov=core, fabric=fab, hosts=hosts, mpitype=mpi, - ofi_build_mode=mode, user_env=user_env, run_test=run_test, - test_group=group, util_prov=util) + ofi_build_mode=mode, user_env=user_env, + log_file=log_file, test_group=group, util_prov=util) print('-------------------------------------------------------------------') if (imb.execute_condn == True): @@ -124,29 +110,30 @@ def intel_mpi_benchmark(core, hosts, mpi, mode, group, user_env, run_test, util) print(f"Skipping {mpi.upper} {imb.testname} as execute condition fails") print('-------------------------------------------------------------------') -def mpich_test_suite(core, hosts, mpi, mode, user_env, run_test, util): +def mpich_test_suite(core, hosts, mpi, mode, user_env, log_file, util, weekly=None): mpich_tests = tests.MpichTestSuite(jobname=jbname,buildno=bno, testname="MpichTestSuite",core_prov=core, fabric=fab, mpitype=mpi, hosts=hosts, ofi_build_mode=mode, user_env=user_env, - run_test=run_test, util_prov=util) + log_file=log_file, util_prov=util, + weekly=weekly) print('-------------------------------------------------------------------') if (mpich_tests.execute_condn == True): - print(f"Running mpichtestsuite: Spawn Tests for {core}-{util}-{fab}-{mpi}") - mpich_tests.execute_cmd("spawn") + print(f"Running mpichtestsuite for {core}-{util}-{fab}-{mpi}") + mpich_tests.execute_cmd() else: print(f"Skipping {mpi.upper()} {mpich_tests.testname} as exec condn fails") print('-------------------------------------------------------------------') -def osu_benchmark(core, hosts, mpi, mode, user_env, run_test, util): +def osu_benchmark(core, hosts, mpi, mode, user_env, log_file, util): osu_test = tests.OSUtests(jobname=jbname, buildno=bno, testname='osu-benchmarks', core_prov=core, fabric=fab, mpitype=mpi, hosts=hosts, ofi_build_mode=mode, user_env=user_env, - run_test=run_test, util_prov=util) + log_file=log_file, util_prov=util) print('-------------------------------------------------------------------') if (osu_test.execute_condn == True): @@ -156,33 +143,30 @@ def osu_benchmark(core, hosts, mpi, mode, user_env, run_test, util): print(f"Skipping {mpi.upper()} {osu_test.testname} as exec condn fails") print('-------------------------------------------------------------------') -def oneccltest(core, hosts, mode, user_env, run_test, util): +def oneccltest(core, hosts, mode, user_env, log_file, util): runoneccltest = tests.OneCCLTests(jobname=jbname,buildno=bno, testname="oneccl test", core_prov=core, fabric=fab, hosts=hosts, ofi_build_mode=mode, user_env=user_env, - run_test=run_test, util_prov=util) + log_file=log_file, util_prov=util) print('-------------------------------------------------------------------') if (runoneccltest.execute_condn): - print(f"Running oneCCL examples test for {core}-{util}-{fab}") - runoneccltest.execute_cmd("examples") - - 
print('---------------------------------------------------------------') - print(f"Running oneCCL functional test for {core}-{util}-{fab}") - runoneccltest.execute_cmd("functional") + print(f"Running oneCCL cpu tests for {core}-{util}-{fab}") + runoneccltest.execute_cmd() else: print(f"Skipping {runoneccltest.testname} as execute condition fails") print('-------------------------------------------------------------------') -def oneccltestgpu(core, hosts, mode, user_env, run_test, util): +def oneccltestgpu(core, hosts, mode, user_env, log_file, util): runoneccltestgpu = tests.OneCCLTestsGPU(jobname=jbname,buildno=bno, - testname="oneccl GPU test", core_prov=core, - fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - run_test=run_test, util_prov=util) + testname="oneccl GPU test", + core_prov=core, fabric=fab, + hosts=hosts, ofi_build_mode=mode, + user_env=user_env, log_file=log_file, + util_prov=util) print('-------------------------------------------------------------------') if (runoneccltestgpu.execute_condn): @@ -196,13 +180,13 @@ def oneccltestgpu(core, hosts, mode, user_env, run_test, util): print(f"Skipping {runoneccltestgpu.testname} as execute condition fails") print('-------------------------------------------------------------------') -def daos_cart_tests(core, hosts, mode, user_env, run_test, util): +def daos_cart_tests(core, hosts, mode, user_env, log_file, util): runcarttests = tests.DaosCartTest(jobname=jbname, buildno=bno, testname="Daos Cart Test", core_prov=core, fabric=fab, hosts=hosts, ofi_build_mode=mode, user_env=user_env, - run_test=run_test, util_prov=util) + log_file=log_file, util_prov=util) print('-------------------------------------------------------------------') if (runcarttests.execute_condn): @@ -210,5 +194,35 @@ def daos_cart_tests(core, hosts, mode, user_env, run_test, util): runcarttests.execute_cmd() print('-------------------------------------------------------------------') +def dmabuftests(core, hosts, mode, user_env, log_file, util): + + rundmabuftests = tests.DMABUFTest(jobname=jbname,buildno=bno, + testname="DMABUF Tests", core_prov=core, + fabric=fab, hosts=hosts, + ofi_build_mode=mode, user_env=user_env, + log_file=log_file, util_prov=util) + + print('-------------------------------------------------------------------') + if (rundmabuftests.execute_condn): + print(f"Running dmabuf H->H tests for {core}-{util}-{fab}") + rundmabuftests.execute_cmd('H2H') + + print('---------------------------------------------------------------') + print(f"Running dmabuf H->D tests for {core}-{util}-{fab}") + rundmabuftests.execute_cmd('H2D') + + print('---------------------------------------------------------------') + print(f"Running dmabuf D->H tests for {core}-{util}-{fab}") + rundmabuftests.execute_cmd('D2H') + + print('---------------------------------------------------------------') + print(f"Running dmabuf D->D tests for {core}-{util}-{fab}") + rundmabuftests.execute_cmd('D2D') + + print('---------------------------------------------------------------') + else: + print(f"Skipping {rundmabuftests.testname} as execute condition fails") + print('-------------------------------------------------------------------') + if __name__ == "__main__": pass diff --git a/contrib/intel/jenkins/runtests.py b/contrib/intel/jenkins/runtests.py index 7e581aea05f..51f5a0c2ef6 100755 --- a/contrib/intel/jenkins/runtests.py +++ b/contrib/intel/jenkins/runtests.py @@ -1,35 +1,50 @@ import argparse import os import sys 
-sys.path.append(os.environ['CI_SITE_CONFIG']) -import ci_site_config +sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") +import cloudbees_config +import subprocess import run import common +import shlex -parser = argparse.ArgumentParser() +class ParseDict(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, dict()) + for value in values: + key, value = value.split('=') + getattr(namespace, self.dest)[key] = value +parser = argparse.ArgumentParser() parser.add_argument('--prov', help="core provider", choices=['verbs', \ - 'tcp', 'udp', 'sockets', 'shm', 'psm3']) + 'tcp', 'udp', 'sockets', 'shm', 'psm3', 'ucx']) parser.add_argument('--util', help="utility provider", choices=['rxd', 'rxm']) -parser.add_argument('--ofi_build_mode', help="specify the build configuration", \ - choices = ['dbg', 'dl'], default='reg') +parser.add_argument('--ofi_build_mode', help="specify the build configuration",\ + choices = ['reg', 'dbg', 'dl'], default='reg') parser.add_argument('--test', help="specify test to execute", \ choices = ['all', 'shmem', 'IMB', 'osu', 'oneccl', \ 'mpichtestsuite', 'fabtests', 'onecclgpu', \ - 'fi_info', 'daos', 'multinode']) + 'fi_info', 'daos', 'multinode', 'dmabuf']) -parser.add_argument('--imb_grp', help="IMB test group {1:[MPI1, P2P], \ +parser.add_argument('--imb_grp', help="IMB test group 1:[MPI1, P2P], \ 2:[EXT, IO], 3:[NBC, RMA, MT]", choices=['1', '2', '3']) -parser.add_argument('--device', help="optional gpu device", choices=['ze']) -parser.add_argument('--user_env', help="Run with additional environment variables", \ - default='{}') +parser.add_argument('--way', help="direction to run with device option", + choices=['h2d', 'd2d', 'xd2d'], default=None) +parser.add_argument('--user_env', help="Run with additional environment " \ + "variables", nargs='*', action=ParseDict, default={}) +parser.add_argument('--mpi', help="Select mpi to use for middlewares", + choices=['impi', 'mpich', 'ompi'], default='impi') +parser.add_argument('--log_file', help="Full path to log file", + default=os.environ['DEFAULT_LOG_LOCATION'], type=str) +parser.add_argument('--weekly', help="run weekly", default=False, type=bool) args = parser.parse_args() args_core = args.prov args_util = args.util -args_device = args.device user_env = args.user_env +log_file = args.log_file +weekly = args.weekly if (args.ofi_build_mode): ofi_build_mode = args.ofi_build_mode @@ -46,10 +61,35 @@ else: imb_group = '1' -node = (os.environ['NODE_NAME']).split('_')[0] -hosts = [node] +mpi = args.mpi +way = args.way + +hosts = [] +if 'slurm' in os.environ['FABRIC']: + slurm_nodes = os.environ['SLURM_JOB_NODELIST'] # example cb[1-4,11] + common.run_command(shlex.split(f"sinfo --Format=Features -n {slurm_nodes}")) + if int(os.environ['SLURM_NNODES']) == 1: + hosts.append(slurm_nodes) + else: + prefix = slurm_nodes[0:slurm_nodes.find('[')] + nodes = slurm_nodes[slurm_nodes.find('[') + 1 : + slurm_nodes.find(']')].split(',') # ['1-4', '11'] + for item in nodes: # ['1-4', '11'] -> ['cb1', 'cb2', 'cb3', 'cb4', 'cb11'] + if '-' in item: + rng = item.split('-') + node_list = list(range(int(rng[0]), int(rng[1]) + 1)) + for node in node_list: + hosts.append(f'{prefix}{node}') + else: + hosts.append(f'{prefix}{item}') +else: + node = (os.environ['NODE_NAME']).split('_')[0] + hosts = [node] + for host in cloudbees_config.node_map[node]: + hosts.append(host) + print(f"hosts = {hosts}") -mpilist = ['impi', 'mpich', 'ompi'] 
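[Editorial aside] The host-list handling introduced above can be exercised on its own. The sketch below is only an illustration of the same bracket expansion, assuming the simple prefix[a-b,c] form shown in the inline comment (e.g. cb[1-4,11]); nested ranges or multiple prefixes are not handled, matching the scope of the script itself.

    # Minimal sketch of the SLURM_JOB_NODELIST expansion used above.
    # Assumes the simple "prefix[a-b,c]" form (e.g. "cb[1-4,11]"); a bare
    # single-node value such as "cb1" is returned unchanged.
    def expand_nodelist(nodelist):
        if '[' not in nodelist:
            return [nodelist]
        prefix = nodelist[:nodelist.find('[')]
        body = nodelist[nodelist.find('[') + 1:nodelist.find(']')]
        hosts = []
        for item in body.split(','):          # e.g. ['1-4', '11']
            if '-' in item:
                start, end = item.split('-')
                hosts += [f'{prefix}{n}' for n in range(int(start), int(end) + 1)]
            else:
                hosts.append(f'{prefix}{item}')
        return hosts

    print(expand_nodelist('cb[1-4,11]'))
    # ['cb1', 'cb2', 'cb3', 'cb4', 'cb11']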
+print(common.cloudbees_log_start_string) #this script is executed from /tmp #this is done since some mpi tests @@ -61,45 +101,51 @@ os.chdir('/tmp/') if(args_core): - for host in ci_site_config.node_map[node]: - hosts.append(host) - - if (args.device != 'ze'): - if (run_test == 'all' or run_test == 'fi_info'): - run.fi_info_test(args_core, hosts, ofi_build_mode, user_env, run_test, - util=args.util) - - if (run_test == 'all' or run_test == 'fabtests'): - run.fabtests(args_core, hosts, ofi_build_mode, user_env, run_test, args_util) - - if (run_test == 'all' or run_test == 'shmem'): - run.shmemtest(args_core, hosts, ofi_build_mode, user_env, run_test, args_util) - - if (run_test == 'all' or run_test == 'oneccl'): - run.oneccltest(args_core, hosts, ofi_build_mode, user_env, run_test, args_util) - - if (run_test == 'all' or run_test == 'onecclgpu'): - run.oneccltestgpu(args_core, hosts, ofi_build_mode, user_env, run_test, args_util) - - if (run_test == 'all' or run_test == 'daos'): - run.daos_cart_tests(args_core, hosts, ofi_build_mode, user_env, run_test, args_util) - - if (run_test == 'all' or run_test == 'multinode'): - run.multinodetest(args_core, hosts, ofi_build_mode, user_env, run_test, args_util) - - for mpi in mpilist: - if (run_test == 'all' or run_test == 'mpichtestsuite'): - run.mpich_test_suite(args_core, hosts, mpi, - ofi_build_mode, user_env, run_test, args_util) - if (run_test == 'all' or run_test == 'IMB'): - run.intel_mpi_benchmark(args_core, hosts, mpi, - ofi_build_mode, imb_group, - user_env, run_test, args_util) - if (run_test == 'all' or run_test == 'osu'): - run.osu_benchmark(args_core, hosts, mpi, - ofi_build_mode, user_env, run_test, args_util) - else: - run.ze_fabtests(args_core, hosts, ofi_build_mode, user_env, run_test, args_util) - + if (run_test == 'all' or run_test == 'fi_info'): + run.fi_info_test(args_core, hosts, ofi_build_mode, + user_env, log_file, util=args.util) + + if (run_test == 'all' or run_test == 'fabtests'): + run.fabtests(args_core, hosts, ofi_build_mode, user_env, log_file, + args_util, way) + + if (run_test == 'all' or run_test == 'shmem'): + run.shmemtest(args_core, hosts, ofi_build_mode, user_env, log_file, + args_util) + + if (run_test == 'all' or run_test == 'oneccl'): + run.oneccltest(args_core, hosts, ofi_build_mode, user_env, log_file, + args_util) + + if (run_test == 'all' or run_test == 'onecclgpu'): + run.oneccltestgpu(args_core, hosts, ofi_build_mode, + user_env, log_file, args_util) + + if (run_test == 'all' or run_test == 'daos'): + run.daos_cart_tests(args_core, hosts, ofi_build_mode, + user_env, log_file, args_util) + + if (run_test == 'all' or run_test == 'multinode'): + run.multinodetest(args_core, hosts, ofi_build_mode, + user_env, log_file, args_util) + + if (run_test == 'all' or run_test == 'mpichtestsuite'): + run.mpich_test_suite(args_core, hosts, mpi, + ofi_build_mode, user_env, log_file, + args_util, weekly) + + if (run_test == 'all' or run_test == 'IMB'): + run.intel_mpi_benchmark(args_core, hosts, mpi, + ofi_build_mode, imb_group, + user_env, log_file, args_util) + + if (run_test == 'all' or run_test == 'osu'): + run.osu_benchmark(args_core, hosts, mpi, + ofi_build_mode, user_env, log_file, + args_util) + + if (run_test == 'all' or run_test == 'dmabuf'): + run.dmabuftests(args_core, hosts, ofi_build_mode, + user_env, log_file, args_util) else: print("Error : Specify a core provider to run tests") diff --git a/contrib/intel/jenkins/summary.py b/contrib/intel/jenkins/summary.py index d9577d9e798..43199fc2a51 100755 --- 
a/contrib/intel/jenkins/summary.py +++ b/contrib/intel/jenkins/summary.py @@ -1,21 +1,64 @@ from abc import ABC, abstractmethod import shutil -from tempfile import NamedTemporaryFile from datetime import datetime from typing import Tuple import os from pickle import FALSE import sys +import smtplib +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText +from email.mime.base import MIMEBase +from email import encoders # add jenkins config location to PATH -sys.path.append(os.environ['CI_SITE_CONFIG']) +sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") -import ci_site_config +import cloudbees_config import argparse import common verbose = False +class SendEmail: + def __init__(self, sender=None, receivers=None, attachment=None): + self.sender = sender if sender is not None else os.environ['SENDER'] + self.receivers = (receivers if receivers is not None else \ + f"{os.environ['RECEIVER']}").split(',') + self.attachment = attachment + self.work_week = datetime.today().isocalendar()[1] + self.msg = MIMEMultipart() + + def __add_attachments(self): + print(f"Attachment is {self.attachment}") + if self.attachment is None: + return + + attachment = MIMEBase('application', 'octet-stream') + attachment.set_payload(open(self.attachment, 'rb').read()) + encoders.encode_base64(attachment) + name = f"Jenkins_Summary_ww{self.work_week}" + if (verbose): + name = f"{name}_all" + attachment.add_header('Content-Disposition', + f"attachment; filename={name}") + self.msg.attach(attachment) + + def __write_msg(self): + self.msg['Subject'] = f"Cloudbees Summary {os.environ['JOB_NAME']}" + self.msg['From'] = self.sender + self.msg['To'] = ", ".join(self.receivers) + self.msg.attach(MIMEText(f"WW{self.work_week} Summary for Libfabric "\ + "From Cloudbees")) + + def send_mail(self): + self.__write_msg() + self.__add_attachments() + server = smtplib.SMTP(os.environ['SMTP_SERVER'], + os.environ['SMTP_PORT']) + server.sendmail(self.sender, self.receivers, self.msg.as_string()) + server.quit() + class Release: def __init__(self, log_dir, output_file, logger, release_num): self.log_dir = log_dir @@ -45,10 +88,7 @@ def __init__(self, output_file, release): def log(self, line, end_delimiter='\n', lpad=0, ljust=0): print(f'{self.padding * lpad}{line}'.ljust(ljust), end = end_delimiter) - if (self.release): - self.output_file.write( - f'{self.padding * lpad}{line}{end_delimiter}' - ) + self.output_file.write(f'{self.padding * lpad}{line}{end_delimiter}') class Summarizer(ABC): @classmethod @@ -56,6 +96,10 @@ def __subclasshook__(cls, subclass): return ( hasattr(subclass, "print_results") and callable(subclass.print_results) + and hasattr(subclass, "check_features") + and callable(subclass.check_features) + and hasattr(subclass, "check_node") + and callable(subclass.check_node) and hasattr(subclass, "check_name") and callable(subclass.check_name) and hasattr(subclass, "check_pass") @@ -64,6 +108,8 @@ def __subclasshook__(cls, subclass): and callable(subclass.check_fail) and hasattr(subclass, "check_exclude") and callable(subclass.check_exclude) + and hasattr(subclass, "fast_forward") + and callable(subclass.fast_forward) and hasattr(subclass, "read_file") and callable(subclass.read_file) and hasattr(subclass, "run") @@ -87,7 +133,12 @@ def __init__(self, logger, log_dir, prov, file_name, stage_name): self.failed_tests = [] self.excludes = 0 self.excluded_tests = [] + self.error = 0 + self.errored_tests = [] self.test_name ='no_test' + self.name = 
'no_name' + self.features = "no_features_found" + self.node = "no_node_found" def print_results(self): total = self.passes + self.fails @@ -98,12 +149,21 @@ def print_results(self): percent = self.passes/total * 100 if (verbose): self.logger.log( - f"<>{self.stage_name}: ", lpad=1, ljust=40, end_delimiter = '' + f"<>{self.stage_name} : ", lpad=1, ljust=50, end_delimiter = '' ) else: self.logger.log( - f"{self.stage_name}: ", lpad=1, ljust=40, end_delimiter = '' + f"{self.stage_name} : ", + lpad=1, ljust=50, end_delimiter = '' ) + self.logger.log( + f"{self.node} : ", + lpad=1, ljust=20, end_delimiter = '' + ) + self.logger.log( + f"[{self.features}] : ", + lpad=1, ljust=30, end_delimiter = '' + ) self.logger.log(f"{self.passes}:{total} ", ljust=10, end_delimiter = '') self.logger.log(f": {percent:.2f}% : ", ljust=12, end_delimiter = '') self.logger.log("Pass", end_delimiter = '') @@ -128,6 +188,22 @@ def print_results(self): for test in self.excluded_tests: self.logger.log(f'{test}', lpad=3) + if self.error: + self.logger.log( + "Errored, Interrupt, or Canceled Tests: "\ + f"{self.excludes} ", lpad=2 + ) + for test in self.errored_tests: + self.logger.log(f'{test}', lpad=3) + + def check_features(self, previous, line): + if ('avail_features') in previous: + self.features = line.strip() + + def check_node(self, line): + if ('slurm_nodelist' in line): + self.node = line.strip().split('=')[1] + def check_name(self, line): return @@ -147,8 +223,21 @@ def check_line(self, line): self.check_fail(line) self.check_exclude(line) + def fast_forward(self, log_file): + previous = "" + line = log_file.readline().lower() + while line != "": + self.check_node(line) + self.check_features(previous, line) + if common.cloudbees_log_start_string.lower() in line: + break + + previous = line + line = log_file.readline().lower() + def read_file(self): with open(self.file_path, 'r') as log_file: + self.fast_forward(log_file) for line in log_file: self.check_line(line.lower()) @@ -179,6 +268,7 @@ def read_file(self): class FabtestsSummarizer(Summarizer): def __init__(self, logger, log_dir, prov, file_name, stage_name): super().__init__(logger, log_dir, prov, file_name, stage_name) + self.trace = False def check_name(self, line): # don't double count ubertest output and don't count fi_ubertest's @@ -204,6 +294,11 @@ def check_pass(self, line): self.passes += 1 if 'ubertest' in self.test_name: idx = (result_line.index('result:') - 1) + try: + int((result_line[idx].split(',')[0])) + except: + return + ubertest_number = int((result_line[idx].split(',')[0])) self.passed_tests.append(f"{self.test_name}: "\ f"{ubertest_number}") @@ -216,6 +311,10 @@ def check_fail(self, line): self.fails += 1 if 'ubertest' in self.test_name: idx = (result_line.index('result:') - 1) + try: + int((result_line[idx].split(',')[0])) + except: + return ubertest_number = int((result_line[idx].split(',')[0])) self.failed_tests.append(f"{self.test_name}: " \ f"{ubertest_number}") @@ -232,12 +331,40 @@ def check_exclude(self, line): self.excludes += 1 self.excluded_tests.append(self.test_name) + def check_trace(self, line): + if not self.trace: + cmd_count = 0 + faults_count = 0 + if ("user to sar buffer" in line): + tokens = line.split(' ') + for i in range(0, len(tokens)): + if 'cmd' in tokens[i]: + cmd_count += int(tokens[i + 1]) + if 'faults' in tokens[i]: + faults_count += int(tokens[i + 1]) + + if (cmd_count > 0 or faults_count > 0): + self.trace = True + def check_line(self, line): self.check_name(line) if (self.test_name != 'no_test'): 
self.check_pass(line) self.check_fail(line) self.check_exclude(line) + if ('dsa' in self.file_name): + self.check_trace(line) + + def summarize(self): + if not self.exists: + return 0 + + self.read_file() + self.print_results() + if ('dsa' in self.file_name and not self.trace): + exit("Expected: DSA to run. Actual: DSA Not Run") + + return int(self.fails) class MultinodePerformanceSummarizer(Summarizer): def __init__(self, logger, log_dir, prov, file_name, stage_name): @@ -277,7 +404,7 @@ def check_name(self, line): f"{tokens[len(tokens) - 1]}" def check_pass(self, line): - if 'passed' in line: + if 'passed' in line or "all done" in line: self.passes += 1 self.passed_tests.append(self.name) @@ -290,17 +417,24 @@ class ShmemSummarizer(Summarizer): def __init__(self, logger, log_dir, prov, file_name, stage_name): super().__init__(logger, log_dir, prov, file_name, stage_name) self.shmem_type = { - 'uh' : { 'func' : self.check_uh, - 'keyphrase' : 'summary' + 'uh' : { 'func' : self.check_uh, + 'keyphrase' : 'summary', + 'passes' : 0, + 'fails' : 0 }, - 'isx' : { 'func' : self.check_isx, - 'keyphrase' : 'scaling' + 'isx' : { 'func' : self.check_isx, + 'keyphrase' : 'scaling', + 'passes' : 0, + 'fails' : 0 }, - 'prk' : { 'func' : self.check_prk, - 'keyphrase' : 'solution' + 'prk' : { 'func' : self.check_prk, + 'keyphrase' : 'solution', + 'passes' : 0, + 'fails' : 0 } } - self.keyphrase = self.shmem_type[self.prov]['keyphrase'] + self.test_type = 'prk' + self.keyphrase = self.shmem_type[self.test_type]['keyphrase'] self.name = 'no_test' def check_uh(self, line, log_file): @@ -312,10 +446,10 @@ def check_uh(self, line, log_file): if 'test_' in token: self.name = token if tokens[len(tokens) - 1] == 'ok': - self.passes += 1 + self.shmem_type[self.test_type]['passes'] += 1 self.passed_tests.append(self.name) else: - self.fails += 1 + self.shmem_type[self.test_type]['fails'] += 1 self.failed_tests.append(self.name) # Summary # x/z Passed. 
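[Editorial aside] The per-suite bookkeeping introduced above can be shown in isolation. The snippet below is a minimal sketch, not taken from a real log: it only illustrates how separate pass/fail counters per SHMEM suite ('uh', 'isx', 'prk') are kept and later rolled up into overall totals, mirroring the shmem_type dictionary and the read_file() roll-up in this patch; the counter increments are made-up placeholders.

    # Minimal sketch of the per-suite tally structure used by ShmemSummarizer.
    # The increments below are illustrative only.
    shmem_type = {
        'uh':  {'keyphrase': 'summary',  'passes': 0, 'fails': 0},
        'isx': {'keyphrase': 'scaling',  'passes': 0, 'fails': 0},
        'prk': {'keyphrase': 'solution', 'passes': 0, 'fails': 0},
    }

    shmem_type['uh']['passes'] += 1   # e.g. a uh case ended in "ok"
    shmem_type['prk']['fails'] += 1   # e.g. a prk case reported an error

    # Roll the per-suite counters up into overall totals, as read_file() does.
    passes = sum(t['passes'] for t in shmem_type.values())
    fails = sum(t['fails'] for t in shmem_type.values())
    print(passes, fails)   # 1 1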
@@ -324,24 +458,26 @@ def check_uh(self, line, log_file): passed = log_file.readline().lower() failed = log_file.readline().lower() token = int(passed.split()[1].split('/')[0]) - if self.passes != token: + if self.shmem_type[self.test_type]['passes'] != token: self.logger.log( - f"passes {self.passes} do not match log reported passes "\ - f"{token}" + f"passes {self.shmem_type[self.test_type]['passes']} do " \ + f"not match log reported passes {token}" ) token = int(failed.split()[1].split('/')[0]) - if self.fails != int(token): + if self.shmem_type[self.test_type]['fails'] != int(token): self.logger.log( - f"fails {self.fails} does not match log fails "\ - f"{token}" + f"fails {self.shmem_type[self.test_type]['fails']} does "\ + f"not match log fails {token}" ) def check_prk(self, line, log_file=None): if self.keyphrase in line: - self.passes += 1 + self.shmem_type[self.test_type]['passes'] += 1 if 'error:' in line or "exiting with" in line: - self.fails += 1 - self.failed_tests.append(f"{self.prov} {self.passes + self.fails}") + self.shmem_type[self.test_type]['fails'] += 1 + p = self.shmem_type[self.test_type]['passes'] + f = self.shmem_type[self.test_type]['fails'] + self.failed_tests.append(f"{self.prov} {p + f}") if 'test(s)' in line: token = line.split()[0] if self.fails != int(token): @@ -352,59 +488,85 @@ def check_prk(self, line, log_file=None): def check_isx(self, line, log_file=None): if self.keyphrase in line: - self.passes += 1 + self.shmem_type[self.test_type]['passes'] += 1 if ('failed' in line and 'test(s)' not in line) or \ "exiting with" in line: - self.fails += 1 - self.failed_tests.append(f"{self.prov} {self.passes + self.fails}") + self.shmem_type[self.test_type]['fails'] += 1 + p = self.shmem_type[self.test_type]['passes'] + f = self.shmem_type[self.test_type]['fails'] + self.failed_tests.append(f"{self.prov} {p + f}") if 'test(s)' in line: token = line.split()[0] - if int(token) != self.fails: + if int(token) != self.shmem_type[self.test_type]['fails']: self.logger.log( - f"fails {self.fails} does not match log reported fails " \ - f"{int(token)}" + f"fails {self.shmem_type[self.test_type]['fails']} does " \ + f"not match log reported fails {int(token)}" ) def check_fails(self, line): if "exiting with" in line: - self.fails += 1 - self.failed_tests.append(f"{self.prov} {self.passes + self.fails}") + self.shmem_type[self.test_type]['fails'] += 1 + p = self.shmem_type[self.test_type]['passes'] + f = self.shmem_type[self.test_type]['fails'] + self.failed_tests.append(f"{self.prov} {p + f}") + + def check_test_type(self, line): + if "running shmem" in line: + self.test_type = line.split(' ')[2].lower() + self.keyphrase = self.shmem_type[self.test_type]['keyphrase'] def check_line(self, line, log_file): - self.shmem_type[self.prov]['func'](line, log_file) - self.check_fails(line) + self.check_test_type(line) + if self.test_type is not None: + self.shmem_type[self.test_type]['func'](line, log_file) + self.check_fails(line) def read_file(self): with open(self.file_path, 'r') as log_file: + super().fast_forward(log_file) for line in log_file: self.check_line(line.lower(), log_file) + for key in self.shmem_type.keys(): + self.passes += self.shmem_type[key]['passes'] + self.fails += self.shmem_type[key]['fails'] + class MpichTestSuiteSummarizer(Summarizer): def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name): super().__init__(logger, log_dir, prov, file_name, stage_name) self.mpi = mpi - if self.mpi == 'impi': - self.run = '/mpiexec' - else: - self.run = 
'/mpirun' + self.run = 'mpiexec' + + def read_file(self): + with open(self.file_path,'r') as log_file: + super().fast_forward(log_file) + for line in log_file: + super().check_line(line.lower().strip()) + + def check_exclude(self, line): + if line.startswith('excluding:'): + test = line.split(':')[-1] + self.excludes += 1 + self.excluded_tests.append(test) def check_name(self, line): - if self.run in line: - self.name = line.split()[len(line.split()) - 1].split('/')[1] - #assume pass + if (line.startswith('ok') or + line.startswith('not ok')): + self.name = line.split('-')[1].split('#')[0].strip() + + def check_pass(self, line): + if (line.startswith('ok') and not + line.split('#')[1].strip().startswith('skip')): self.passes += 1 self.passed_tests.append(self.name) def check_fail(self, line): - # Fail cases take away assumed pass - if "exiting with" in line: + if (line.startswith('not ok') and not + line.split('#')[1].strip().startswith('skip')): self.fails += 1 - self.passes -= 1 - self.failed_tests.append(f'{self.name}') - #skip to next test - while self.run not in line: - line = self.log.readline().lower() + self.failed_tests.append(self.name) + class ImbSummarizer(Summarizer): def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name): @@ -495,6 +657,14 @@ class DaosSummarizer(Summarizer): def __init__(self, logger, log_dir, prov, file_name, stage_name): super().__init__(logger, log_dir, prov, file_name, stage_name) + if (self.exists): + if ('verbs' in file_name): + self.node = cloudbees_config.daos_prov_node_map['verbs'] + if ('tcp' in file_name): + self.node = cloudbees_config.daos_prov_node_map['tcp'] + + self.features = cloudbees_config.daos_node_features + def check_name(self, line): if "reading ." in line: self.test_name = line.split('/')[len(line.split('/')) - 1] \ @@ -512,17 +682,91 @@ def check_pass(self, line): def check_fail(self, line): res_list = line.lstrip("results :").rstrip().split('|') for elem in res_list: - if 'pass' not in elem: - self.fails += [int(s) for s in elem.split() if s.isdigit()][0] - if self.fails != 0: + total = [int(s) for s in elem.split() if s.isdigit()][0] + if total != 0: + if 'fail' in elem: + self.fails += total self.failed_tests.append(f'{self.test_name}') - return (self.fails) + if 'error' in elem: + self.error += total + self.errored_tests.append(f'error: {self.test_name}') + if 'interrupt' in elem: + self.error += total + self.errored_tests.append(f'interrupt: {self.test_name}') + if 'cancel' in elem: + self.error += total + self.errored_tests.append(f'cancel: {self.test_name}') + + def check_exclude(self, line): + res_list = line.lstrip("results :").rstrip().split('|') + for elem in res_list: + total = [int(s) for s in elem.split() if s.isdigit()][0] + if total != 0: + if 'skip' in elem: + self.excludes += total + self.excluded_tests.append(f'skip: {self.test_name}') + if 'warn' in elem: + self.excludes += total + self.excluded_tests.append(f'warn: {self.test_name}') def check_line(self, line): self.check_name(line) if "results :" in line: self.check_pass(line) self.check_fail(line) + self.check_exclude(line) + +class DmabufSummarizer(Summarizer): + def __init__(self, logger, log_dir, prov, file_name, stage_name): + super().__init__(logger, log_dir, prov, file_name, stage_name) + + self.test_type = '' + + def check_type(self, line): + if "Running" in line: + self.test_type = line.split()[2] + + def check_num_node(self, line): + if "SLURM_NNODES" in line: + self.num_nodes = line.split("=")[-1].strip() + self.num_nodes = ' 
'.join([self.num_nodes, 'node']) + + def check_name(self, line): + if "client_command" in line: + name_list = line.split()[-2:] + name_list.insert(0, str(self.num_nodes)) + name_list.insert(1, str(self.test_type)) + self.test_name = name_list + + def check_pass(self, line): + if "TEST COMPLETED" in line: + self.passes += 1 + self.passed_tests.append(self.test_name) + + def check_fail(self, line): + if "TEST FAILED" in line: + self.fails += 1 + self.failed_tests.append(self.test_name) + + def fast_forward(self, log_file): + previous = "" + line = log_file.readline() + while line != "": + self.check_num_node(line) + self.check_node(line.lower()) + self.check_features(previous.lower(), line.lower()) + if common.cloudbees_log_start_string.lower() in line.lower(): + break + + previous = line + line = log_file.readline() + + def read_file(self): + with open(self.file_path, 'r') as log_file: + self.fast_forward(log_file) + for line in log_file: + self.check_type(line) + self.check_line(line) def get_release_num(log_dir): file_name = f'{log_dir}/release_num.txt' @@ -555,6 +799,16 @@ def summarize_items(summary_item, logger, log_dir, mode): ).summarize() err += ret if ret else 0 + if ((summary_item == 'daos' or summary_item == 'all') + and mode == 'reg'): + for prov in ['tcp-rxm', 'verbs-rxm']: + ret = DaosSummarizer( + logger, log_dir, prov, + f'daos_{prov}_{mode}', + f"{prov} daos {mode}" + ).summarize() + err += ret if ret else 0 + if summary_item == 'imb' or summary_item == 'all': for mpi in mpi_list: for item in ['tcp-rxm', 'verbs-rxm', 'tcp']: @@ -567,7 +821,7 @@ def summarize_items(summary_item, logger, log_dir, mode): if summary_item == 'osu' or summary_item == 'all': for mpi in mpi_list: - for item in ['tcp-rxm', 'verbs-rxm']: + for item in ['tcp-rxm', 'verbs-rxm', 'tcp']: ret = OsuSummarizer( logger, log_dir, item, mpi, f'MPI_{item}_{mpi}_osu_{mode}', @@ -577,10 +831,10 @@ def summarize_items(summary_item, logger, log_dir, mode): if summary_item == 'mpichtestsuite' or summary_item == 'all': for mpi in mpi_list: - for item in ['tcp-rxm', 'verbs-rxm', 'sockets']: + for item in ['tcp', 'verbs-rxm']: ret = MpichTestSuiteSummarizer( logger, log_dir, item, mpi, - f'MPICH testsuite_{item}_{mpi}_'\ + f'mpichtestsuite_{item}_{mpi}_'\ f'mpichtestsuite_{mode}', f"{item} {mpi} mpichtestsuite {mode}" ).summarize() @@ -592,55 +846,71 @@ def summarize_items(summary_item, logger, log_dir, mode): ret = MultinodePerformanceSummarizer( logger, log_dir, prov, - f'multinode_performance_{prov}_{mode}', + f'multinode_performance_{prov}_multinode_{mode}', f"multinode performance {prov} {mode}" ).summarize() err += ret if ret else 0 if summary_item == 'oneccl' or summary_item == 'all': - ret = OnecclSummarizer( - logger, log_dir, 'oneCCL', - f'oneCCL_oneccl_{mode}', - f'oneCCL {mode}' - ).summarize() - err += ret if ret else 0 - ret = OnecclSummarizer( - logger, log_dir, 'oneCCL-GPU', - f'oneCCL-GPU_onecclgpu_{mode}', - f'oneCCL-GPU {mode}' - ).summarize() + for prov in ['tcp-rxm', 'verbs-rxm']: + ret = OnecclSummarizer( + logger, log_dir, 'oneCCL', + f'oneCCL_{prov}_oneccl_{mode}', + f'oneCCL {prov} {mode}' + ).summarize() + err += ret if ret else 0 + ret = OnecclSummarizer( + logger, log_dir, 'oneCCL-GPU', + f'oneCCL-GPU_{prov}_onecclgpu_{mode}', + f'oneCCL-GPU {prov} {mode}' + ).summarize() err += ret if ret else 0 if summary_item == 'shmem' or summary_item == 'all': - shmem_types = ['uh', 'prk', 'isx'] - for type in shmem_types: + for prov in ['tcp', 'verbs', 'sockets']: ret= ShmemSummarizer( - logger, 
log_dir, f'{type}', - f'SHMEM_{type}_shmem_{mode}', - f'shmem {type} {mode}' + logger, log_dir, prov, + f'SHMEM_{prov}_shmem_{mode}', + f'shmem {prov} {mode}' ).summarize() err += ret if ret else 0 - if summary_item == 'ze' or summary_item == 'all': + if summary_item == 'v3' or summary_item == 'all': test_types = ['h2d', 'd2d', 'xd2d'] for type in test_types: ret = FabtestsSummarizer( logger, log_dir, 'shm', - f'ze-{prov}_{type}_{mode}', - f"ze {prov} {type} {mode}" + f'ze_v3_shm_{type}_{mode}', + f"ze v3 shm {type} {mode}" ).summarize() err += ret if ret else 0 - if ((summary_item == 'daos' or summary_item == 'all') - and mode == 'reg'): - for prov in ['tcp', 'verbs']: - ret = DaosSummarizer( - logger, log_dir, prov, - f'daos_{prov}_daos_{mode}', - f"{prov} daos {mode}" + ret = OnecclSummarizer( + logger, log_dir, 'oneCCL-GPU', + f'oneCCL-GPU-v3_verbs-rxm_onecclgpu_{mode}', + f'oneCCL-GPU-v3 verbs-rxm {mode}' + ).summarize() + err += ret if ret else 0 + + if summary_item == 'dsa' or summary_item == 'all': + for prov in ['shm']: + ret = FabtestsSummarizer( + logger, log_dir, 'shm', + f'{prov}_dsa_fabtests_{mode}', + f"{prov} dsa fabtests {mode}" ).summarize() err += ret if ret else 0 + if summary_item == 'dmabuf' or summary_item == 'all': + for prov in ['verbs-rxm']: + for num_nodes in range(1,3): + ret = DmabufSummarizer( + logger, log_dir, 'verbs-rxm', + f'DMABUF-Tests_{prov}_dmabuf_{num_nodes}_{mode}', + f"DMABUF-Tests {prov} dmabuf {num_nodes} node {mode}" + ).summarize() + err += ret if ret else 0 + return err if __name__ == "__main__": @@ -655,7 +925,8 @@ def summarize_items(summary_item, logger, log_dir, mode): parser = argparse.ArgumentParser() parser.add_argument('--summary_item', help="functional test to summarize", choices=['fabtests', 'imb', 'osu', 'mpichtestsuite', - 'oneccl', 'shmem', 'ze', 'multinode', 'daos', 'all']) + 'oneccl', 'shmem', 'multinode', 'daos', 'v3', + 'dsa', 'dmabuf', 'all']) parser.add_argument('--ofi_build_mode', help="select buildmode debug or dl", choices=['dbg', 'dl', 'reg'], default='all') parser.add_argument('-v', help="Verbose mode. 
Print all tests", \ @@ -663,24 +934,33 @@ def summarize_items(summary_item, logger, log_dir, mode): parser.add_argument('--release', help="This job is testing a release."\ "It will be saved and checked into a git tree.", action='store_true') + parser.add_argument('--send_mail', help="Email mailing list with summary "\ + "results", action='store_true') args = parser.parse_args() verbose = args.v summary_item = args.summary_item release = args.release ofi_build_mode = args.ofi_build_mode + send_mail = args.send_mail mpi_list = ['impi', 'mpich', 'ompi'] - log_dir = f'{ci_site_config.install_dir}/{jobname}/{buildno}/log_dir' + log_dir = f'{cloudbees_config.install_dir}/{jobname}/{buildno}/log_dir' + if (not os.path.exists(log_dir)): + os.makedirs(log_dir) + + job_name = os.environ['JOB_NAME'].replace('/', '_') + + print(f"Files to be summarized: {os.listdir(log_dir)}") if (release): release_num = get_release_num(log_dir) - job_name = os.environ['JOB_NAME'].replace('/', '_') date = datetime.now().strftime("%Y%m%d%H%M%S") output_name = f'summary_{release_num}_{job_name}_{date}.log' - full_file_name = f'{log_dir}/{output_name}' else: - full_file_name = NamedTemporaryFile(prefix="summary.out.").name + output_name = f'summary_{job_name}.log' + + full_file_name = f'{log_dir}/{output_name}' with open(full_file_name, 'a') as output_file: if (ofi_build_mode == 'all'): @@ -703,4 +983,10 @@ def summarize_items(summary_item, logger, log_dir, mode): if (release): shutil.copyfile(f'{full_file_name}', f'{workspace}/{output_name}') + if (send_mail): + SendEmail(sender = os.environ['SENDER'], + receivers = os.environ['mailrecipients'], + attachment = full_file_name + ).send_mail() + exit(err) diff --git a/contrib/intel/jenkins/tests.py b/contrib/intel/jenkins/tests.py index a172254354c..17cf063a2a5 100755 --- a/contrib/intel/jenkins/tests.py +++ b/contrib/intel/jenkins/tests.py @@ -1,21 +1,23 @@ import sys import os +import io -print(os.environ['CI_SITE_CONFIG']) -sys.path.append(os.environ['CI_SITE_CONFIG']) +sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") import subprocess import re -import ci_site_config +import cloudbees_config import common import shlex +import time # A Jenkins env variable for job name is composed of the name of the jenkins job and the branch name # it is building for. for e.g. 
in our case jobname = 'ofi_libfabric/master' class Test: def __init__ (self, jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, mpitype=None, util_prov=None): + hosts, ofi_build_mode, user_env, log_file, mpitype=None, + util_prov=None, way=None): self.jobname = jobname self.buildno = buildno self.testname = testname @@ -23,49 +25,57 @@ def __init__ (self, jobname, buildno, testname, core_prov, fabric, self.util_prov = f'ofi_{util_prov}' if util_prov != None else '' self.fabric = fabric self.hosts = hosts - self.run_test = run_test + self.log_file = log_file self.mpi_type = mpitype self.ofi_build_mode = ofi_build_mode - if (len(hosts) == 2): + if (len(hosts) == 1): + self.server = hosts[0] + self.client = hosts[0] + elif (len(hosts) == 2): self.server = hosts[0] self.client = hosts[1] - self.nw_interface = ci_site_config.interface_map[self.fabric] - self.libfab_installpath = f'{ci_site_config.install_dir}/'\ + self.nw_interface = cloudbees_config.interface_map[self.fabric] + self.libfab_installpath = f'{cloudbees_config.install_dir}/'\ f'{self.jobname}/{self.buildno}/'\ f'{self.ofi_build_mode}' - self.ci_middlewares_path = f'{ci_site_config.install_dir}/'\ + if (self.core_prov == 'ucx'): + self.libfab_installpath += "/ucx" + + self.middlewares_path = f'{cloudbees_config.install_dir}/'\ f'{self.jobname}/{self.buildno}/'\ - 'ci_middlewares' - self.ci_logdir_path = f'{ci_site_config.install_dir}/'\ + 'middlewares' + self.ci_logdir_path = f'{cloudbees_config.install_dir}/'\ f'{self.jobname}/{self.buildno}/'\ 'log_dir' - self.env = eval(user_env) + self.env = user_env + self.way = way self.mpi = '' if (self.mpi_type == 'impi'): self.mpi = IMPI(self.core_prov, self.hosts, self.libfab_installpath, self.nw_interface, - self.server, self.client, self.env, self.util_prov) + self.server, self.client, self.env, + self.middlewares_path, self.util_prov) elif (self.mpi_type == 'ompi'): self.mpi = OMPI(self.core_prov, self.hosts, self.libfab_installpath, self.nw_interface, self.server, self.client, self.env, - self.ci_middlewares_path, self.util_prov) + self.middlewares_path, self.util_prov) elif (self.mpi_type == 'mpich'): self.mpi = MPICH(self.core_prov, self.hosts, self.libfab_installpath, self.nw_interface, self.server, self.client, self.env, - self.ci_middlewares_path, self.util_prov) + self.middlewares_path, self.util_prov) class FiInfoTest(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, util_prov=None): + hosts, ofi_build_mode, user_env, log_file, util_prov=None): super().__init__(jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, None, util_prov) + hosts, ofi_build_mode, user_env, log_file, None, util_prov) self.fi_info_testpath = f'{self.libfab_installpath}/bin' @@ -87,17 +97,18 @@ def options(self): def execute_cmd(self): command = self.cmd + self.options outputcmd = shlex.split(command) - common.run_command(outputcmd, self.ci_logdir_path, self.run_test, - self.ofi_build_mode) + common.run_command(outputcmd) class Fabtest(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, util_prov=None): + hosts, ofi_build_mode, user_env, log_file, util_prov=None, + way=None): super().__init__(jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, None, util_prov) + hosts, ofi_build_mode, user_env, log_file, None, + util_prov, way) self.fabtestpath = 
f'{self.libfab_installpath}/bin' self.fabtestconfigpath = f'{self.libfab_installpath}/share/fabtests' @@ -137,11 +148,21 @@ def options(self): opts += f"-c {self.client} " opts += "-N " + if (self.core_prov == 'ucx'): + opts += "-b " + if (self.ofi_build_mode == 'dl'): opts += "-t short " else: opts += "-t all " + if (self.way == 'h2d'): + opts += "-C \"-H\" -L \"-D ze\" " + elif (self.way == 'd2d'): + opts += "-C \"-D ze\" -L \"-D ze\" " + elif (self.way == 'xd2d'): + opts += "-C \"-D ze\" -L \"-D ze -i 1\" " + if (self.core_prov == 'sockets' and self.ofi_build_mode == 'reg'): complex_test_file = f'{self.libfab_installpath}/share/fabtests/'\ f'test_configs/{self.core_prov}/quick.test' @@ -182,45 +203,97 @@ def execute_cmd(self): os.chdir(self.fabtestconfigpath) command = self.cmd + self.options outputcmd = shlex.split(command) - common.run_command(outputcmd, self.ci_logdir_path, self.run_test, - self.ofi_build_mode) + common.run_command(outputcmd) os.chdir(curdir) class ShmemTest(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, util_prov=None): + hosts, ofi_build_mode, user_env, log_file, util_prov=None): super().__init__(jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, None, util_prov) + hosts, ofi_build_mode, user_env, log_file, None, + util_prov) - #self.n - number of hosts * number of processes per host self.n = 4 - # self.ppn - number of processes per node. self.ppn = 2 - self.shmem_dir = f'{self.ci_middlewares_path}/shmem' + self.shmem_dir = f'{self.middlewares_path}/shmem' + self.hydra = f'{cloudbees_config.hydra}' + self.shmem_testname = '' + self.threshold = '1' + self.isx_shmem_total_size = 33554432 + self.isx_shmem_kernel_max = 134217728 + self.prk_iterations = 10 + self.prk_first_arr_dim = 1000 + self.prk_second_arr_dim = 1000 + if self.util_prov: + self.prov = f'{self.core_prov};{self.util_prov}' + else: + self.prov = self.core_prov - @property - def cmd(self): - return f"{ci_site_config.testpath}/run_shmem.sh " + self.test_dir = { + 'unit' : 'SOS', + 'uh' : 'tests-uh', + 'isx' : 'ISx/SHMEM', + 'prk' : 'PRK/SHMEM' + } - def options(self, shmem_testname): + self.make = { + 'unit' : 'make VERBOSE=1', + 'uh' : 'make C_feature_tests-run', + 'isx' : '', + 'prk' : '' + } - if self.util_prov: - prov = f"{self.core_prov};{self.util_prov} " - else: - prov = self.core_prov + self.shmem_environ = { + 'SHMEM_OFI_USE_PROVIDER': self.prov, + 'OSHRUN_LAUNCHER' : self.hydra, + 'PATH' : f'{self.shmem_dir}/bin:$PATH', + 'LD_LIBRARY_PATH' : f'{self.shmem_dir}/lib:'\ + f'{self.libfab_installpath}/lib', + 'SHMEM_SYMMETRIC_SIZE' : '4G', + 'LD_PRELOAD' : f'{self.libfab_installpath}'\ + '/lib/libfabric.so', + 'threshold' : self.threshold + } + + def export_env(self): + environ = '' + if self.shmem_testname == 'isx' or self.shmem_testname == 'prk': + self.threshold = '0' + + for key,val in self.shmem_environ.items(): + environ += f"export {key}={val}; " + return environ + + def cmd(self): + cmd = '' + if self.shmem_testname == 'unit': + cmd += f"{self.make[self.shmem_testname]} " + cmd += "mpiexec.hydra " + cmd += f"-n {self.n} " + cmd += f"-np {self.ppn} " + cmd += 'check' + elif self.shmem_testname == 'uh': + cmd += f'{self.make[self.shmem_testname]}' + elif self.shmem_testname == 'isx': + cmd += f"oshrun -np 4 ./bin/isx.strong {self.isx_shmem_kernel_max}"\ + " output_strong; " + cmd += f"oshrun -np 4 ./bin/isx.weak {self.isx_shmem_total_size} "\ + "output_weak; " + cmd += f"oshrun -np 4 
./bin/isx.weak_iso "\ + f"{self.isx_shmem_total_size} output_weak_iso " + elif self.shmem_testname == 'prk': + cmd += f"oshrun -np 4 ./Stencil/stencil {self.prk_iterations} "\ + f"{self.prk_first_arr_dim}; " + cmd += f"oshrun -np 4 ./Synch_p2p/p2p {self.prk_iterations} "\ + f"{self.prk_first_arr_dim} {self.prk_second_arr_dim}; " + cmd += f"oshrun -np 4 ./Transpose/transpose {self.prk_iterations} "\ + f"{self.prk_first_arr_dim} " + + return cmd - opts = f"-n {self.n} " - opts += f"-hosts {self.server},{self.client} " - opts += f"-shmem_dir={self.shmem_dir} " - opts += f"-libfabric_path={self.libfab_installpath}/lib " - opts += f"-prov {prov} " - opts += f"-test {shmem_testname} " - opts += f"-server {self.server} " - opts += f"-inf {ci_site_config.interface_map[self.fabric]}" - return opts @property def execute_condn(self): @@ -229,19 +302,23 @@ def execute_condn(self): else False def execute_cmd(self, shmem_testname): - command = self.cmd + self.options(shmem_testname) + self.shmem_testname = shmem_testname + cwd = os.getcwd() + os.chdir(f'{self.shmem_dir}/{self.test_dir[self.shmem_testname]}') + print("Changed directory to "\ + f'{self.shmem_dir}/{self.test_dir[self.shmem_testname]}') + command = f"bash -c \'{self.export_env()} {self.cmd()}\'" outputcmd = shlex.split(command) - common.run_command(outputcmd, self.ci_logdir_path, - f'{shmem_testname}_{self.run_test}', - self.ofi_build_mode) + common.run_command(outputcmd) + os.chdir(cwd) class MultinodeTests(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, util_prov=None): + hosts, ofi_build_mode, user_env, log_file, util_prov=None): super().__init__(jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, None, util_prov) + hosts, ofi_build_mode, user_env, log_file, None, util_prov) self.fabtestpath = f'{self.libfab_installpath}/bin' self.fabtestconfigpath = f'{self.libfab_installpath}/share/fabtests' self.n = 2 @@ -283,51 +360,14 @@ def execute_cmd(self): os.chdir(self.fabtestconfigpath) command = self.cmd + self.options outputcmd = shlex.split(command) - common.run_command(outputcmd, self.ci_logdir_path, prov, - self.ofi_build_mode) - os.chdir(curdir) - -class ZeFabtests(Test): - def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, util_prov=None): - - super().__init__(jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, None, util_prov) - - self.fabtestpath = f'{self.libfab_installpath}/bin' - self.zefabtest_script_path = f'{ci_site_config.ze_testpath}' - self.fabtestconfigpath = f'{self.libfab_installpath}/share/fabtests' - - @property - def cmd(self): - return f'{self.zefabtest_script_path}/runfabtests_ze.sh ' - - def options(self, test_name): - opts = f"-p {self.fabtestpath} " - opts += f"-B {self.fabtestpath} " - opts += f"-t {test_name} " - opts += f"{self.server} {self.client} " - return opts - - @property - def execute_condn(self): - return True if (self.core_prov == 'shm') else False - - def execute_cmd(self, test_name): - curdir = os.getcwd() - os.chdir(self.fabtestconfigpath) - command = self.cmd + self.options(test_name) - outputcmd = shlex.split(command) - common.run_command(outputcmd, self.ci_logdir_path, - f'{test_name}', self.ofi_build_mode) + common.run_command(outputcmd) os.chdir(curdir) - class OMPI: def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, - server, client, environ, ci_middlewares_path, 
util_prov=None): + server, client, environ, middlewares_path, util_prov=None): - self.ompi_src = f'{ci_middlewares_path}/ompi' + self.ompi_src = f'{middlewares_path}/ompi' self.core_prov = core_prov self.hosts = hosts self.util_prov = util_prov @@ -387,13 +427,14 @@ def cmd(self): class MPICH: def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, - server, client, environ, ci_middlewares_path, util_prov=None): + server, client, environ, middlewares_path, util_prov=None): - self.mpich_src = f'{ci_middlewares_path}/mpich' + self.mpich_dir = f'{middlewares_path}/mpich_mpichtest' + self.mpichpath = f'{self.mpich_dir}/mpich_mpichsuite' self.core_prov = core_prov self.hosts = hosts self.util_prov = util_prov - self.libfab_installpath = libfab_installpath + self.libfab_installpath = f'{libfab_installpath}/libfabric_mpich' self.nw_interface = nw_interface self.server = server self.client = client @@ -410,11 +451,11 @@ def env(self): cmd += f"export FI_PROVIDER={self.core_prov}; " cmd += "export I_MPI_FABRICS=ofi; " cmd += "export MPIR_CVAR_CH4_OFI_ENABLE_ATOMICS=0; " - cmd += "export MPIR_CVAR_CH4_OFI_CAPABILITY_SETS_DEBUG=1; " - cmd += f"export LD_LIBRARY_PATH={self.mpich_src}/lib:$LD_LIBRARY_PATH; " + cmd += "export MPIR_CVAR_CH4_OFI_CAPABILITY_SETS_DEBUG=0; " + cmd += f"export LD_LIBRARY_PATH={self.mpich_dir}/lib:$LD_LIBRARY_PATH; " cmd += f"export LD_LIBRARY_PATH={self.libfab_installpath}/lib/:"\ "$LD_LIBRARY_PATH; " - cmd += f"export PATH={self.mpich_src}/bin:$PATH; " + cmd += f"export PATH={self.mpich_dir}/bin:$PATH; " cmd += f"export PATH={self.libfab_installpath}/bin:$PATH; " return cmd @@ -422,8 +463,10 @@ def env(self): def options(self): opts = f"-n {self.n} " opts += f"-ppn {self.ppn} " - opts += f"-hosts {common.get_node_name(self.server, self.nw_interface)},"\ - f"{common.get_node_name(self.client, self.nw_interface)} " + opts += "-launcher ssh " + # Removed because sbatch does this for us whenwe use mpirun + # opts += f"-hosts {common.get_node_name(self.server, self.nw_interface)},"\ + # f"{common.get_node_name(self.client, self.nw_interface)} " for key in self.environ: opts += f"-genv {key} {self.environ[key]} " @@ -431,14 +474,15 @@ def options(self): @property def cmd(self): - return f"{self.mpich_src}/bin/mpirun {self.options}" - + return f"{self.mpich_dir}/bin/mpirun {self.options}" class IMPI: def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, - server, client, environ, util_prov=None): + server, client, environ, middlewares_path, util_prov=None): - self.impi_src = ci_site_config.impi_root + self.impi_src = f'{cloudbees_config.impi_root}' + self.mpichpath = f"{middlewares_path}/impi_mpichtest/" \ + f"impi_mpichsuite/" self.core_prov = core_prov self.hosts = hosts self.util_prov = util_prov @@ -454,10 +498,15 @@ def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, def env(self): cmd = f"bash -c \'source {self.impi_src}/env/vars.sh "\ "-i_mpi_ofi_internal=0; " + cmd += f"source {cloudbees_config.intel_compiler_root}/env/vars.sh; " if (self.util_prov): cmd += f"export FI_PROVIDER={self.core_prov}\\;{self.util_prov}; " else: cmd += f"export FI_PROVIDER={self.core_prov}; " + if (self.core_prov == 'tcp'): + cmd += "export FI_IFACE=eth0; " + elif (self.core_prov == 'verbs'): + cmd += "export FI_IFACE=ib0; " cmd += "export I_MPI_FABRICS=ofi; " cmd += f"export LD_LIBRARY_PATH={self.impi_src}/lib:$LD_LIBRARY_PATH; " cmd += f"export LD_LIBRARY_PATH={self.impi_src}/lib/release:"\ @@ -485,11 +534,11 @@ def cmd(self): class 
IMBtests(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, mpitype, ofi_build_mode, user_env, run_test, test_group, + hosts, mpitype, ofi_build_mode, user_env, log_file, test_group, util_prov=None): super().__init__(jobname, buildno, testname, core_prov, - fabric, hosts, ofi_build_mode, user_env, run_test, mpitype, + fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, util_prov) self.test_group = test_group @@ -530,21 +579,22 @@ def __init__(self, jobname, buildno, testname, core_prov, fabric, self.exclude = { 'MPI1':[], 'P2P':[], - 'EXT':[], + 'EXT':[ + 'Accumulate' + ], 'IO':[], 'NBC':[], 'RMA':[ 'Accumulate', 'Get_accumulate', 'Fetch_and_op', - 'Compare_and_swap' + 'Compare_and_swap', + 'All_put_all', + 'All_get_all' ], 'MT':[] } - if (self.mpi_type == 'impi'): - self.imb_src = ci_site_config.impi_root - elif (self.mpi_type == 'ompi' or self.mpi_type == 'mpich'): - self.imb_src = f'{self.ci_middlewares_path}/{self.mpi_type}/imb' + self.imb_src = f'{self.middlewares_path}/{self.mpi_type}/imb' @property def execute_condn(self): @@ -553,7 +603,7 @@ def execute_condn(self): def imb_cmd(self, imb_test): print(f"Running IMB-{imb_test}") - cmd = f"{self.imb_src}/bin/IMB-{imb_test} " + cmd = f"{self.imb_src}/IMB-{imb_test} " if (imb_test != 'MT'): cmd += f"-iter {self.iter} " @@ -569,18 +619,16 @@ def execute_cmd(self): for test_type in self.imb_tests[self.test_group]: outputcmd = shlex.split(self.mpi.env + self.mpi.cmd + \ self.imb_cmd(test_type) + '\'') - common.run_command(outputcmd, self.ci_logdir_path, - f'{self.mpi_type}_{self.run_test}', - self.ofi_build_mode) + common.run_command(outputcmd) class OSUtests(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, mpitype, ofi_build_mode, user_env, run_test, util_prov=None): + hosts, mpitype, ofi_build_mode, user_env, log_file, util_prov=None): super().__init__(jobname, buildno, testname, core_prov, - fabric, hosts, ofi_build_mode, user_env, run_test, mpitype, + fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, util_prov) self.n_ppn = { @@ -589,7 +637,7 @@ def __init__(self, jobname, buildno, testname, core_prov, fabric, 'one-sided': (2, 1), 'startup': (2, 1) } - self.osu_src = f'{self.ci_middlewares_path}/{mpitype}/osu/libexec/'\ + self.osu_src = f'{self.middlewares_path}/{mpitype}/osu/libexec/'\ 'osu-micro-benchmarks/mpi/' self.mpi_type = mpitype @@ -620,9 +668,7 @@ def execute_cmd(self): osu_command = self.osu_cmd(os.path.basename(root), test) outputcmd = shlex.split(self.mpi.env + self.mpi.cmd + \ osu_command + '\'') - common.run_command(outputcmd, self.ci_logdir_path, - f'{self.mpi_type}_{self.run_test}', - self.ofi_build_mode) + common.run_command(outputcmd) if (test == 'osu_latency_mp' and self.core_prov == 'verbs'): self.env.pop('IBV_FORK_SAFE') @@ -631,193 +677,262 @@ def execute_cmd(self): class MpichTestSuite(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, mpitype, ofi_build_mode, user_env, run_test, util_prov=None): + hosts, mpitype, ofi_build_mode, user_env, log_file, util_prov=None, weekly=None): super().__init__(jobname, buildno, testname, core_prov, - fabric, hosts, ofi_build_mode, user_env, run_test, mpitype, + fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, util_prov) - - self.mpichsuitepath = f'{self.ci_middlewares_path}/{mpitype}/'\ - 'mpichsuite/test/mpi/' - self.pwd = os.getcwd() self.mpi_type = mpitype + if (mpitype != 'ompi'): + self.mpichsuitepath = f'{self.mpi.mpichpath}/test/mpi/' + self.pwd = 
os.getcwd() + self.weekly = weekly + self.mpichtests_exclude = { + 'tcp' : { '.' : [('spawn','dir')], + 'rma' : [('win_shared_put_flush_load 3', 'test')], + 'threads' : [('spawn','dir')], + 'threads/comm' : [('idup_nb 4','test'), + ('idup_comm_gen 4','test')], + 'errors' : [('spawn','dir')] + }, + 'verbs' : { '.' : [('spawn','dir')], + 'threads/comm' : [('idup_nb 4','test')], + 'threads' : [('spawn','dir'), ('rma','dir')], + 'pt2pt' : [('sendrecv3 2','test'), + ('sendrecv3 2 arg=-isendrecv','test')], + 'threads/pt2pt': [(f"mt_improbe_sendrecv_huge 2 " + f"arg=-iter=64 arg=-count=4194304 " + f"env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE" + f"=16384", 'test')] + } + } - def testgroup(self, testgroupname): - testpath = f'{self.mpichsuitepath}/{testgroupname}' - tests = [] - with open(f'{testpath}/testlist') as file: - for line in file: - if(line[0] != '#' and line[0] != '\n'): - tests.append((line.rstrip('\n')).split(' ')) - - return tests - - def set_options(self, nprocs, timeout=None): - self.mpi.n = nprocs - if (timeout != None): - os.environ['MPIEXEC_TIMEOUT']=timeout - + def create_hostfile(self, file, hostlist): + with open(file, "w") as f: + for host in hostlist: + f.write(f"{host}\n") + + def update_testlists(self, filename, category): + with open(filename, 'r') as file: + lines = file.read().splitlines() + for line in lines: + if (line == category): + lines[lines.index(line)] = f'#{line}' + else: + continue + with open(filename, 'w') as file: + file.write('\n'.join(lines)) + + def exclude_tests(self, test_root, provider): + for path,exclude_list in self.mpichtests_exclude[f'{provider}'].items(): + for item in exclude_list: + self.update_testlists(f'{test_root}/{path}/testlist', item[0]) + if (item[1] == 'dir'): + filename = f'{test_root}/{path}/{item[0]}/testlist' + with open(filename,'r') as file: + for line in file: + line = line.strip() + if (not line.startswith('#')): + print(f'excluding:{path}/{item[0]}:{line}') + else: #item[1]=test + print(f'excluding:{path}/{item[0]}') @property def execute_condn(self): - return (self.mpi_type == 'impi' or \ - (self.mpi_type == 'mpich' and self.core_prov == 'verbs')) - - def execute_cmd(self, testgroupname): - print("Running Tests: " + testgroupname) - tests = [] - time = None - os.chdir(f'{self.mpichsuitepath}/{testgroupname}') - tests = self.testgroup(testgroupname) - for test in tests: - testname = test[0] - nprocs = test[1] - args = test[2:] - for item in args: - itemlist = item.split('=') - if (itemlist[0] == 'timelimit'): - time = itemlist[1] - self.set_options(nprocs, timeout=time) - testcmd = f'./{testname}' - outputcmd = shlex.split(self.mpi.env + self.mpi.cmd + testcmd + '\'') - if self.util_prov: - util_prov = self.util_prov.strip('ofi_') - log_file_name = f'{self.core_prov}-{util_prov}_' \ - f'{self.mpi_type}_{self.run_test}' - else: - log_file_name = f'{self.core_prov}_{self.mpi_type}_{self.run_test}' - - common.run_command(outputcmd, self.ci_logdir_path, log_file_name, - self.ofi_build_mode) - os.chdir(self.pwd) + return ((self.mpi_type == 'impi' and self.weekly) or \ + self.mpi_type == 'mpich') + def execute_cmd(self): + if (self.mpi_type == 'mpich'): + configure_cmd = f"./configure --with-mpi={self.mpi.mpich_dir} " + if (self.weekly): + print(f'Weekly {self.mpi_type} mpichsuite tests') + os.chdir(self.mpichsuitepath) + common.run_command(shlex.split(self.mpi.env + + configure_cmd + '\'')) + self.exclude_tests(self.mpichsuitepath, self.core_prov) + testcmd = 'make testing' + outputcmd = shlex.split(self.mpi.env + testcmd + '\'') + 
common.run_command(outputcmd) + common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ + f"summary.tap")) + os.chdir(self.pwd) + else: + print(f"PR {self.mpi_type} mpichsuite tests") + os.chdir(self.mpichsuitepath) + common.run_command(shlex.split(self.mpi.env + + configure_cmd + '\'')) + common.run_command(['make', '-j']) + self.exclude_tests(self.mpichsuitepath, self.core_prov) + testcmd = "./runtests -tests=testlist " + testcmd += f" -xmlfile=summary.xml -tapfile=summary.tap " \ + f"-junitfile=summary.junit.xml " + common.run_command(shlex.split(self.mpi.env + testcmd + '\'')) + common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ + f"summary.tap")) + os.chdir(self.pwd) + if (self.mpi_type == 'impi' and self.weekly == True): + print (f'Weekly {self.mpi_type} mpichsuite tests') + os.chdir(self.mpi.mpichpath) + print(self.hosts) + self.create_hostfile(f'{self.mpi.mpichpath}/hostfile', + self.hosts) + os.environ["I_MPI_HYDRA_HOST_FILE"] = \ + f'{self.mpi.mpichpath}/hostfile' + test_cmd = f"export I_MPI_HYDRA_HOST_FILE=" \ + f"{self.mpi.mpichpath}/hostfile; " + test_cmd += f"./test.sh --exclude lin,{self.core_prov},*,*,*,*; " + common.run_command(shlex.split(self.mpi.env + test_cmd + '\'')) + common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ + f"summary.tap")) + os.chdir(self.pwd) class OneCCLTests(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, util_prov=None): + hosts, ofi_build_mode, user_env, log_file, util_prov=None): super().__init__(jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, None, util_prov) + hosts, ofi_build_mode, user_env, log_file, None, util_prov) - self.n = 2 - self.ppn = 1 - self.oneccl_path = f'{self.ci_middlewares_path}/oneccl/build' - - self.examples_tests = { - 'allgatherv', - 'allreduce', - 'alltoallv', - 'broadcast', - 'communicator', - 'cpu_allgatherv_test', - 'cpu_allreduce_bf16_test', - 'cpu_allreduce_test', - 'custom_allreduce', - 'datatype', - 'external_kvs', - 'priority_allreduce', - 'reduce', - 'reduce_scatter', - 'unordered_allreduce' - } - self.functional_tests = { - 'allgatherv_test', - 'allreduce_test', - 'alltoall_test', - 'alltoallv_test', - 'bcast_test', - 'reduce_scatter_test', - 'reduce_test' - } + self.oneccl_path = f'{self.middlewares_path}/oneccl/' + self.test_dir = f'{self.middlewares_path}/oneccl/ci_tests' + if self.util_prov: + self.prov = f"{self.core_prov}\;{self.util_prov}" + else: + self.prov = self.core_prov + self.oneccl_environ = { + 'FI_PROVIDER' : f"\"{self.prov}\"", + 'CCL_ATL_TRANSPORT' : 'ofi', + 'CCL_ATL_TRANSPORT_LIST' : 'ofi' + } + + self.ld_library = [ + f'{self.libfab_installpath}/lib', + f'{self.oneccl_path}/build/_install/lib' + ] + + def export_env(self): + environ = f"source {cloudbees_config.oneapi_root}/setvars.sh; " + environ += f"source {self.oneccl_path}/build/_install/env/vars.sh; " + if self.core_prov == 'psm3': + self.oneccl_environ['PSM3_MULTI_EP'] = '1' + + for key, val in self.oneccl_environ.items(): + environ += f"export {key}={val}; " + + ld_library_path = 'LD_LIBRARY_PATH=' + for item in self.ld_library: + ld_library_path += f'{item}:' + + environ += f"export {ld_library_path}$LD_LIBRARY_PATH; " + return environ - @property def cmd(self): - return f"{ci_site_config.testpath}/run_oneccl.sh " + return './run.sh ' - def options(self, oneccl_test): - opts = f"-n {self.n} " - opts += f"-ppn {self.ppn} " - opts += f"-hosts {self.server},{self.client} " - opts += f"-prov 
'{self.core_prov}' " - opts += f"-test {oneccl_test} " - opts += f"-libfabric_path={self.libfab_installpath}/lib " - opts += f'-oneccl_root={self.oneccl_path}' + def options(self): + opts = "--mode cpu " return opts @property def execute_condn(self): return True + @property + def execute_condn(self): + return True - def execute_cmd(self, oneccl_test): - if oneccl_test == 'examples': - for test in self.examples_tests: - command = self.cmd + self.options(oneccl_test) + \ - f" {test}" - outputcmd = shlex.split(command) - common.run_command(outputcmd, self.ci_logdir_path, self.run_test, - self.ofi_build_mode) - elif oneccl_test == 'functional': - for test in self.functional_tests: - command = self.cmd + self.options(oneccl_test) + \ - f" {test}" - outputcmd = shlex.split(command) - common.run_command(outputcmd, self.ci_logdir_path, self.run_test, - self.ofi_build_mode) + def execute_cmd(self): + curr_dir = os.getcwd() + os.chdir(self.test_dir) + command = f"bash -c \'{self.export_env()} {self.cmd()} "\ + f"{self.options()}\'" + outputcmd = shlex.split(command) + common.run_command(outputcmd) + os.chdir(curr_dir) class OneCCLTestsGPU(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, util_prov=None): + hosts, ofi_build_mode, user_env, log_file, util_prov=None): super().__init__(jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, None, util_prov) + hosts, ofi_build_mode, user_env, log_file, None, util_prov) self.n = 2 - self.ppn = 4 - self.oneccl_path = f'{self.ci_middlewares_path}/oneccl_gpu/build' - - self.examples_tests = { - 'sycl_allgatherv_custom_usm_test', - 'sycl_allgatherv_inplace_test', - 'sycl_allgatherv_inplace_usm_test', - 'sycl_allgatherv_test', - 'sycl_allgatherv_usm_test', - 'sycl_allreduce_inplace_usm_test', - 'sycl_allreduce_test', - 'sycl_allreduce_usm_test', - 'sycl_alltoall_test', - 'sycl_alltoall_usm_test', - 'sycl_alltoallv_test', - 'sycl_alltoallv_usm_test', - 'sycl_broadcast_test', - 'sycl_broadcast_usm_test', - 'sycl_reduce_inplace_usm_test', - 'sycl_reduce_scatter_test', - 'sycl_reduce_scatter_usm_test', - 'sycl_reduce_test', - 'sycl_reduce_usm_test' - } - self.functional_tests = { - 'allgatherv_test', - 'alltoall_test', - 'alltoallv_test', - 'bcast_test', - 'reduce_scatter_test', - 'reduce_test' - } + self.ppn = 1 + self.oneccl_path = f'{self.middlewares_path}/oneccl_gpu/build' + if self.util_prov: + self.prov = f"{self.core_prov}\;{self.util_prov}" + else: + self.prov = self.core_prov + + self.onecclgpu_environ = { + 'FI_PROVIDER' : self.prov, + # 'LD_PRELOAD' : f"{self.libfab_installpath}/lib/libfabric.so", + 'CCL_ATL_TRANSPORT' : 'ofi', + 'CCL_ROOT' : f"{self.oneccl_path}/_install" + } + + self.ld_library = [ + f'{self.libfab_installpath}/lib', + '$LD_LIBRARY_PATH', + f'{self.oneccl_path}/_install/lib' + ] + + self.tests = { + 'examples' : [ + 'sycl_allgatherv_custom_usm_test', + 'sycl_allgatherv_inplace_test', + 'sycl_allgatherv_inplace_usm_test', + 'sycl_allgatherv_test', + 'sycl_allgatherv_usm_test', + 'sycl_allreduce_inplace_usm_test', + 'sycl_allreduce_test', + 'sycl_allreduce_usm_test', + 'sycl_alltoall_test', + 'sycl_alltoall_usm_test', + 'sycl_alltoallv_test', + 'sycl_alltoallv_usm_test', + 'sycl_broadcast_test', + 'sycl_broadcast_usm_test', + 'sycl_reduce_inplace_usm_test', + 'sycl_reduce_scatter_test', + 'sycl_reduce_scatter_usm_test', + 'sycl_reduce_test', + 'sycl_reduce_usm_test' + ], + 'functional' : [ + 'allgatherv_test', + 'alltoall_test', + 
'alltoallv_test', + 'bcast_test', + 'reduce_scatter_test', + 'reduce_test' + ] + } + + def export_env(self): + environ = f"source {cloudbees_config.impi_root}/env/vars.sh "\ + "-i_mpi_internal=0; " + environ += f"source {cloudbees_config.intel_compiler_root}/env/vars.sh; " + for key, val in self.onecclgpu_environ.items(): + environ += f"export {key}={val}; " + + ld_library_path = 'LD_LIBRARY_PATH=' + for item in self.ld_library: + ld_library_path += f'{item}:' + + environ += f"export {ld_library_path}$LD_LIBRARY_PATH; " + return environ - @property def cmd(self): - return f"{ci_site_config.testpath}/run_oneccl_gpu.sh " + return f"{self.oneccl_path}/_install/bin/mpiexec " - def options(self, oneccl_test_gpu): - opts = f"-n {self.n} " + def options(self): + opts = "-l " + opts += f"-n {self.n} " opts += f"-ppn {self.ppn} " opts += f"-hosts {self.server},{self.client} " - opts += f"-prov '{self.core_prov}' " - opts += f"-test {oneccl_test_gpu} " - opts += f"-libfabric_path={self.libfab_installpath}/lib " - opts += f'-oneccl_root={self.oneccl_path}' return opts @property @@ -826,79 +941,84 @@ def execute_condn(self): def execute_cmd(self, oneccl_test_gpu): - if oneccl_test_gpu == 'examples': - for test in self.examples_tests: - command = self.cmd + self.options(oneccl_test_gpu) + \ - f" {test}" - outputcmd = shlex.split(command) - common.run_command(outputcmd, self.ci_logdir_path, - self.run_test, self.ofi_build_mode) - elif oneccl_test_gpu == 'functional': - for test in self.functional_tests: - command = self.cmd + self.options(oneccl_test_gpu) + \ - f" {test}" - outputcmd = shlex.split(command) - common.run_command(outputcmd, self.ci_logdir_path, - self.run_test, self.ofi_build_mode) + curr_dir = os.getcwd() + if 'examples' in oneccl_test_gpu: + os.chdir(f"{self.oneccl_path}/_install/examples/sycl") + else: + os.chdir(f"{self.oneccl_path}/tests/functional") + + for test in self.tests[oneccl_test_gpu]: + if '_usm_' in test: + gpu_selector = 'device' + else: + gpu_selector = 'default' + + command = f"bash -c \'{self.export_env()} {self.cmd()} "\ + f"{self.options()} ./{test} " + if 'examples' in oneccl_test_gpu: + command += f"gpu {gpu_selector}" + command += "\'" + + outputcmd = shlex.split(command) + common.run_command(outputcmd) + os.chdir(curr_dir) class DaosCartTest(Test): def __init__(self, jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, util_prov=None): + hosts, ofi_build_mode, user_env, log_file, util_prov=None): super().__init__(jobname, buildno, testname, core_prov, fabric, - hosts, ofi_build_mode, user_env, run_test, None, util_prov) + hosts, ofi_build_mode, user_env, log_file, None, util_prov) + - self.set_paths() - self.set_environment(core_prov,util_prov) + self.set_paths(core_prov) print(core_prov) - self.daos_nodes = ci_site_config.prov_node_map[core_prov] + self.daos_nodes = cloudbees_config.prov_node_map[core_prov] print(self.daos_nodes) + self.launch_node = self.daos_nodes[0] self.cart_tests = { 'corpc_one_node' : {'tags' :'cart,corpc,one_node', 'numservers':1, 'numclients':0}, 'corpc_two_node' : {'tags' :'cart,corpc,two_node', 'numservers':2, 'numclients':0}, 'ctl_one_node' : {'tags' :'cart,ctl,one_node', 'numservers':1, 'numclients':1}, -# 'ghost_rank_rpc_one_node' : {'tags' :'cart,ghost_rank_rpc,one_node', 'numservers':1, 'numclients':0}, + 'ghost_rank_rpc_one_node' : {'tags' :'cart,ghost_rank_rpc,one_node', 'numservers':1, 'numclients':0}, 'group_test' : {'tags' :'cart,group_test,one_node', 'numservers':1, 'numclients':0}, 
'iv_one_node'         :       {'tags' :'cart,iv,one_node',                   'numservers':1, 'numclients':1},
             'iv_two_node'         :       {'tags' :'cart,iv,two_node',                   'numservers':2, 'numclients':1},
             'launcher_one_node'   :       {'tags' :'cart,no_pmix_launcher,one_node','numservers':1, 'numclients':1},
-#            'multictx_one_node'   :       {'tags' :'cart,no_pmix,one_node',              'numservers':1, 'numclients':0},
+            'multictx_one_node'   :       {'tags' :'cart,no_pmix,one_node',              'numservers':1, 'numclients':0},
             'rpc_one_node'        :       {'tags' :'cart,rpc,one_node',                  'numservers':1, 'numclients':1},
             'rpc_two_node'        :       {'tags' :'cart,rpc,two_node','numservers':2, 'numclients':1},
             'swim_notification'   :       {'tags' :'cart,rpc,swim_rank_eviction,one_node', 'numservers':1, 'numclients':1}
         }
 
-    def set_paths(self):
-        self.ci_middlewares_path = f'{ci_site_config.ci_middlewares}'
+    def set_paths(self, core_prov):
+        self.ci_middlewares_path = f'{cloudbees_config.build_dir}/{core_prov}'
         self.daos_install_root = f'{self.ci_middlewares_path}/daos/install'
         self.cart_test_scripts = f'{self.daos_install_root}/lib/daos/TESTING/ftest'
-        self.mpipath = f'{ci_site_config.daos_mpi}/bin'
+        self.mpipath = f'{cloudbees_config.daos_mpi}/bin'
         self.pathlist = [f'{self.daos_install_root}/bin/', self.cart_test_scripts, self.mpipath, \
                          f'{self.daos_install_root}/lib/daos/TESTING/tests']
         self.daos_prereq = f'{self.daos_install_root}/prereq'
+        common.run_command(['rm', '-rf', f'{self.ci_middlewares_path}/daos_logs/*'])
         common.run_command(['rm','-rf', f'{self.daos_prereq}/debug/ofi'])
         common.run_command(['ln', '-sfn', self.libfab_installpath, f'{self.daos_prereq}/debug/ofi'])
 
-    def set_environment(self, core_prov, util_prov):
-        prov_name = f'ofi+{core_prov}'
-        if util_prov:
-            prov_name = f'{prov_name};ofi_{util_prov}'
-        if (core_prov == 'verbs'):
-            os.environ["OFI_DOMAIN"] = 'mlx5_0'
-        else:
-            os.environ["OFI_DOMAIN"] = 'ib0'
-        os.environ["OFI_INTERFACE"] = 'ib0'
-        os.environ["CRT_PHY_ADDR_STR"] = prov_name
-        os.environ["PATH"] += os.pathsep + os.pathsep.join(self.pathlist)
-        os.environ["DAOS_TEST_SHARED_DIR"] = ci_site_config.daos_share
-        os.environ["DAOS_TEST_LOG_DIR"] = ci_site_config.daos_logs
-        os.environ["LD_LIBRARY_PATH"] = f'{self.ci_middlewares_path}/daos/install/lib64:{self.mpipath}'
-
     @property
     def cmd(self):
-        return "./launch.py "
+        return f"env; echo {common.cloudbees_log_start_string}; "\
+               "python3.6 launch.py "
+
+    def remote_launch_cmd(self, testname):
+
+# The following env variables must be set appropriately, via the script
+# sourced below, prior to running the daos/cart tests: OFI_DOMAIN,
+# OFI_INTERFACE, CRT_PHY_ADDR_STR, PATH, DAOS_TEST_SHARED_DIR,
+# DAOS_TEST_LOG_DIR, and LD_LIBRARY_PATH.
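+# As an illustrative sketch only (values taken from the set_environment()
+# method removed above; the actual exports live in daos_ci_env_setup.sh),
+# that script is expected to set something along these lines:
+#   export OFI_DOMAIN=mlx5_0            # 'ib0' for non-verbs providers
+#   export OFI_INTERFACE=ib0
+#   export CRT_PHY_ADDR_STR=ofi+verbs
+#   export PATH=<daos bin/test script/mpi paths>:$PATH
+#   export DAOS_TEST_SHARED_DIR=<shared dir>
+#   export DAOS_TEST_LOG_DIR=<log dir>
+#   export LD_LIBRARY_PATH=<daos install lib64>:<mpi lib path>:$LD_LIBRARY_PATH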
+ launch_cmd = f"ssh {self.launch_node} \"source {self.ci_middlewares_path}/daos_ci_env_setup.sh && \ + cd {self.cart_test_scripts} &&\" " + return launch_cmd def options(self, testname): opts = "-s " @@ -918,18 +1038,104 @@ def execute_condn(self): def execute_cmd(self): sys.path.append(f'{self.daos_install_root}/lib64/python3.6/site-packages') os.environ['PYTHONPATH']=f'{self.daos_install_root}/lib64/python3.6/site-packages' - print("PATH:" + os.environ["PATH"]) - print("LD_LIBRARY_PATH:" + os.environ["LD_LIBRARY_PATH"]) - print("MODULEPATH:" + os.environ["MODULEPATH"]) test_dir=self.cart_test_scripts curdir=os.getcwd() os.chdir(test_dir) for test in self.cart_tests: print(test) - command = self.cmd + self.options(test) + command = self.remote_launch_cmd(test) + self.cmd + self.options(test) outputcmd = shlex.split(command) - common.run_command(outputcmd, self.ci_logdir_path, - self.run_test, self.ofi_build_mode) + common.run_logging_command(outputcmd, self.log_file) print("--------------------TEST COMPLETED----------------------") os.chdir(curdir) + +class DMABUFTest(Test): + + def __init__(self, jobname, buildno, testname, core_prov, fabric, + hosts, ofi_build_mode, user_env, log_file, util_prov=None): + + super().__init__(jobname, buildno, testname, core_prov, fabric, + hosts, ofi_build_mode, user_env, log_file, + None, util_prov) + self.DMABUFtestpath = f'{self.libfab_installpath}/bin' + self.timeout = 300 + self.n = os.environ['SLURM_NNODES'] if 'SLURM_NNODES' \ + in os.environ.keys() \ + else 0 + + if util_prov: + self.prov = f'{self.core_prov}\;{self.util_prov}' + else: + self.prov = self.core_prov + + self.dmabuf_environ = { + 'ZEX_NUMBER_OF_CCS' : '0:4,1:4', + 'NEOReadDebugKeys' : '1', + 'EnableImplicitScaling' : '0', + 'MLX5_SCATTER_TO_CQE' : '0' + } + + self.tests = { + 'H2H' : [ + 'write', + 'read', + 'send' + ], + 'H2D' : [ + 'write', + 'read', + 'send' + ], + 'D2H' : [ + 'write', + 'read', + 'send' + ], + 'D2D' : [ + 'write', + 'read', + 'send' + ] + } + + @property + def execute_condn(self): + return True if (self.core_prov == 'verbs') \ + else False + + @property + def cmd(self): + return f"{self.DMABUFtestpath}/fi-rdmabw-xe" + + def dmabuf_env(self): + return ' '.join([f"{key}={self.dmabuf_environ[key]}" \ + for key in self.dmabuf_environ]) + + def execute_cmd(self, test_type): + os.chdir(self.DMABUFtestpath) + base_cmd = '' + log_prefix = f"{os.environ['LOG_DIR']}/dmabuf_{self.n}" + if 'H2H' in test_type or 'D2H' in test_type: + base_cmd = f"{self.cmd} -m malloc -p {self.core_prov}" + else: + base_cmd = f"{self.cmd} -m device -d 0 -p {self.core_prov}" + + for test in self.tests[test_type]: + client_command = f"{base_cmd} -t {test} {self.server}" + if 'send' in test: + server_command = f"{base_cmd} -t {test} " + else: + server_command = f"{base_cmd} " + RC = common.ClientServerTest( + f"ssh {self.server} {self.dmabuf_env()} {server_command}", + f"ssh {self.client} {self.dmabuf_env()} {client_command}", + f"{log_prefix}_server.log", f"{log_prefix}_client.log", + self.timeout + ).run() + + if RC == (0, 0): + print("------------------ TEST COMPLETED -------------------") + else: + print("------------------ TEST FAILED -------------------") + sys.exit(f"Exiting with returncode: {RC}")
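+
+# Illustrative example only (hypothetical hosts; composed from execute_cmd()
+# above): with core_prov='verbs', a D2D 'write' iteration runs roughly
+#   server: ssh <server> ZEX_NUMBER_OF_CCS=0:4,1:4 NEOReadDebugKeys=1 \
+#               EnableImplicitScaling=0 MLX5_SCATTER_TO_CQE=0 \
+#               <libfab_installpath>/bin/fi-rdmabw-xe -m device -d 0 -p verbs
+#   client: ssh <client> <same environment> \
+#               <libfab_installpath>/bin/fi-rdmabw-xe -m device -d 0 -p verbs \
+#               -t write <server>
+# with output captured in $LOG_DIR/dmabuf_<n>_server.log and
+# $LOG_DIR/dmabuf_<n>_client.log ('send' tests also pass -t on the server side).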