Skip to content

Commit

Permalink
intel/ci: debug mpich osu failing in CI
Browse files: browse the repository at this point in the history
Signed-off-by: Nikhil Nanal
  • Loading branch information
nikhilnanal committed Sep 19, 2023
1 parent 13f28f7 commit f4aa232
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 361 deletions.
333 changes: 12 additions & 321 deletions contrib/intel/jenkins/Jenkinsfile
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,7 +3,7 @@ import groovy.transform.Field
properties([disableConcurrentBuilds(abortPrevious: true)])
@Field def DO_RUN=true
@Field def TARGET="main"
@Field def SCRIPT_LOCATION="py_scripts/contrib/intel/jenkins"
@Field def SCRIPT_LOCATION="contrib/intel/jenkins"
@Field def RELEASE=false
@Field def BUILD_MODES=["reg", "dbg", "dl"]
@Field def MPI_TYPES=["impi", "mpich", "ompi"]
Expand Down Expand Up @@ -305,44 +305,12 @@ pipeline {
"${env.LOG_DIR}/libfabric_mpich_log",
"""python$PYTHON_VERSION ${RUN_LOCATION}/build.py \
--build_item=libfabric_mpich """
)
}
}
}
}
stage ('build-daos') {
agent {
node {
label 'daos_head'
customWorkspace CUSTOM_WORKSPACE
}
}
steps {
script {
checkout_py_scripts()
dir (CUSTOM_WORKSPACE) {
build("logdir")
build("libfabric", "reg", "daos")
build("fabtests", "reg")
}
}
}
}
stage ('build-gpu') {
agent {
node {
label 'ze'
customWorkspace CUSTOM_WORKSPACE
}
}
steps {
script {
checkout_py_scripts()
dir (CUSTOM_WORKSPACE) {
build("logdir")
build("builddir")
build("libfabric", "reg", "gpu")
build("fabtests", "reg")
)
slurm_batch("squirtle,totodile", "1",
"${env.LOG_DIR}/build_mpich_log",
"""python$PYTHON_VERSION ${RUN_LOCATION}/build.py \
--build_item=mpich """
)
}
}
}
Expand All @@ -352,22 +320,6 @@ pipeline {
stage('parallel-tests') {
when { equals expected: true, actual: DO_RUN }
parallel {
stage('MPI_verbs-rxm_IMB') {
steps {
script {
dir (RUN_LOCATION) {
def providers = [["verbs", "rxm"]]
for (mpi in MPI_TYPES) {
for (imb_grp = 1; imb_grp < 4; imb_grp++) {
run_middleware(providers, "MPI", "IMB",
"squirtle,totodile", "2", "${mpi}",
"${imb_grp}")
}
}
}
}
}
}
stage('MPI_verbs-rxm_OSU') {
steps {
script {
Expand All @@ -389,105 +341,12 @@ pipeline {
for (mpi in MPI_TYPES) {
for (imb_grp = 1; imb_grp < 4; imb_grp++) {
run_middleware(providers, "MPI", "IMB",
"bulbasaur", "2", "${mpi}", "${imb_grp}")
"bulbasaur", "2", "${mpi}", "${imb_grp}")
}
run_middleware(providers, "MPI", "osu", "bulbasaur", "2",
"${mpi}")
}
}
}
}
}
stage('tcp') {
steps {
script {
dir (RUN_LOCATION) {
run_fabtests("tcp", "bulbasaur", "2", "tcp")
}
}
}
}
stage('verbs-rxm') {
steps {
script {
dir (RUN_LOCATION) {
run_fabtests("verbs-rxm", "squirtle,totodile", "2", "verbs",
"rxm")
run_fabtests("verbs-rxm", "squirtle,totodile", "2", "verbs",
"rxm", "FI_MR_CACHE_MAX_COUNT=0")
run_fabtests("verbs-rxm", "squirtle,totodile", "2", "verbs",
"rxm", "FI_MR_CACHE_MONITOR=userfaultfd")
}
}
}
}
stage('verbs-rxd') {
steps {
script {
dir (RUN_LOCATION) {
run_fabtests("verbs-rxd", "squirtle", "2", "verbs",
"rxd")
run_fabtests("verbs-rxd", "squirtle", "2", "verbs",
"rxd", "FI_MR_CACHE_MAX_COUNT=0")
run_fabtests("verbs-rxd", "squirtle", "2", "verbs",
"rxd", "FI_MR_CACHE_MONITOR=userfaultfd")
}
}
}
}
stage('udp') {
steps {
script {
dir (RUN_LOCATION) {
run_fabtests("udp", "bulbasaur", "2", "udp")
}
}
}
}
stage('shm') {
steps {
script {
dir (RUN_LOCATION) {
run_fabtests("shm", "bulbasaur", "1", "shm")
run_fabtests("shm", "bulbasaur", "1", "shm", null,
"FI_SHM_DISABLE_CMA=1")
}
}
}
}
stage('sockets') {
steps {
script {
dir (RUN_LOCATION) {
run_fabtests("sockets", "bulbasaur", "2", "sockets")
}
}
}
}
stage('ucx') {
steps {
script {
dir (CUSTOM_WORKSPACE) {
for (mode in BUILD_MODES) {
echo "Building Libfabric $mode"
build("libfabric", "${mode}", null, false, "--ucx")
echo "Building Fabtests $mode"
build("fabtests", "${mode}", null, false, "--ucx")
"${mpi}")
}
}
dir (RUN_LOCATION) {
run_fabtests("ucx", "totodile", "2", "ucx")
}
}
}
}
stage('psm3') {
steps {
script {
dir (RUN_LOCATION) {
run_fabtests("psm3", "squirtle", "2", "psm3", null,
"PSM3_IDENTIFY=1")
}
}
}
}
Expand All @@ -498,180 +357,12 @@ pipeline {
def providers = [['tcp', null], ["verbs","rxm"]]
for (mpi in MPI_TYPES) {
run_middleware(providers, "mpichtestsuite", "mpichtestsuite",
"squirtle,totodile", "2", "${mpi}")
}
}
}
}
}
stage('SHMEM') {
steps {
script {
dir (RUN_LOCATION) {
run_middleware([["verbs", null], ["tcp", null],
["sockets", null]], "SHMEM", "shmem",
"squirtle,totodile", "2")
}
}
}
}
stage ('multinode_performance') {
steps {
script {
dir (RUN_LOCATION) {
run_middleware([["tcp", null]], "multinode_performance",
"multinode", "bulbasaur", "2")
}
}
}
}
stage ('oneCCL') {
steps {
script {
dir (RUN_LOCATION) {
run_middleware([["tcp", "rxm"]/*, ["psm3", null]*/], "oneCCL",
"oneccl", "bulbasaur", "2")
}
}
}
}
stage ('oneCCL-GPU') {
steps {
script {
dir (RUN_LOCATION) {
run_middleware([["verbs", "rxm"]], "oneCCL-GPU", "onecclgpu",
"charmander", "2")
}
}
}
}
stage ('oneCCL-GPU-v3') {
agent { node { label 'ze' } }
options { skipDefaultCheckout() }
steps {
script {
dir (RUN_LOCATION) {
run_middleware([["verbs", "rxm"]], "oneCCL-GPU-v3", "onecclgpu",
"fabrics-ci", "2")
}
}
}
}
stage('daos_tcp') {
agent { node { label 'daos_tcp' } }
options { skipDefaultCheckout() }
steps {
script {
dir (RUN_LOCATION) {
run_python(PYTHON_VERSION,
"""runtests.py --prov='tcp' --util='rxm' \
--test=daos \
--log_file=${env.LOG_DIR}/daos_tcp-rxm_reg""")
}
}
}
}
stage('daos_verbs') {
agent { node { label 'daos_verbs' } }
options { skipDefaultCheckout() }
steps {
script {
dir (RUN_LOCATION) {
run_python(PYTHON_VERSION,
"""runtests.py --prov='verbs' --util='rxm' \
--test=daos \
--log_file=${env.LOG_DIR}/daos_verbs-rxm_reg""")
}
}
}
}
stage ('DMABUF-Tests') {
agent { node { label 'ze' } }
options { skipDefaultCheckout() }
steps {
script {
dir ("${env.WORKSPACE}/${SCRIPT_LOCATION}/") {
dmabuf_output = "${LOG_DIR}/DMABUF-Tests_verbs-rxm_dmabuf_reg"
cmd = """ python3.9 runtests.py --test=dmabuf \
--prov=verbs --util=rxm"""
slurm_batch("fabrics-ci", "1", "${dmabuf_output}", "${cmd}")
slurm_batch("fabrics-ci", "2", "${dmabuf_output}", "${cmd}")
}
}
}
}
stage ('ze-shm') {
steps {
script {
dir (RUN_LOCATION) {
def providers = [["shm", null]]
def directions = ["h2d", "d2d", "xd2d"]
def base_cmd = "python3.9 runtests.py --device=ze"
def prefix = "${env.LOG_DIR}/ze_"
def suffix = "_reg"
for (prov in providers) {
for (way in directions) {
if (prov[1]) {
echo "Running ${prov[0]}-${prov[1]} ze"
slurm_batch("charmander", "1",
"${prefix}${prov[0]}-${prov[1]}_${way}${suffix}",
"""${base_cmd} --prov=${prov[0]} \
--util=${prov[1]} --way=${way}""")
} else {
echo "Running ${prov[0]} ze"
slurm_batch("charmander", "1",
"${prefix}${prov[0]}_${way}${suffix}",
"${base_cmd} --prov=${prov[0]} --way=${way}")
}
}
"squirtle,totodile", "2", "${mpi}")
}
}
}
}
}
stage ('ze-shm-v3') {
agent { node { label 'ze' } }
options { skipDefaultCheckout() }
steps {
script {
dir (RUN_LOCATION) {
def providers = [["shm", null]]
def directions = ["h2d", "d2d", "xd2d"]
def base_cmd = "python3.9 runtests.py --device=ze"
def prefix = "${env.LOG_DIR}/ze_v3_"
def suffix = "_reg"
for (prov in providers) {
for (way in directions) {
if (prov[1]) {
echo "Running ${prov[0]}-${prov[1]} ze"
slurm_batch("fabrics-ci", "1",
"${prefix}${prov[0]}-${prov[1]}_${way}${suffix}",
"""${base_cmd} --prov=${prov[0]} \
--util=${prov[1]} --way=${way}""")
} else {
echo "Running ${prov[0]} ze"
slurm_batch("fabrics-ci", "1",
"${prefix}${prov[0]}_${way}${suffix}",
"${base_cmd} --prov=${prov[0]} --way=${way}")
}
}
}
}
}
}
}
stage('dsa') {
when { equals expected: true, actual: DO_RUN }
steps {
script {
dir (RUN_LOCATION) {
run_fabtests("shm_dsa", "mudkip", "1", "shm", null,
"""FI_SHM_DISABLE_CMA=1 FI_SHM_USE_DSA_SAR=1 \
FI_LOG_LEVEL=warn""")
}
}
}
}
}
}
stage ('Summary') {
Expand Down Expand Up @@ -712,7 +403,7 @@ pipeline {
node ('ze') {
dir ("${DELETE_LOCATION}/middlewares") { deleteDir() }
}
dir ("${DELETE_LOCATION}/middlewares") { deleteDir() }
/*dir ("${DELETE_LOCATION}/middlewares") { deleteDir() }*/
}
cleanup {
node ('daos_head') {
Expand All @@ -725,7 +416,7 @@ pipeline {
dir("${env.WORKSPACE}") { deleteDir() }
dir("${env.WORKSPACE}@tmp") { deleteDir() }
}
dir("${DELETE_LOCATION}") { deleteDir() }
/*dir("${DELETE_LOCATION}") { deleteDir() }*/
dir("${env.WORKSPACE}") { deleteDir() }
dir("${env.WORKSPACE}@tmp") { deleteDir() }
}
Expand Down
Loading

0 comments on commit f4aa232

Please sign in to comment.