intel/ci: Add code changes to enable the entire mpich test suite

with impi and mpich.

- test.py: class MpichTestSuite modified to build and run PR tests as well as weekly tests (impi & mpich).
- build.py: added options and functions for extracting the mpich tar files; also added the libfabric_mpich option to build libfabric with options exclusive to mpich (-without-ze).
- run.py: changed the calls that build and execute tests to match the MpichTestSuite class.
- summary.py: modified the summary functions to create the summary from the new log file.
- Jenkinsfile: added a prepare build stage; added a parallel build stage for libfabric_mpich; increased the Jenkins timeout limit for weekly tests.

Running tests for tcp and verbs-rxm.

Signed-off-by: Nikhil Nanal <[email protected]>
ofiwg · Sep 15, 2023 · 4e535a6 · 4e535a6
1 parent 5cc4fc7
commit 4e535a6
Show file tree

Hide file tree

Showing 6 changed files with 248 additions and 91 deletions.
diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile
@@ -8,6 +8,7 @@ properties([disableConcurrentBuilds(abortPrevious: true)])
 @Field def BUILD_MODES=["reg", "dbg", "dl"]
 @Field def MPI_TYPES=["impi", "mpich", "ompi"]
 @Field def PYTHON_VERSION="3.9"
+@Field def TIMEOUT="3600"
 
 def run_python(version, command, output=null) {
   if (output != null)
@@ -17,8 +18,9 @@ def run_python(version, command, output=null) {
 }
 
 def slurm_batch(partition, node_num, output, command) {
+
   try {
-    sh """timeout 3600 sbatch --partition=${partition} -N ${node_num} \
+    sh """timeout $TIMEOUT sbatch --partition=${partition} -N ${node_num} \
           --wait -o ${output} --open-mode=append --wrap=\'env; ${command}\'
        """
   } catch (Exception e) {
@@ -63,6 +65,9 @@ def run_middleware(providers, stage_name, test, partition, node_num, mpi=null,
   if (imb_grp)
     base_cmd = "${base_cmd} --imb_grp=${imb_grp}"
 
+  if (env.WEEKLY.toBoolean())
+    base_cmd = "${base_cmd} --weekly=${env.WEEKLY}"
+
   for (prov in providers) {
     if (prov[1]) {
       echo "Running ${prov[0]}-${prov[1]} ${stage_name}"
@@ -225,7 +230,7 @@ pipeline {
   }
   options {
       timestamps()
-      timeout(activity: true, time: 1, unit: 'HOURS')
+      timeout(activity: true, time: 6, unit: 'HOURS')
   }
   environment {
       JOB_CADENCE = 'PR'
@@ -235,7 +240,6 @@ pipeline {
       RUN_LOCATION="${env.WORKSPACE}/${SCRIPT_LOCATION}/"
       CUSTOM_WORKSPACE="${CB_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}"
   }
-
   stages {
     stage ('opt-out') {
       steps {
@@ -250,6 +254,9 @@ pipeline {
           } else {
             weekly = env.WEEKLY.toBoolean()
           }
+          if (weekly) {
+            TIMEOUT="21600" 
+          } 
           skip = skip()
           RELEASE = release()
           if (skip && !weekly) {
@@ -258,17 +265,26 @@ pipeline {
         }
       }
     }
+    stage ('prepare build') {
+      when { equals expected: true, actual: DO_RUN }
+      steps {
+        script {  
+          echo "Copying build dirs."
+          build("builddir")
+          echo "Copying log dirs."
+          build("logdir", null, null, RELEASE)
+          build("extract_mpich")
+          build("extract_impi_mpich")
+        }
+      }
+    }
     stage ('parallel-builds') {
       when { equals expected: true, actual: DO_RUN }
       parallel {
         stage ('build') {
           steps {
             script {
               dir (CUSTOM_WORKSPACE) {
-                echo "Copying build dirs."
-                build("builddir")
-                echo "Copying log dirs."
-                build("logdir", null, null, RELEASE)
                 for (mode in  BUILD_MODES) {
                   echo "Building Libfabric $mode"
                   build("libfabric", "$mode")
@@ -279,6 +295,21 @@ pipeline {
             }
           }
         }
+        stage ('buildmpich-libfabric') {
+          steps {
+            script {
+              dir("${CUSTOM_WORKSPACE}/mpich"){
+                checkout scm
+                echo "Building Libfabric reg"
+                slurm_batch("squirtle,totodile", "1",
+                            "${env.LOG_DIR}/libfabric_mpich_log", 
+                            """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \
+                              --build_item=libfabric_mpich """
+                          )  
+              }
+            }
+          }
+        }
         stage ('build-daos') {
           agent {
             node {
@@ -464,8 +495,7 @@ pipeline {
           steps {
             script {
               dir (RUN_LOCATION) {
-                def providers = [["verbs", "rxm"], ["tcp", null],
-                                 ["tcp", "rxm"], ["sockets", null]]
+                def providers = [['tcp', null], ["verbs","rxm"]]
                 for (mpi in MPI_TYPES) {
                   run_middleware(providers, "mpichtestsuite", "mpichtestsuite",
                                  "squirtle,totodile", "2", "${mpi}")
@@ -700,4 +730,4 @@ pipeline {
       dir("${env.WORKSPACE}@tmp") { deleteDir() }
     }
   }
-}
+}
diff --git a/contrib/intel/jenkins/build.py b/contrib/intel/jenkins/build.py
@@ -41,7 +41,7 @@ def build_libfabric(libfab_install_path, mode, cluster=None, ucx=None):
     for op in common.common_disable_list:
          config_cmd.append(f'--enable-{op}=no')
 
-    if (cluster == 'default'):
+    if (cluster == 'default' and build_item != 'libfabric_mpich'):
         for op in common.default_enable_list:
             config_cmd.append(f'--enable-{op}')
 
@@ -69,6 +69,30 @@ def build_fabtests(libfab_install_path, mode):
     common.run_command(['make', '-j32'])
     common.run_command(['make', 'install'])
 
+def extract_mpich(mpitype):
+
+    dest = f'{install_path}/middlewares/{mpitype}_mpichtest'
+    if (mpitype == 'mpich'):
+        src_dir = 'mpich'
+        mpich_tar = cloudbees_config.mpich_tar 
+    elif (mpitype == 'impi'):
+        src_dir = 'impi_mpichtest'
+        mpich_tar = cloudbees_config.impi_mpichtest_tar
+    else:
+        print(f"Invalid mpi type {mpitype}")
+        sys.exit(-1)
+
+    cwd = os.getcwd()
+    if (os.path.exists(dest)):
+        shutil.rmtree(dest)
+    os.makedirs(f'{dest}/{mpitype}_mpichsuite')
+    os.chdir(f'{cloudbees_config.scm_dir}/{src_dir}/')
+    common.run_command(['tar', '-xvf', 
+             f"{cloudbees_config.scm_dir}/{src_dir}/{mpich_tar}",
+             '-C', f'{dest}/{mpitype}_mpichsuite', 
+             '--strip-components', '1'])
+    os.chdir(cwd)
+
 def copy_build_dir(install_path):
     middlewares_path = f'{install_path}/middlewares'
     if (os.path.exists(middlewares_path) != True):
@@ -78,9 +102,6 @@ def copy_build_dir(install_path):
                     f'{middlewares_path}/shmem')
     shutil.copytree(f'{cloudbees_config.build_dir}/oneccl',
                     f'{middlewares_path}/oneccl')
-
-    os.symlink(f'{cloudbees_config.build_dir}/mpich',
-               f'{middlewares_path}/mpich')
     os.symlink(f'{cloudbees_config.build_dir}/impi',
                f'{middlewares_path}/impi')
     os.symlink(f'{cloudbees_config.build_dir}/ompi',
@@ -111,12 +132,12 @@ def log_dir(install_path, release=False):
     workspace = os.environ['WORKSPACE']
 
     parser = argparse.ArgumentParser()
-    parser.add_argument('--build_item', help="build libfabric or fabtests",
-                        choices=['libfabric', 'fabtests', 'builddir', 'logdir'])
-
+    parser.add_argument('--build_item', help="build libfabric or fabtests", \
+                        choices=['libfabric', 'libfabric_mpich', 'fabtests', \
+                                 'builddir', 'logdir', 'extract_mpich', \
+                                 'extract_impi_mpich'])
     parser.add_argument('--ofi_build_mode', help="select buildmode libfabric "\
                         "build mode", choices=['reg', 'dbg', 'dl'])
-
     parser.add_argument('--build_cluster', help="build libfabric on specified cluster", \
                         choices=['daos', 'gpu'], default='default')
     parser.add_argument('--release', help="This job is likely testing a "\
@@ -145,11 +166,16 @@ def log_dir(install_path, release=False):
     p = re.compile('mpi*')
 
     if (build_item == 'libfabric'):
-        build_libfabric(libfab_install_path, ofi_build_mode, cluster, ucx)
-
+            build_libfabric(libfab_install_path, ofi_build_mode, cluster, ucx)
+    elif (build_item == 'libfabric_mpich'):
+            build_libfabric(f'{libfab_install_path}/libfabric_mpich',
+                            ofi_build_mode, cluster)
     elif (build_item == 'fabtests'):
         build_fabtests(libfab_install_path, ofi_build_mode)
-
+    elif (build_item == 'extract_mpich'):
+        extract_mpich('mpich')
+    elif (build_item == 'extract_impi_mpich'):
+        extract_mpich('impi')
     elif (build_item == 'builddir'):
         copy_build_dir(install_path)
 

diff --git a/contrib/intel/jenkins/run.py b/contrib/intel/jenkins/run.py
@@ -126,18 +126,22 @@ def intel_mpi_benchmark(core, hosts, mpi, mode, group, user_env, log_file, util)
         print(f"Skipping {mpi.upper} {imb.testname} as execute condition fails")
     print('-------------------------------------------------------------------')
 
-def mpich_test_suite(core, hosts, mpi, mode, user_env, log_file, util):
+def mpich_test_suite(core, hosts, mpi, mode, user_env, log_file, util, weekly=None):
 
     mpich_tests = tests.MpichTestSuite(jobname=jbname,buildno=bno,
                                        testname="MpichTestSuite",core_prov=core,
                                        fabric=fab, mpitype=mpi, hosts=hosts,
                                        ofi_build_mode=mode, user_env=user_env,
-                                       log_file=log_file, util_prov=util)
+                                       log_file=log_file, util_prov=util, 
+                                       weekly=weekly)
 
     print('-------------------------------------------------------------------')
     if (mpich_tests.execute_condn == True):
-        print(f"Running mpichtestsuite: Spawn Tests for {core}-{util}-{fab}-{mpi}")
-        mpich_tests.execute_cmd("spawn")
+        if (mpi == "mpich"):
+            print("Building mpich")
+            mpich_tests.build_mpich()
+        print(f"Running mpichtestsuite for {core}-{util}-{fab}-{mpi}")
+        mpich_tests.execute_cmd()
     else:
         print(f"Skipping {mpi.upper()} {mpich_tests.testname} as exec condn fails")
     print('-------------------------------------------------------------------')

diff --git a/contrib/intel/jenkins/runtests.py b/contrib/intel/jenkins/runtests.py
@@ -37,6 +37,7 @@ def __call__(self, parser, namespace, values, option_string=None):
                     choices=['impi', 'mpich', 'ompi'], default='impi')
 parser.add_argument('--log_file', help="Full path to log file",
                     default=os.environ['DEFAULT_LOG_LOCATION'], type=str)
+parser.add_argument('--weekly', help="run weekly", default=False, type=bool)
 
 args = parser.parse_args()
 args_core = args.prov
@@ -45,6 +46,7 @@ def __call__(self, parser, namespace, values, option_string=None):
 args_device = args.device
 user_env = args.user_env
 log_file = args.log_file
+weekly = args.weekly
 
 if (args.ofi_build_mode):
     ofi_build_mode = args.ofi_build_mode
@@ -131,7 +133,7 @@ def __call__(self, parser, namespace, values, option_string=None):
         if (run_test == 'all' or run_test == 'mpichtestsuite'):
             run.mpich_test_suite(args_core, hosts, mpi,
                                 ofi_build_mode, user_env, log_file,
-                                args_util)
+                                args_util, weekly)
 
         if (run_test == 'all' or run_test == 'IMB'):
             run.intel_mpi_benchmark(args_core, hosts, mpi,

diff --git a/contrib/intel/jenkins/summary.py b/contrib/intel/jenkins/summary.py
@@ -530,27 +530,41 @@ def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name):
         super().__init__(logger, log_dir, prov, file_name, stage_name)
 
         self.mpi = mpi
-        if self.mpi == 'impi':
-            self.run = '/mpiexec'
-        else:
-            self.run = '/mpirun'
+        self.run = 'mpiexec'
+
+    def read_file(self):
+        previous = ""
+        with open(self.file_path,'r') as log_file:
+            for line in log_file:
+                line = line.lower().strip()
+                super().check_features(previous, line)
+                super().check_node(line)
+                super().check_line(line)
+                previous = line
+
+    def check_exclude(self, line):
+        if line.startswith('excluding:'):
+            test = line.split(':')[-1]
+            self.excludes += 1
+            self.excluded_tests.append(test)
 
     def check_name(self, line):
-        if self.run in line:
-            self.name = line.split()[len(line.split()) - 1].split('/')[1]
-            #assume pass
+        if (line.startswith('ok') or 
+            line.startswith('not ok')):
+                self.name = line.split('-')[1].split('#')[0].strip()
+
+    def check_pass(self, line):
+        if (line.startswith('ok') and not
+            line.split('#')[1].strip().startswith('skip')):
             self.passes += 1
             self.passed_tests.append(self.name)
 
     def check_fail(self, line):
-        # Fail cases take away assumed pass
-        if "exiting with" in line:
+        if (line.startswith('not ok') and not
+            line.split('#')[1].strip().startswith('skip')):
             self.fails += 1
-            self.passes -= 1
-            self.failed_tests.append(f'{self.name}')
-            #skip to next test
-            while self.run not in line:
-                line = self.log.readline().lower()
+            self.failed_tests.append(self.name)
+
 
 class ImbSummarizer(Summarizer):
     def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name):
@@ -806,7 +820,7 @@ def summarize_items(summary_item, logger, log_dir, mode):
 
     if summary_item == 'mpichtestsuite' or summary_item == 'all':
         for mpi in mpi_list:
-            for item in ['tcp-rxm', 'verbs-rxm', 'sockets', 'tcp']:
+            for item in ['tcp', 'verbs-rxm']:
                 ret = MpichTestSuiteSummarizer(
                     logger, log_dir, item, mpi,
                     f'mpichtestsuite_{item}_{mpi}_'\