From 1aba51a86f3ba9196cb2d9e53f9b1b760e936ef2 Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Fri, 20 Sep 2024 13:51:21 +0200 Subject: [PATCH 01/12] Re-enable CI on GH200 --- ci/cscs-ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 7fcd65106d..aa3f35a486 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -53,8 +53,6 @@ stages: variables: CUDA_VERSION: 12.4.1 CUPY_PACKAGE: cupy-cuda12x - # TODO: enable CI job when Todi is back in operational state - when: manual build_py311_baseimage_x86_64: extends: .build_baseimage_x86_64 From eaa06ec9af8dd27bcd702609f1e31d1c15005296 Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Fri, 20 Sep 2024 14:35:01 +0200 Subject: [PATCH 02/12] Update cscs-ci.yml --- ci/cscs-ci.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index aa3f35a486..496c179602 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -53,6 +53,8 @@ stages: variables: CUDA_VERSION: 12.4.1 CUPY_PACKAGE: cupy-cuda12x + SLURM_PARTITION: debug + SLURM_TIMELIMIT: '00:15:00' build_py311_baseimage_x86_64: extends: .build_baseimage_x86_64 @@ -99,6 +101,9 @@ build_py38_baseimage_x86_64: extends: [.container-builder-cscs-zen2, .build_image] .build_image_aarch64: extends: [.container-builder-cscs-gh200, .build_image] + variables: + SLURM_PARTITION: debug + SLURM_TIMELIMIT: '00:15:00' build_py311_image_x86_64: extends: .build_image_x86_64 @@ -182,6 +187,8 @@ build_py38_image_x86_64: # Another problem, observed in test stage, is that gpu tests hang in combination with CUDA MPS, # when high test parallelism is used. NUM_PROCESSES: 16 + SLURM_PARTITION: debug + SLURM_TIMELIMIT: '00:15:00' test_py311_x86_64: extends: [.test_helper_x86_64] From 0abbde33ea9143ea1215af4eeaa2dce7e9902300 Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Tue, 1 Oct 2024 15:48:20 +0200 Subject: [PATCH 03/12] Update cscs-ci.yml --- ci/cscs-ci.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 496c179602..96cab75166 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -53,7 +53,8 @@ stages: variables: CUDA_VERSION: 12.4.1 CUPY_PACKAGE: cupy-cuda12x - SLURM_PARTITION: debug + SLURM_ACCOUNT: d75 + SLURM_RESERVATION: daint SLURM_TIMELIMIT: '00:15:00' build_py311_baseimage_x86_64: @@ -102,7 +103,8 @@ build_py38_baseimage_x86_64: .build_image_aarch64: extends: [.container-builder-cscs-gh200, .build_image] variables: - SLURM_PARTITION: debug + SLURM_ACCOUNT: d75 + SLURM_RESERVATION: daint SLURM_TIMELIMIT: '00:15:00' build_py311_image_x86_64: @@ -187,7 +189,8 @@ build_py38_image_x86_64: # Another problem, observed in test stage, is that gpu tests hang in combination with CUDA MPS, # when high test parallelism is used. NUM_PROCESSES: 16 - SLURM_PARTITION: debug + SLURM_ACCOUNT: d75 + SLURM_RESERVATION: daint SLURM_TIMELIMIT: '00:15:00' test_py311_x86_64: From 3d229eb18f19812a33cca8dd31285133c9cfea2a Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Tue, 1 Oct 2024 15:50:56 +0200 Subject: [PATCH 04/12] Update cscs-ci.yml --- ci/cscs-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 96cab75166..3aaabd1823 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -53,7 +53,7 @@ stages: variables: CUDA_VERSION: 12.4.1 CUPY_PACKAGE: cupy-cuda12x - SLURM_ACCOUNT: d75 + SLURM_ACCOUNT: g154 SLURM_RESERVATION: daint SLURM_TIMELIMIT: '00:15:00' @@ -103,7 +103,7 @@ build_py38_baseimage_x86_64: .build_image_aarch64: extends: [.container-builder-cscs-gh200, .build_image] variables: - SLURM_ACCOUNT: d75 + SLURM_ACCOUNT: g154 SLURM_RESERVATION: daint SLURM_TIMELIMIT: '00:15:00' @@ -189,7 +189,7 @@ build_py38_image_x86_64: # Another problem, observed in test stage, is that gpu tests hang in combination with CUDA MPS, # when high test parallelism is used. NUM_PROCESSES: 16 - SLURM_ACCOUNT: d75 + SLURM_ACCOUNT: g154 SLURM_RESERVATION: daint SLURM_TIMELIMIT: '00:15:00' From 0dd950caa3cde7e27ba1b74b94bcdbc32ee949e9 Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Mon, 11 Nov 2024 10:37:09 +0100 Subject: [PATCH 05/12] Update cscs-ci.yml --- ci/cscs-ci.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 3aaabd1823..aa3f35a486 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -53,9 +53,6 @@ stages: variables: CUDA_VERSION: 12.4.1 CUPY_PACKAGE: cupy-cuda12x - SLURM_ACCOUNT: g154 - SLURM_RESERVATION: daint - SLURM_TIMELIMIT: '00:15:00' build_py311_baseimage_x86_64: extends: .build_baseimage_x86_64 @@ -102,10 +99,6 @@ build_py38_baseimage_x86_64: extends: [.container-builder-cscs-zen2, .build_image] .build_image_aarch64: extends: [.container-builder-cscs-gh200, .build_image] - variables: - SLURM_ACCOUNT: g154 - SLURM_RESERVATION: daint - SLURM_TIMELIMIT: '00:15:00' build_py311_image_x86_64: extends: .build_image_x86_64 @@ -189,9 +182,6 @@ build_py38_image_x86_64: # Another problem, observed in test stage, is that gpu tests hang in combination with CUDA MPS, # when high test parallelism is used. NUM_PROCESSES: 16 - SLURM_ACCOUNT: g154 - SLURM_RESERVATION: daint - SLURM_TIMELIMIT: '00:15:00' test_py311_x86_64: extends: [.test_helper_x86_64] From 6abd1cc9d93dd8e37e760f3cf75006bc9a22806e Mon Sep 17 00:00:00 2001 From: Edoardo Paone Date: Fri, 29 Nov 2024 09:27:52 +0100 Subject: [PATCH 06/12] fix for dace build --- .../runners_tests/dace_tests/test_gtir_to_sdfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/next_tests/unit_tests/program_processor_tests/runners_tests/dace_tests/test_gtir_to_sdfg.py b/tests/next_tests/unit_tests/program_processor_tests/runners_tests/dace_tests/test_gtir_to_sdfg.py index c7466b853f..b1ba4ccf22 100644 --- a/tests/next_tests/unit_tests/program_processor_tests/runners_tests/dace_tests/test_gtir_to_sdfg.py +++ b/tests/next_tests/unit_tests/program_processor_tests/runners_tests/dace_tests/test_gtir_to_sdfg.py @@ -1984,7 +1984,7 @@ def test_gtir_index(): ) testee = gtir.Program( - id="gtir_cast", + id="gtir_index", function_definitions=[], params=[ gtir.Sym(id="x", type=ts.FieldType(dims=[IDim], dtype=SIZE_TYPE)), From 95de2d765357ec06f27fe424baa7883314806f71 Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Mon, 16 Dec 2024 10:05:44 +0100 Subject: [PATCH 07/12] try switch to santis --- ci/cscs-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 2402d22117..da4395ac33 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -169,7 +169,7 @@ build_py38_image_x86_64: VARIANT: [-nomesh, -atlas] SUBVARIANT: [-cuda11x, -cpu] .test_helper_aarch64: - extends: [.container-runner-todi-gh200, .test_helper] + extends: [.container-runner-santis-gh200, .test_helper] parallel: matrix: - SUBPACKAGE: [cartesian, storage] From 4fac1a1ddbccf7510c8d3b947bed371a671503ec Mon Sep 17 00:00:00 2001 From: Edoardo Paone Date: Thu, 19 Dec 2024 11:49:52 +0100 Subject: [PATCH 08/12] switch from todi to daint --- ci/cscs-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index da4395ac33..484690d8ba 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -169,7 +169,7 @@ build_py38_image_x86_64: VARIANT: [-nomesh, -atlas] SUBVARIANT: [-cuda11x, -cpu] .test_helper_aarch64: - extends: [.container-runner-santis-gh200, .test_helper] + extends: [.container-runner-daint-gh200, .test_helper] parallel: matrix: - SUBPACKAGE: [cartesian, storage] From 305c3868862a357c395b38876c3e26b083289c21 Mon Sep 17 00:00:00 2001 From: Edoardo Paone Date: Thu, 19 Dec 2024 15:15:13 +0100 Subject: [PATCH 09/12] Increase slurm timeout --- ci/cscs-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 484690d8ba..274ea86f84 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -154,7 +154,7 @@ build_py38_image_x86_64: variables: CRAY_CUDA_MPS: 1 SLURM_JOB_NUM_NODES: 1 - SLURM_TIMELIMIT: 15 + SLURM_TIMELIMIT: 30 NUM_PROCESSES: auto VIRTUALENV_SYSTEM_SITE_PACKAGES: 1 .test_helper_x86_64: From 33b632a910f67d9c19d0f5df59f77c31f9647b92 Mon Sep 17 00:00:00 2001 From: Edoardo Paone Date: Fri, 20 Dec 2024 11:04:34 +0100 Subject: [PATCH 10/12] try custom santis config --- ci/cscs-ci.yml | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 274ea86f84..e594a30df2 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -144,6 +144,21 @@ build_py38_image_x86_64: <<: *py38 + +# TODO(edopao): remove santis config once available in remote CSCS GitLab config +.container-runner-santis-alps: + extends: .f7t-container-runner + variables: + FIRECREST_SYSTEM: 'santis' +.container-runner-santis-gh200: + extends: .container-runner-santis-alps + variables: + ARCH: 'aarch64' + USE_CE: 'YES' + SLURM_MPI_TYPE: pmi2 + NVIDIA_VISIBLE_DEVICES: 'all' + NVIDIA_DRIVER_CAPABILITIES: 'compute,utility' + .test_helper: stage: test image: $CSCS_REGISTRY_PATH/public/$ARCH/gt4py/gt4py-ci:$CI_COMMIT_SHA-$PYVERSION @@ -154,7 +169,7 @@ build_py38_image_x86_64: variables: CRAY_CUDA_MPS: 1 SLURM_JOB_NUM_NODES: 1 - SLURM_TIMELIMIT: 30 + SLURM_TIMELIMIT: 15 NUM_PROCESSES: auto VIRTUALENV_SYSTEM_SITE_PACKAGES: 1 .test_helper_x86_64: @@ -169,7 +184,7 @@ build_py38_image_x86_64: VARIANT: [-nomesh, -atlas] SUBVARIANT: [-cuda11x, -cpu] .test_helper_aarch64: - extends: [.container-runner-daint-gh200, .test_helper] + extends: [.container-runner-santis-gh200, .test_helper] parallel: matrix: - SUBPACKAGE: [cartesian, storage] From d2662b4b8d38cf4ebb125dd7e5d84f1e3144bba3 Mon Sep 17 00:00:00 2001 From: Edoardo Paone Date: Fri, 20 Dec 2024 11:09:30 +0100 Subject: [PATCH 11/12] change vCluster back santis -> daint --- ci/cscs-ci.yml | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index e594a30df2..484690d8ba 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -144,21 +144,6 @@ build_py38_image_x86_64: <<: *py38 - -# TODO(edopao): remove santis config once available in remote CSCS GitLab config -.container-runner-santis-alps: - extends: .f7t-container-runner - variables: - FIRECREST_SYSTEM: 'santis' -.container-runner-santis-gh200: - extends: .container-runner-santis-alps - variables: - ARCH: 'aarch64' - USE_CE: 'YES' - SLURM_MPI_TYPE: pmi2 - NVIDIA_VISIBLE_DEVICES: 'all' - NVIDIA_DRIVER_CAPABILITIES: 'compute,utility' - .test_helper: stage: test image: $CSCS_REGISTRY_PATH/public/$ARCH/gt4py/gt4py-ci:$CI_COMMIT_SHA-$PYVERSION @@ -184,7 +169,7 @@ build_py38_image_x86_64: VARIANT: [-nomesh, -atlas] SUBVARIANT: [-cuda11x, -cpu] .test_helper_aarch64: - extends: [.container-runner-santis-gh200, .test_helper] + extends: [.container-runner-daint-gh200, .test_helper] parallel: matrix: - SUBPACKAGE: [cartesian, storage] From f6b87c32173a19b4e2a2f7c5e8c8247af909e61e Mon Sep 17 00:00:00 2001 From: Edoardo Paone Date: Fri, 20 Dec 2024 11:11:29 +0100 Subject: [PATCH 12/12] Revert "change vCluster back santis -> daint" This reverts commit d2662b4b8d38cf4ebb125dd7e5d84f1e3144bba3. --- ci/cscs-ci.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 484690d8ba..e594a30df2 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -144,6 +144,21 @@ build_py38_image_x86_64: <<: *py38 + +# TODO(edopao): remove santis config once available in remote CSCS GitLab config +.container-runner-santis-alps: + extends: .f7t-container-runner + variables: + FIRECREST_SYSTEM: 'santis' +.container-runner-santis-gh200: + extends: .container-runner-santis-alps + variables: + ARCH: 'aarch64' + USE_CE: 'YES' + SLURM_MPI_TYPE: pmi2 + NVIDIA_VISIBLE_DEVICES: 'all' + NVIDIA_DRIVER_CAPABILITIES: 'compute,utility' + .test_helper: stage: test image: $CSCS_REGISTRY_PATH/public/$ARCH/gt4py/gt4py-ci:$CI_COMMIT_SHA-$PYVERSION @@ -169,7 +184,7 @@ build_py38_image_x86_64: VARIANT: [-nomesh, -atlas] SUBVARIANT: [-cuda11x, -cpu] .test_helper_aarch64: - extends: [.container-runner-daint-gh200, .test_helper] + extends: [.container-runner-santis-gh200, .test_helper] parallel: matrix: - SUBPACKAGE: [cartesian, storage]