From 39cd680844dbf58ef3e62262740a2071b86c329b Mon Sep 17 00:00:00 2001 From: Charles Kawczynski Date: Thu, 10 Oct 2024 08:53:28 -0400 Subject: [PATCH] Only capture nsys in solve-atmos Try with timed_solve Try range-at-domain Try with single quotes Try using delay keyword Shorten simulations wip --- .buildkite/comparison/pipeline.sh | 2 +- .buildkite/gpu_pipeline/pipeline.yml | 19 ++++++++++--------- .buildkite/pipeline.yml | 8 ++++---- config/model_configs/aquaplanet_diagedmf.yml | 16 ++++++++-------- config/model_configs/aquaplanet_progedmf.yml | 2 +- 5 files changed, 24 insertions(+), 23 deletions(-) diff --git a/.buildkite/comparison/pipeline.sh b/.buildkite/comparison/pipeline.sh index cf7b8c2dfc..5cb1ff564c 100755 --- a/.buildkite/comparison/pipeline.sh +++ b/.buildkite/comparison/pipeline.sh @@ -75,7 +75,7 @@ else fi if [[ "$profiling" == "enable" ]]; then - command="nsys profile --trace=nvtx,mpi --mpi-impl=mpich --output=${job_id}/report.%q{NPROCS}.%q{PMI_RANK} $command" + command="nsys profile --delay 100 --trace=nvtx,mpi --mpi-impl=mpich --output=${job_id}/report.%q{NPROCS}.%q{PMI_RANK} $command" fi cat << EOM diff --git a/.buildkite/gpu_pipeline/pipeline.yml b/.buildkite/gpu_pipeline/pipeline.yml index 98efdb55d3..924e642c98 100644 --- a/.buildkite/gpu_pipeline/pipeline.yml +++ b/.buildkite/gpu_pipeline/pipeline.yml @@ -51,7 +51,7 @@ steps: command: - mkdir -p target_gpu_implicit_baroclinic_wave - > - nsys profile --trace=nvtx,mpi,cuda,osrt --output=target_gpu_implicit_baroclinic_wave/output_active/report + nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=target_gpu_implicit_baroclinic_wave/output_active/report julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl --config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave.yml --job_id target_gpu_implicit_baroclinic_wave @@ -68,7 +68,7 @@ steps: command: - mkdir -p gpu_hs_rhoe_equil_55km_nz63_0M - > - nsys profile --trace=nvtx,mpi,cuda,osrt 
--output=gpu_hs_rhoe_equil_55km_nz63_0M/output_active/report + nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_hs_rhoe_equil_55km_nz63_0M/output_active/report julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl --config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equil_55km_nz63_0M.yml --job_id gpu_hs_rhoe_equil_55km_nz63_0M @@ -86,7 +86,7 @@ steps: - mkdir -p gpu_hs_rhoe_equil_55km_nz63_0M_4process - > srun --cpu-bind=threads --cpus-per-task=4 - nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_hs_rhoe_equil_55km_nz63_0M_4process/output_active/report-%q{PMI_RANK} + nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_hs_rhoe_equil_55km_nz63_0M_4process/output_active/report-%q{PMI_RANK} julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl --config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equil_55km_nz63_0M.yml --job_id gpu_hs_rhoe_equil_55km_nz63_0M_4process @@ -106,7 +106,7 @@ steps: - mkdir -p target_gpu_implicit_baroclinic_wave_4process - > srun --cpu-bind=threads --cpus-per-task=4 - nsys profile --trace=osrt,nvtx,cuda,mpi,ucx --output=target_gpu_implicit_baroclinic_wave_4process/output_active/report-%q{PMI_RANK} + nsys profile --delay 100 --trace=osrt,nvtx,cuda,mpi,ucx --output=target_gpu_implicit_baroclinic_wave_4process/output_active/report-%q{PMI_RANK} julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl --config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave.yml --job_id target_gpu_implicit_baroclinic_wave_4process @@ -128,7 +128,7 @@ steps: - mkdir -p gpu_aquaplanet_dyamond_diag_1process - > srun --cpu-bind=threads --cpus-per-task=4 - nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_dyamond_diag_1process/output_active/report julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl + nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_dyamond_diag_1process/output_active/report julia --threads=3 --color=yes 
--project=examples examples/hybrid/driver.jl --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_diag_1process.yml --job_id gpu_aquaplanet_dyamond_diag_1process artifact_paths: "gpu_aquaplanet_dyamond_diag_1process/output_active/*" @@ -148,7 +148,7 @@ steps: - mkdir -p gpu_aquaplanet_dyamond_ss_1process - > srun --cpu-bind=threads --cpus-per-task=4 - nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_dyamond_ss_1process/output_active/report + nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_dyamond_ss_1process/output_active/report julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_ss.yml --job_id gpu_aquaplanet_dyamond_ss_1process @@ -307,7 +307,7 @@ steps: command: - mkdir -p gpu_aquaplanet_diagedmf - > - nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_diagedmf/output_active/report + nsys profile --delay 200 --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_diagedmf/output_active/report julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl --config_file ${MODEL_CONFIG_PATH}aquaplanet_diagedmf.yml --job_id gpu_aquaplanet_diagedmf @@ -319,7 +319,7 @@ steps: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 slurm_ntasks: 1 - slurm_mem: 32G + slurm_mem: 64G slurm_exclusive: - label: "gpu_aquaplanet_diagedmf_benchmark" @@ -331,6 +331,7 @@ steps: env: CLIMACOMMS_DEVICE: "CUDA" agents: + slurm_mem: 64G slurm_gpus: 1 - group: "Prognostic EDMF GPU" @@ -340,7 +341,7 @@ steps: command: - mkdir -p gpu_aquaplanet_progedmf - > - nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_progedmf/output_active/report + nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_progedmf/output_active/report julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl --config_file ${MODEL_CONFIG_PATH}aquaplanet_progedmf.yml --job_id gpu_aquaplanet_progedmf diff --git 
a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a8126280d9..e20ce5dd9c 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -871,7 +871,7 @@ steps: command: - mkdir -p target_gpu_implicit_baroclinic_wave - > - nsys profile --trace=nvtx,cuda --output=target_gpu_implicit_baroclinic_wave/output_active/report + nsys profile --delay 100 --trace=nvtx,cuda --output=target_gpu_implicit_baroclinic_wave/output_active/report julia --color=yes --project=examples examples/hybrid/driver.jl --config_file ${GPU_CONFIG_PATH}/target_gpu_implicit_baroclinic_wave.yml --job_id target_gpu_implicit_baroclinic_wave @@ -885,7 +885,7 @@ steps: - label: "GPU: GPU dry baroclinic wave - 4 gpus" key: "target_gpu_implicit_baroclinic_wave_4process" command: - # nsys profile --trace=nvtx,cuda,mpi --output=target_gpu_implicit_baroclinic_wave_4process/output_active/report-%q{PMI_RANK} + # nsys profile --delay 100 --trace=nvtx,cuda,mpi --output=target_gpu_implicit_baroclinic_wave_4process/output_active/report-%q{PMI_RANK} - mkdir -p target_gpu_implicit_baroclinic_wave_4process - > srun --cpu-bind=threads --cpus-per-task=4 @@ -905,7 +905,7 @@ steps: - label: "GPU: GPU moist Held-Suarez" command: - > - nsys profile --trace=nvtx,cuda --output=central_gpu_hs_rhoe_equil_55km_nz63_0M/output_active/report + nsys profile --delay 100 --trace=nvtx,cuda --output=central_gpu_hs_rhoe_equil_55km_nz63_0M/output_active/report julia --color=yes --project=examples examples/hybrid/driver.jl --config_file $CONFIG_PATH/central_gpu_hs_rhoe_equil_55km_nz63_0M.yml --job_id central_gpu_hs_rhoe_equil_55km_nz63_0M @@ -919,7 +919,7 @@ steps: - label: "GPU: GPU moist Held-Suarez cloud diagnostics per stage" command: - > - nsys profile --trace=nvtx,cuda --output=central_cloud_diag_gpu_hs_rhoe_equil_55km_nz63_0M/output_active/report + nsys profile --delay 100 --trace=nvtx,cuda --output=central_cloud_diag_gpu_hs_rhoe_equil_55km_nz63_0M/output_active/report julia --color=yes --project=examples 
examples/hybrid/driver.jl --config_file $CONFIG_PATH/central_cloud_diag_gpu_hs_rhoe_equil_55km_nz63_0M.yml --job_id central_cloud_diag_gpu_hs_rhoe_equil_55km_nz63_0M diff --git a/config/model_configs/aquaplanet_diagedmf.yml b/config/model_configs/aquaplanet_diagedmf.yml index 358593a950..a5e53e1654 100644 --- a/config/model_configs/aquaplanet_diagedmf.yml +++ b/config/model_configs/aquaplanet_diagedmf.yml @@ -7,25 +7,25 @@ z_elem: 63 dz_bottom: 30.0 rayleigh_sponge: true viscous_sponge: true -moist: equil -surface_setup: DefaultMoninObukhov +moist: equil +surface_setup: DefaultMoninObukhov rad: allskywithclear insolation: "timevarying" dt_rad: 1hours dt_cloud_fraction: 1hours -turbconv: diagnostic_edmfx +turbconv: diagnostic_edmfx implicit_diffusion: true approximate_linear_solve_iters: 2 prognostic_tke: true -edmfx_upwinding: first_order -edmfx_entr_model: "Generalized" -edmfx_detr_model: "Generalized" -edmfx_nh_pressure: true +edmfx_upwinding: first_order +edmfx_entr_model: "Generalized" +edmfx_detr_model: "Generalized" +edmfx_nh_pressure: true edmfx_sgs_mass_flux: true edmfx_sgs_diffusive_flux: true cloud_model: "quadrature_sgs" precip_model: 0M dt: 90secs -t_end: 1days +t_end: 61mins toml: [toml/diagnostic_edmfx.toml] ode_algo: ARS343 diff --git a/config/model_configs/aquaplanet_progedmf.yml b/config/model_configs/aquaplanet_progedmf.yml index 0320b0c13d..bb60139e32 100644 --- a/config/model_configs/aquaplanet_progedmf.yml +++ b/config/model_configs/aquaplanet_progedmf.yml @@ -28,6 +28,6 @@ edmfx_sgs_mass_flux: true edmfx_sgs_diffusive_flux: true precip_model: 0M dt: 10secs -t_end: 3hours +t_end: 61mins toml: [toml/prognostic_edmfx.toml] ode_algo: ARS343