Skip to content

Commit

Permalink
Merge branch 'master' into multipole-boundary-data
Browse files Browse the repository at this point in the history
  • Loading branch information
mrhardman committed Nov 15, 2024
2 parents 42c7806 + d8b8fd7 commit 5eb60c2
Show file tree
Hide file tree
Showing 27 changed files with 957 additions and 873 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/examples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ jobs:
touch Project.toml
julia -O3 --project -e 'import Pkg; Pkg.develop(path="moment_kinetics/"); Pkg.add("NCDatasets"); Pkg.precompile()'
# Reduce nstep for each example to 10 to avoid the CI job taking too long
julia -O3 --project -e 'using moment_kinetics; for (root, dirs, files) in walkdir("examples") for file in files if endswith(file, ".toml") filename = joinpath(root, file); println(filename); input = moment_kinetics.moment_kinetics_input.read_input_file(filename); t_input = get(input, "timestepping", Dict{String,Any}()); t_input["nstep"] = 10; t_input["dt"] = 1.0e-12; input["timestepping"] = t_input; pop!(get(input, "z", Dict{String,Any}()), "nelement_local", ""); pop!(get(input, "r", Dict{String,Any}()), "nelement_local", ""); electron_t_input = get(input, "electron_timestepping", Dict{String,Any}()); electron_t_input["initialization_residual_value"] = 1.0e8; electron_t_input["converged_residual_value"] = 1.0e8; input["electron_timestepping"] = electron_t_input; nl_solver_input = get(input, "nonlinear_solver", Dict{String,Any}()); nl_solver_input["rtol"] = 1.0e6; nl_solver_input["atol"] = 1.0e6; input["nonlinear_solver"] = nl_solver_input; run_moment_kinetics(input) end end end'
julia -O3 --project -e 'using moment_kinetics; using moment_kinetics.type_definitions: OptionsDict; for (root, dirs, files) in walkdir("examples") for file in files if endswith(file, ".toml") filename = joinpath(root, file); println(filename); input = moment_kinetics.moment_kinetics_input.read_input_file(filename); t_input = get(input, "timestepping", OptionsDict()); t_input["nstep"] = 10; t_input["dt"] = 1.0e-12; input["timestepping"] = t_input; pop!(get(input, "z", OptionsDict()), "nelement_local", ""); pop!(get(input, "r", OptionsDict()), "nelement_local", ""); electron_t_input = get(input, "electron_timestepping", OptionsDict()); electron_t_input["initialization_residual_value"] = 1.0e8; electron_t_input["converged_residual_value"] = 1.0e8; input["electron_timestepping"] = electron_t_input; nl_solver_input = get(input, "nonlinear_solver", OptionsDict()); nl_solver_input["rtol"] = 1.0e6; nl_solver_input["atol"] = 1.0e6; input["nonlinear_solver"] = nl_solver_input; run_moment_kinetics(input) end end end'
16 changes: 9 additions & 7 deletions .github/workflows/parallel_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ permissions:
jobs:
test-ubuntu:
runs-on: ubuntu-latest
timeout-minutes: 150
timeout-minutes: 180

steps:
- uses: actions/checkout@v4
Expand All @@ -21,14 +21,15 @@ jobs:
- uses: julia-actions/cache@v2
- run: |
touch Project.toml
julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["MPI", "MPIPreferences"]); using MPIPreferences; MPIPreferences.use_jll_binary("OpenMPI_jll")'
julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["MPI", "MPIPreferences", "PackageCompiler"]); using MPIPreferences; MPIPreferences.use_jll_binary("OpenMPI_jll")'
julia --project -O3 --check-bounds=no -e 'using MPI; MPI.install_mpiexecjl(; destdir=".")'
julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["NCDatasets", "Random", "SpecialFunctions", "StatsBase", "Test"]); Pkg.develop(path="moment_kinetics/")'
julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.precompile()'
julia --project -O3 --check-bounds=no precompile.jl
# Need to use openmpi so that we can use `--oversubscribe` to allow using more MPI ranks than physical cores
./mpiexecjl -np 3 --oversubscribe julia --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1
./mpiexecjl -np 4 --oversubscribe julia --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1
./mpiexecjl -np 2 --oversubscribe julia --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1 --long
./mpiexecjl -np 3 --oversubscribe julia -J moment_kinetics.so --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1
./mpiexecjl -np 4 --oversubscribe julia -J moment_kinetics.so --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1
./mpiexecjl -np 2 --oversubscribe julia -J moment_kinetics.so --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1 --long
# Note: MPI.jl's default implementation is mpich, which has a similar option
# `--with-device=ch3:sock`, but that needs to be set when compiling mpich.
shell: bash
Expand All @@ -46,12 +47,13 @@ jobs:
- uses: julia-actions/cache@v2
- run: |
touch Project.toml
julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["MPI", "MPIPreferences"]); using MPIPreferences; MPIPreferences.use_jll_binary("OpenMPI_jll")'
julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["MPI", "MPIPreferences", "PackageCompiler"]); using MPIPreferences; MPIPreferences.use_jll_binary("OpenMPI_jll")'
julia --project -O3 --check-bounds=no -e 'using MPI; MPI.install_mpiexecjl(; destdir=".")'
julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["NCDatasets", "Random", "SpecialFunctions", "StatsBase", "Test"]); Pkg.develop(path="moment_kinetics/")'
julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.precompile()'
julia --project -O3 --check-bounds=no precompile.jl
# Need to use openmpi so that we can use `--oversubscribe` to allow using more MPI ranks than physical cores
./mpiexecjl -np 4 --oversubscribe julia --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1
./mpiexecjl -np 4 --oversubscribe julia -J moment_kinetics.so --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1
# Note: MPI.jl's default implementation is mpich, which has a similar option
# `--with-device=ch3:sock`, but that needs to be set when compiling mpich.
shell: bash
54 changes: 54 additions & 0 deletions docs/src/developing.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,18 @@ String and as a tree of HDF5 variables.
use `nothing` as a default for some setting, that is fine, but must be done
after the input is read, and not stored in the `input_dict`.

!!! warning "Parallel I/O consistency"
To ensure consistency between all MPI ranks in the order of reads and/or
writes when using Parallel I/O, all dictionary types used to store options
must be either `OrderedDict` or `SortedDict`, so that their order of
entries is deterministic (which is not the case for `Dict`, which instead
optimises for look-up speed). This should mostly be taken care of by using
`moment_kinetics`'s `OptionsDict` type (which is an alias for
`OrderedDict{String,Any}`). We also need to sort the input after it is read
by `TOML`, which is taken care of by
[`moment_kinetics.input_structs.convert_to_sorted_nested_OptionsDict`](@ref).
See also [Parallel I/O](@ref parallel_io_section).


## Array types

Expand Down Expand Up @@ -219,6 +231,48 @@ communicator is `comm_block[]`).
See also notes on debugging the 'anyv' parallelisation: [Collision operator and
'anyv' region](@ref).

## [Parallel I/O](@id parallel_io_section)

The code provides an option to use parallel I/O, which allows all output to be
written to a single output file even when using distributed-MPI parallelism.
This is the default option when the linked HDF5 library is compiled with
parallel-I/O support.

There are a few things to be aware of to ensure parallel I/O works correctly:
* Some operations have to be called simultaneously on all the MPI ranks that
have the output file open. Roughly, these are any operations that change the
'metadata' of the file, for example opening/closing files, creating
variables, extending dimensions of variables, changing attributes of
variables. Reading or writing the data of a variable does not have to be
done collectively. In practice, when we write data we ensure that every
rank writes a non-overlapping slice of the array, both to avoid contention
that could slow down the I/O (because one rank would have to wait for
another) and to avoid slight inconsistencies arising from uncertainty about
which rank writes the data last. For more details see the [HDF5.jl
documentation](https://juliaio.github.io/HDF5.jl/stable/mpi/#Reading-and-writing-data-in-parallel)
and the [HDF5
documentation](https://support.hdfgroup.org/archive/support/HDF5/doc/RM/CollectiveCalls.html).
* One important subtlety is that the `Dict` type does not guarantee a
deterministic order of entries. When you iterate over a `Dict`, you can get
the results in a different order at different times or on different MPI
ranks. If we iterated over a `Dict` to create variables to write to an output
file, or to read from a file, then different MPI ranks might (sometimes) get
the variables in a different order, causing errors. We therefore use either
`OrderedDict` or `SortedDict` types for anything that might be written to or
read from an HDF5 file.

If the collective operations are not done perfectly consistently, the errors
can be extremely non-obvious. The inconsistent operations may appear to execute
correctly, for example because the same number of variables are created, and
the metadata may only actually be written from the rank-0 process, but the
inconsistency may cause errors later. [JTO, 3/11/2024: my best guess as to the
reason for this is that it puts HDF5's 'metadata cache' in inconsistent states
on different ranks, and this means that at some later time the ranks will cycle
some metadata out of the cache in different orders, and then some ranks will be
able to get the metadata from the cache, while others have to read it from the
file. The reading from the file requires some collective MPI call, which is
only called from some ranks and not others, causing the code to hang.]

## Package structure

The structure of the packages in the `moment_kinetics` repo is set up so that
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
[evolve_moments]
parallel_pressure = true
density = true
moments_conservation = true
parallel_flow = true

[reactions]
electron_ionization_frequency = 0.0
ionization_frequency = 0.5
charge_exchange_frequency = 0.75

[r]
ngrid = 1
nelement = 1

[z]
ngrid = 5
discretization = "gausslegendre_pseudospectral"
nelement = 32
#nelement_local = 4
bc = "periodic"
element_spacing_option = "compressed_4"

[vpa]
ngrid = 6
discretization = "gausslegendre_pseudospectral"
nelement = 31
L = 30.0
element_spacing_option = "coarse_tails"
bc = "zero"

[vz]
ngrid = 6
discretization = "gausslegendre_pseudospectral"
nelement = 31
L = 36.0
element_spacing_option = "coarse_tails"
bc = "zero"

[composition]
T_e = 0.2
electron_physics = "kinetic_electrons"
n_ion_species = 1
n_neutral_species = 1

[ion_species_1]
initial_temperature = 0.2
initial_density = 1.0

[z_IC_ion_species_1]
initialization_option = "sinusoid"
density_amplitude = 0.0 #0.2
temperature_amplitude = 0.3
density_phase = 0.0
upar_amplitude = 0.0 #0.1
temperature_phase = 1.0
upar_phase = 2.0

[vpa_IC_ion_species_1]
initialization_option = "gaussian"
density_amplitude = 1.0
temperature_amplitude = 0.0
density_phase = 0.0
upar_amplitude = 0.0
temperature_phase = 0.0
upar_phase = 0.0

[neutral_species_1]
initial_temperature = 0.2
initial_density = 1.0

[z_IC_neutral_species_1]
initialization_option = "sinusoid"
temperature_amplitude = 0.0
density_amplitude = 0.0
density_phase = 0.0
upar_amplitude = 0.0
temperature_phase = 0.0
upar_phase = 0.0

[vz_IC_neutral_species_1]
initialization_option = "gaussian"
density_amplitude = 1.0
temperature_amplitude = 0.0
density_phase = 0.0
upar_amplitude = 0.0
temperature_phase = 0.0
upar_phase = 0.0

[krook_collisions]
use_krook = true

[timestepping]
type = "PareschiRusso2(2,2,2)"
implicit_electron_advance = false
implicit_electron_ppar = true
implicit_ion_advance = false
implicit_vpa_advection = false
nstep = 100000
dt = 1.0e-5
nwrite = 1000
nwrite_dfns = 1000
steady_state_residual = true
converged_residual_value = 1.0e-3

#write_after_fixed_step_count = true
#nstep = 1
#nwrite = 1
#nwrite_dfns = 1

[electron_timestepping]
nstep = 5000000
#nstep = 1
dt = 2.0e-8
#maximum_dt = 1.0e-8
nwrite = 10 #10000
nwrite_dfns = 10 #100000
#type = "SSPRK4"
type = "Fekete4(3)"
rtol = 1.0e-3
atol = 1.0e-14
minimum_dt = 1.0e-9
decrease_dt_iteration_threshold = 100
increase_dt_iteration_threshold = 20
cap_factor_ion_dt = 5.0
initialization_residual_value = 2.5
converged_residual_value = 1.0e-2

#debug_io = 1

[nonlinear_solver]
nonlinear_max_iterations = 100
rtol = 1.0e-6 #1.0e-8
atol = 1.0e-14 #1.0e-16
linear_restart = 5
preconditioner_update_interval = 100

[ion_numerical_dissipation]
#vpa_dissipation_coefficient = 1.0e-1
#vpa_dissipation_coefficient = 1.0e-2
#vpa_dissipation_coefficient = 1.0e-3
force_minimum_pdf_value = 0.0

[electron_numerical_dissipation]
#vpa_dissipation_coefficient = 2.0
force_minimum_pdf_value = 0.0

[neutral_numerical_dissipation]
#vz_dissipation_coefficient = 1.0e-1
#vz_dissipation_coefficient = 1.0e-2
#vz_dissipation_coefficient = 1.0e-3
force_minimum_pdf_value = 0.0
Loading

0 comments on commit 5eb60c2

Please sign in to comment.