Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify restart functionality #135

Merged
merged 9 commits into from
Sep 20, 2023
2 changes: 1 addition & 1 deletion machines/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ $ ./submit-restart.sh <path to input file>.toml
```
will submit a job to run and post-process a restart using input file. The
simulation will restart from the last time point of the previous run
(`restart_moment_kinetics.jl` supports more flexibility, but for now you would
(`run_moment_kinetics.jl` supports more flexibility, but for now you would
need to write your own submission script to pass the options needed for that).

Default parameters for the runs (number of nodes, time limit, etc.) were set up
Expand Down
2 changes: 1 addition & 1 deletion machines/archer/jobscript-restart.template
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@ export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

echo "running INPUTFILE $(date)"

srun --distribution=block:block --hint=nomultithread --ntasks=$SLURM_NTASKS bin/julia -Jmoment_kinetics.so --project -O3 --check-bounds=no restart_moment_kinetics.jl INPUTFILE RESTARTFROM
srun --distribution=block:block --hint=nomultithread --ntasks=$SLURM_NTASKS bin/julia -Jmoment_kinetics.so --project -O3 --check-bounds=no run_moment_kinetics.jl --restart INPUTFILE RESTARTFROM

echo "finished INPUTFILE $(date)"
2 changes: 1 addition & 1 deletion machines/marconi/jobscript-restart.template
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ source julia.env

echo "running INPUTFILE $(date)"

mpirun -np $SLURM_NTASKS bin/julia -Jmoment_kinetics.so --project -O3 --check-bounds=no restart_moment_kinetics.jl INPUTFILE RESTARTFROM
mpirun -np $SLURM_NTASKS bin/julia -Jmoment_kinetics.so --project -O3 --check-bounds=no run_moment_kinetics.jl --restart INPUTFILE RESTARTFROM

echo "finished INPUTFILE $(date)"
7 changes: 0 additions & 7 deletions restart_moment_kinetics.jl

This file was deleted.

4 changes: 4 additions & 0 deletions src/command_line_options.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ const s = ArgParseSettings()
"--long"
help = "Include more tests, increasing test run time."
action = :store_true
"--restart"
help = "Restart from latest output file in run directory (ignored if " *
"`restartfile` is passed)"
action = :store_true
"--restart-time-index"
help = "Time index in output file to restart from, defaults to final time point"
arg_type = Int
Expand Down
177 changes: 59 additions & 118 deletions src/moment_kinetics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
module moment_kinetics

export run_moment_kinetics, restart_moment_kinetics
export run_moment_kinetics

using MPI

Expand Down Expand Up @@ -89,11 +89,13 @@ using .type_definitions: mk_int
"""
main function that contains all of the content of the program
"""
function run_moment_kinetics(to::TimerOutput, input_dict=Dict())
function run_moment_kinetics(to::TimerOutput, input_dict=Dict(); restart=false,
restart_time_index=-1)
mk_state = nothing
try
# set up all the structs, etc. needed for a run
mk_state = setup_moment_kinetics(input_dict)
mk_state = setup_moment_kinetics(input_dict; restart=restart,
restart_time_index=restart_time_index)

# solve the 1+1D kinetic equation to advance f in time by nstep time steps
if run_type == performance_test
Expand Down Expand Up @@ -136,27 +138,38 @@ end
"""
overload which takes a filename and loads input
"""
function run_moment_kinetics(to::TimerOutput, input_filename::String)
return run_moment_kinetics(to, read_input_file(input_filename))
function run_moment_kinetics(to::TimerOutput, input_filename::String; restart=false,
restart_time_index=-1)
return run_moment_kinetics(to, read_input_file(input_filename); restart=restart,
restart_time_index=restart_time_index)
end

"""
overload with no TimerOutput arguments
"""
function run_moment_kinetics(input)
return run_moment_kinetics(TimerOutput(), input)
function run_moment_kinetics(input; restart=false, restart_time_index=-1)
return run_moment_kinetics(TimerOutput(), input; restart=restart,
restart_time_index=restart_time_index)
end

"""
overload which gets the input file name from command line arguments
"""
function run_moment_kinetics()
inputfile = get_options()["inputfile"]
if inputfile == nothing
run_moment_kinetics(Dict())
options = get_options()
inputfile = options["inputfile"]
restart = options["restart"]
if options["restart_file"] !== nothing
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

replace "restart_file" with "restartfile" to be consistent with src/command_line_options.jl line 18

restart = options["restartfile"]
end
restart_time_index = options["restart_time_index"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

replace "restart_time_index" with "restart-time-index" to be consistent with line 35 of src/command_line_options.jl

if inputfile === nothing
this_input = Dict()
else
run_moment_kinetics(inputfile)
this_input = inputfile
end
run_moment_kinetics(this_input; restart=restart,
restart_time_index=restart_time_index)
end

"""
Expand Down Expand Up @@ -227,109 +240,6 @@ function get_backup_filename(filename)
backup_moments_filename, backup_prefix_iblock
end

"""
restart_moment_kinetics(input_filename::String,
restart_filename::Union{String,Nothing}=nothing,
time_index::Int=-1)

Restart moment kinetics from an existing run. Space/velocity-space resolution in the
input must be the same as for the original run.

`input_filename` is the input file to use.

`restart_filename` can be used to pick a particular distribution-functions-output file to
restart from. By default will use the most recent one (the one without the numerical
suffix) in the run directory.

`time_index` can be passed to select the time index from `restart_filename` to restart
from. By default the latest time point is used.
"""
function restart_moment_kinetics(input_filename::String,
                                 restart_filename::Union{String,Nothing}=nothing,
                                 time_index::Int=-1)
    # Load the settings from the input file, then hand over to the method that takes
    # the already-parsed input, forwarding the restart file and time index unchanged.
    parsed_input = read_input_file(input_filename)
    restart_moment_kinetics(parsed_input, restart_filename, time_index)

    return nothing
end
"""
    restart_moment_kinetics()

Restart a run using the options returned by `get_options()`.

Both the `"inputfile"` and `"restartfile"` options must be set; an error is raised if
either of them is `nothing`. The `"restart-time-index"` option is passed on as the time
index to restart from.
"""
function restart_moment_kinetics()
    cli_options = get_options()

    input_path = cli_options["inputfile"]
    input_path === nothing &&
        error("Must pass input file as first argument to restart a run.")

    restart_path = cli_options["restartfile"]
    restart_path === nothing &&
        error("Must pass output file to restart from as second argument.")

    restart_moment_kinetics(input_path, restart_path, cli_options["restart-time-index"])

    return nothing
end
"""
    restart_moment_kinetics(input_dict::Dict,
                            restart_filename::Union{String,Nothing}=nothing,
                            time_index::Int=-1)

Restart a run using settings that have already been read into `input_dict`.

If `restart_filename` is `nothing`, the file to restart from is searched for in the run
directory `joinpath(base_directory, run_name)` (`base_directory` defaults to `"runs"`),
using the pattern `<run_name>.dfns*.<ext>` where `<ext>` is `h5` or `cdf` depending on the
`binary_format` entry of the `"output"` section of `input_dict`. The first match is used.
An error is raised if `binary_format` is not recognized or if no file matches.

The files being restarted from are moved to the backup names returned by
`get_backup_filename` before the run is set up, so that they are not overwritten.

`time_index` is passed to `setup_moment_kinetics` as `restart_time_index`.

When running on more than one process, any error raised during the restart results in
`MPI.Abort` being called so that the remaining processes do not hang.
"""
function restart_moment_kinetics(input_dict::Dict,
                                 restart_filename::Union{String,Nothing}=nothing,
                                 time_index::Int=-1)

    if restart_filename === nothing
        run_name = input_dict["run_name"]
        base_directory = get(input_dict, "base_directory", "runs")
        output_dir = joinpath(base_directory, run_name)
        io_settings = get(input_dict, "output", Dict{String,Any}())
        binary_format = get(io_settings, "binary_format", hdf5)
        if binary_format === hdf5
            ext = "h5"
        elseif binary_format === netcdf
            ext = "cdf"
        else
            error("Unrecognized binary_format '$binary_format'")
        end

        # Check the result of the search explicitly: indexing an empty result would
        # only produce a BoundsError that does not say which file was being looked for.
        restart_pattern = joinpath(output_dir, run_name * ".dfns*." * ext)
        matching_files = glob(restart_pattern)
        if isempty(matching_files)
            error("No output file matching '$restart_pattern' was found to restart "
                  * "from.")
        end
        restart_filename = first(matching_files)
    end

    try
        # Move the output file being restarted from to make sure it doesn't get
        # overwritten.
        dfns_filename, backup_dfns_filename, parallel_io, moments_filename,
        backup_moments_filename, backup_prefix_iblock =
            get_backup_filename(restart_filename)
        # Ensure every process got the filenames and checked files exist before moving
        # files
        MPI.Barrier(comm_world)
        if (parallel_io && global_rank[] == 0) || (!parallel_io && block_rank[] == 0)
            mv(dfns_filename, backup_dfns_filename)
            mv(moments_filename, backup_moments_filename)
        end
        # Ensure files have been moved before any process tries to read from them
        MPI.Barrier(comm_world)

        # Set up all the structs, etc. needed for a run.
        mk_state = setup_moment_kinetics(input_dict,
                                         restart_prefix_iblock=backup_prefix_iblock,
                                         restart_time_index=time_index)

        try
            time_advance!(mk_state...)
        finally
            # clean up i/o and communications
            # NOTE(review): this passes the last three elements of `mk_state`; the
            # previous comment here only mentioned two (`io` and `cdf`) - confirm
            # against the signature of `cleanup_moment_kinetics!`.
            cleanup_moment_kinetics!(mk_state[end-2:end]...)
        end
    catch e
        # Stop code from hanging when running on multiple processes if only one of them
        # throws an error
        if global_size[] > 1
            println("Abort called on rank $(block_rank[]) due to error. Error message "
                    * "was:\n", e)
            MPI.Abort(comm_world, 1)
        end

        # Re-raise the exception currently being handled.
        rethrow()
    end

    return nothing
end

"""
Perform all the initialization steps for a run.

Expand All @@ -339,8 +249,8 @@ reload data from time index given by `restart_time_index` for a restart.
`debug_loop_type` and `debug_loop_parallel_dims` are used to force specific set ups for
parallel loop ranges, and are only used by the tests in `debug_test/`.
"""
function setup_moment_kinetics(input_dict::Dict; restart_prefix_iblock=nothing,
restart_time_index=-1,
function setup_moment_kinetics(input_dict::Dict;
restart=false, restart_time_index=-1,
debug_loop_type::Union{Nothing,NTuple{N,Symbol} where N}=nothing,
debug_loop_parallel_dims::Union{Nothing,NTuple{N,Symbol} where N}=nothing)

Expand Down Expand Up @@ -402,7 +312,7 @@ function setup_moment_kinetics(input_dict::Dict; restart_prefix_iblock=nothing,
allocate_pdf_and_moments(composition, r, z, vperp, vpa, vzeta, vr, vz,
evolve_moments, collisions, num_diss_params)

if restart_prefix_iblock === nothing
if restart === false
restarting = false
# initialize f(z,vpa) and the lowest three v-space moments (density(z), upar(z) and ppar(z)),
# each of which may be evolved separately depending on input choices.
Expand All @@ -416,10 +326,41 @@ function setup_moment_kinetics(input_dict::Dict; restart_prefix_iblock=nothing,
else
restarting = true

if restart === true
run_name = input_dict["run_name"]
base_directory = get(input_dict, "base_directory", "runs")
output_dir = joinpath(base_directory, run_name)
io_settings = get(input_dict, "output", Dict{String,Any}())
binary_format = get(io_settings, "binary_format", hdf5)
if binary_format === hdf5
ext = "h5"
elseif binary_format === netcdf
ext = "cdf"
else
error("Unrecognized binary_format '$binary_format'")
end
restart = glob(joinpath(output_dir, run_name * ".dfns*." * ext))[1]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line appears to be giving a BoundsError when the number of shared memory regions is greater than 1.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this code make any allowances for multiple .h5 or .cdf files to be copied and read? I cannot see how it does here.

Copy link
Collaborator

@mrhardman mrhardman Sep 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trying

println(joinpath(output_dir, run_name * ".dfns*." * ext))
println(glob(joinpath(output_dir, run_name * ".dfns*." * ext)))

to restart an example wall-bc_cheb_single_ion.toml with 8 shared memory regions (z_nelement/z_nelement_local = 8) and parallel_io = false

Gets

runs/wall-bc_cheb_single_ion/wall-bc_cheb_single_ion.dfns*.h5
String[]

It's pretty clear that this code is broken for some reason in this case. What should it be doing? What does the function `glob` do? I cannot find the definition.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

glob() finds files matching a pattern (name comes from *nix shell, e.g. https://tldp.org/LDP/abs/html/globbingref.html). The bounds error usually means there is no file matching the pattern it is looking for.

Some of the error messages can be a bit unhelpful. I tried to improve things a bit in a few commits in #128. Those would need merging if we merge this PR first - maybe I should just cherry-pick them into this branch...

end

# Move the output file being restarted from to make sure it doesn't get
# overwritten.
dfns_filename, backup_dfns_filename, parallel_io, moments_filename,
backup_moments_filename, backup_prefix_iblock =
get_backup_filename(restart)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would prefer not to use `restart` both as a logical and as a filename here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can agree it's maybe not the clearest to change the value of restart that was passed in as an argument. Will make a separate restart_filename variable that gets set appropriately.

# Ensure every process got the filenames and checked files exist before moving
# files
MPI.Barrier(comm_world)
if (parallel_io && global_rank[] == 0) || (!parallel_io && block_rank[] == 0)
mv(dfns_filename, backup_dfns_filename)
mv(moments_filename, backup_moments_filename)
end
# Ensure files have been moved before any process tries to read from them
MPI.Barrier(comm_world)

# Reload pdf and moments from an existing output file
code_time, previous_runs_info, restart_time_index =
reload_evolving_fields!(pdf, moments, boundary_distributions,
restart_prefix_iblock, restart_time_index,
backup_prefix_iblock, restart_time_index,
composition, r, z, vpa, vperp, vzeta, vr, vz)
_block_synchronize()
end
Expand Down
12 changes: 6 additions & 6 deletions src/post_processing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -373,12 +373,12 @@ are passed, the plots/movies are given names beginning with `compare_` and are c
in the `comparison_plots/` subdirectory.

By default plots output from all restarts in a directory. To select a single run, pass the
`run_index` argument - the value corresponds to the `_<i>` suffix given to output files by
`restart_moment_kinetics()`. `run_index` can be an integer (which is applied to all
directories in `prefix...`), or a tuple of integers (which should have the same length as
the number of directories passed to `prefix...`). Use `run_index=-1` to get the most
recent run (which does not have a `_<i>` suffix). Note that `run_index` is only used when
a directory (rather than the prefix of a specific output file) is passed to `prefix...`
`run_index` argument - the value corresponds to the `_<i>` suffix given to output files
when restarting. `run_index` can be an integer (which is applied to all directories in
`prefix...`), or a tuple of integers (which should have the same length as the number of
directories passed to `prefix...`). Use `run_index=-1` to get the most recent run (which
does not have a `_<i>` suffix). Note that `run_index` is only used when a directory
(rather than the prefix of a specific output file) is passed to `prefix...`
"""
function analyze_and_plot_data(prefix...; run_index=nothing)
if length(prefix) == 0
Expand Down
13 changes: 2 additions & 11 deletions submit-restart.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,19 +94,10 @@ RUNNAME=$(util/get-run-name.jl $INPUTFILE)
RUNDIR=runs/$RUNNAME/
mkdir -p $RUNDIR

# Get default file to restart from, which is the latest run in $RUNDIR
if [[ -z $RESTARTFROM ]]; then
# "shopt -s extglob" is needed to let us use the ?() syntax within a script
# (it doesn't seem to be needed in an interactive shell!). See
# https://www.linuxjournal.com/content/pattern-matching-bash
shopt -s extglob
RESTARTFROM=$(ls $RUNDIR/$RUNNAME.dfns*.?(h5|cdf) | head -n 1)
fi

if [[ $POSTPROC -eq 0 ]]; then
echo "Submitting $INPUTFILE for restart from $RESTARTFROM and post-processing..."
echo "Submitting $INPUTFILE for restart from '$RESTARTFROM' and post-processing..."
else
echo "Submitting $INPUTFILE for restart from $RESTARTFROM..."
echo "Submitting $INPUTFILE for restart from '$RESTARTFROM'..."
fi

# Create a submission script for the run
Expand Down