@@ -33,14 +33,24 @@ This is a template function that should be implemented for each component model.
3333get_model_cache (sim:: Interfacer.ComponentModelSimulation ) = nothing
3434
3535"""
36- checkpoint_model_state(sim::Interfacer.ComponentModelSimulation, comms_ctx::ClimaComms.AbstractCommsContext, t::Int; output_dir = "output")
36+ checkpoint_model_state(
37+ sim::Interfacer.ComponentModelSimulation,
38+ comms_ctx::ClimaComms.AbstractCommsContext,
39+ t::Int,
40+ prev_checkpoint_t::Int;
41+ output_dir = "output")
3742
3843Checkpoint the model state of a simulation to a HDF5 file at a given time, t (in seconds).
44+
45+ If a previous checkpoint exists, it is removed. This is to avoid accumulating
46+ many checkpoint files in the output directory. A value of -1 for `prev_checkpoint_t`
47+ is used to indicate that there is no previous checkpoint to remove.
3948"""
4049function checkpoint_model_state (
4150 sim:: Interfacer.ComponentModelSimulation ,
4251 comms_ctx:: ClimaComms.AbstractCommsContext ,
43- t:: Int ;
52+ t:: Int ,
53+ prev_checkpoint_t:: Int ;
4454 output_dir = " output" ,
4555)
4656 Y = get_model_prog_state (sim)
@@ -52,23 +62,36 @@ function checkpoint_model_state(
5262 CC. InputOutput. HDF5. write_attribute (checkpoint_writer. file, " time" , t)
5363 CC. InputOutput. write! (checkpoint_writer, Y, " model_state" )
5464 Base. close (checkpoint_writer)
55- return nothing
5665
66+ # Remove previous checkpoint if it exists
67+ prev_checkpoint_file = joinpath (output_dir, " checkpoint_$(nameof (sim)) _$(prev_checkpoint_t) .hdf5" )
68+ remove_checkpoint (prev_checkpoint_file, prev_checkpoint_t, comms_ctx)
69+ return nothing
5770end
5871
5972"""
60- checkpoint_model_cache(sim::Interfacer.ComponentModelSimulation, comms_ctx::ClimaComms.AbstractCommsContext, t::Int; output_dir = "output")
73+ checkpoint_model_cache(
74+ sim::Interfacer.ComponentModelSimulation,
75+ comms_ctx::ClimaComms.AbstractCommsContext,
76+ t::Int,
77+ prev_checkpoint_t::Int;
78+ output_dir = "output")
6179
6280Checkpoint the model cache to N JLD2 files at a given time, t (in seconds),
6381where N is the number of MPI ranks.
6482
6583Objects are saved to JLD2 files because caches are generally not ClimaCore
6684objects (and ClimaCore.InputOutput can only save `Field`s or `FieldVector`s).
85+
86+ If a previous checkpoint exists, it is removed. This is to avoid accumulating
87+ many checkpoint files in the output directory. A value of -1 for `prev_checkpoint_t`
88+ is used to indicate that there is no previous checkpoint to remove.
6789"""
6890function checkpoint_model_cache (
6991 sim:: Interfacer.ComponentModelSimulation ,
7092 comms_ctx:: ClimaComms.AbstractCommsContext ,
71- t:: Int ;
93+ t:: Int ,
94+ prev_checkpoint_t:: Int ;
7295 output_dir = " output" ,
7396)
7497 # Move p to CPU (because we cannot save CUArrays)
@@ -79,6 +102,10 @@ function checkpoint_model_cache(
79102 pid = ClimaComms. mypid (comms_ctx)
80103 output_file = joinpath (output_dir, " checkpoint_cache_$(pid) _$(nameof (sim)) _$t .jld2" )
81104 JLD2. jldsave (output_file, cache = p)
105+
106+ # Remove previous checkpoint if it exists
107+ prev_checkpoint_file = joinpath (output_dir, " checkpoint_cache_$(pid) _$(nameof (sim)) _$(prev_checkpoint_t) .jld2" )
108+ remove_checkpoint (prev_checkpoint_file, prev_checkpoint_t, comms_ctx)
82109 return nothing
83110end
84111
@@ -104,21 +131,31 @@ function checkpoint_sims(cs::Interfacer.CoupledSimulation)
104131 day = floor (Int, time / (60 * 60 * 24 ))
105132 sec = floor (Int, time % (60 * 60 * 24 ))
106133 output_dir = cs. dirs. checkpoints
134+ prev_checkpoint_t = cs. prev_checkpoint_t[]
107135 comms_ctx = ClimaComms. context (cs)
136+
108137 for sim in cs. model_sims
109138 if ! isnothing (Checkpointer. get_model_prog_state (sim))
110- Checkpointer. checkpoint_model_state (sim, comms_ctx, time; output_dir)
139+ Checkpointer. checkpoint_model_state (sim, comms_ctx, time, prev_checkpoint_t ; output_dir)
111140 end
112141 if ! isnothing (Checkpointer. get_model_cache (sim))
113- Checkpointer. checkpoint_model_cache (sim, comms_ctx, time; output_dir)
142+ Checkpointer. checkpoint_model_cache (sim, comms_ctx, time, prev_checkpoint_t ; output_dir)
114143 end
115144 end
145+
116146 # Checkpoint the Coupler fields
117147 pid = ClimaComms. mypid (comms_ctx)
118148 @info " Saving coupler fields to JLD2 on day $day second $sec "
119149 output_file = joinpath (output_dir, " checkpoint_coupler_fields_$(pid) _$time .jld2" )
120150 # Adapt to Array move fields to the CPU
121151 JLD2. jldsave (output_file, coupler_fields = CC. Adapt. adapt (Array, cs. fields))
152+
153+ # Remove previous Coupler fields checkpoint if it exists
154+ prev_checkpoint_file = joinpath (output_dir, " checkpoint_coupler_fields_$(pid) _$(prev_checkpoint_t) .jld2" )
155+ remove_checkpoint (prev_checkpoint_file, prev_checkpoint_t, comms_ctx)
156+
157+ # Update previous checkpoint time stored in the coupled simulation
158+ cs. prev_checkpoint_t[] = time
122159end
123160
124161"""
@@ -211,4 +248,19 @@ function t_start_from_checkpoint(checkpoint_dir)
211248 return parse (Int, match (restart_file_rx, latest_restart)[2 ])
212249end
213250
251+ """
252+ remove_checkpoint(prev_checkpoint_file, prev_checkpoint_t, comms_ctx)
253+
254+ Delete the provided checkpoint file on the root process and print a helpful
255+ info message. This can be used to remove intermediate checkpoints, to prevent
256+ saving excessively large amounts of output.
257+ """
258+ function remove_checkpoint (prev_checkpoint_file, prev_checkpoint_t, comms_ctx)
259+ if ClimaComms. iamroot (comms_ctx) && prev_checkpoint_t != - 1 && isfile (prev_checkpoint_file)
260+ @info " Removing previous checkpoint file: $prev_checkpoint_file "
261+ rm (prev_checkpoint_file)
262+ end
263+ return nothing
264+ end
265+
214266end # module
0 commit comments