tweaks

CliMA · Jan 31, 2025 · be1f539 · be1f539
1 parent e0a305d
commit be1f539
Show file tree

Hide file tree

Showing 8 changed files with 136 additions and 85 deletions.
diff --git a/.buildkite/Project.toml b/.buildkite/Project.toml
@@ -6,6 +6,7 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 ClimaAnalysis = "29b5916a-a76c-4e73-9657-3c8fd22e65e6"
+ClimaCalibrate = "4347a170-ebd6-470c-89d3-5c705c0cacc2"
 ClimaComms = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d"
 ClimaCore = "d414da3d-4745-48bb-8d80-42e94e092884"
 ClimaCoreMakie = "908f55d8-4145-4867-9c14-5dad1a479e4d"
@@ -46,5 +47,5 @@ cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 [compat]
 ClimaAnalysis = "0.5.12"
 ClimaTimeSteppers = "0.7"
-Statistics = "1"
 Flux = "0.15"
+Statistics = "1"
diff --git a/Artifacts.toml b/Artifacts.toml
@@ -93,3 +93,6 @@ git-tree-sha1 = "839224a62b59d73073bdb9a5c55d3dc75e30fe33"
     [[ilamb_data.download]]
     sha256 = "64a9a344ebfbb0113014178a1f93a655c401431565907c07fd33aff8860b62d6"
     url = "https://caltech.box.com/shared/static/eii2bfwfp47axfeuysgxlgzbczz27u5g.gz"
+
+[era5_surface_fluxes]
+git-tree-sha1 = "f2cb69e41cba84991054162e794292f98bf0ae34"
diff --git a/experiments/calibration/global_bucket/climacalibrate_bucket/bucket_target_script.jl b/experiments/calibration/global_bucket/climacalibrate_bucket/bucket_target_script.jl
@@ -57,7 +57,7 @@ const FT = Float64;
 context = ClimaComms.context()
 device = ClimaComms.device()
 device_suffix = device isa ClimaComms.CPUSingleThreaded ? "cpu" : "gpu"
-root_path = "bucket_longrun_$(device_suffix)"
+root_path = "bucket_longrun"
 
 # Returns a 2 SVectors of (lat, lon) tuples for n random locations on the land
 # surface. The two sets of locations are designed to be used as a training and
@@ -126,6 +126,10 @@ function setup_prob(t0, tf, Δt, params, outdir; nelements = (101, 7))
     )
 
     # Set up parameters
+    p_names = collect(keys(params))
+    p_values = [params[name]["value"] for name in p_names]
+    params = (; zip(Symbol.(p_names), p_values)...)
+
     (; κ_soil, ρc_soil, f_bucket, W_f, p, z_0m) = params
     z_0b = z_0m
     τc = FT(Δt)
@@ -194,66 +198,70 @@ function setup_prob(t0, tf, Δt, params, outdir; nelements = (101, 7))
     return prob, SciMLBase.CallbackSet(driver_cb, diag_cb)
 end
 
-# Define the locations longitudes and latitudes
-# Needs to be defined once, both for g(θ) and ERA5 target
-t0 = 0.0
-tf = 60 * 60.0 * 24 * 366
-Δt = 900.0
-nelements = (101, 7)
-regridder_type = :InterpolationsRegridder
-radius = FT(6378.1e3)
-depth = FT(3.5)
-domain = ClimaLand.Domains.SphericalShell(;
-                                          radius = radius,
-                                          depth = depth,
-                                          nelements = nelements,
-                                          npolynomial = 1,
-                                          dz_tuple = FT.((1.0, 0.05)),
-                                         )
-surface_space = domain.space.surface
-training_locations, validation_locations =
-rand_locations(surface_space, regridder_type, 25)
-
-#     (; κ_soil, ρc_soil, f_bucket, W_f, p, z_0m) = params
-# The truth params = (;κ_soil = FT(1.5), ρc_soil = FT(2e6), f_bucket = FT(0.75), W_f = FT(0.2), p = FT(1), z_0m = FT(1e-2))
-# Read in the era5 datafile
-era5_ds = Dataset(joinpath(ClimaLand.Artifacts.era5_surface_data2008_path(),
-                           "era5_monthly_surface_fluxes_200801-200812.nc"))
-
-# Make the ERA5 target
-ERA5_target = []
-close = (x, y) -> abs(x - y) < 5e-1
-for (lon, lat) in training_locations
-    # Fetch slices of lhf and shf era5 data from the era5 dataset
-    lat_ind, lon_ind = findall((x) -> close(x, lat), era5_ds["latitude"][:])[1],
-    findall((x) -> close(x, lon + 180), era5_ds["longitude"][:])[1]
-    lhf_loc = vec(era5_ds["mslhf"][lon_ind, lat_ind, :][:, end, :])
-    shf_loc = vec(era5_ds["msshf"][lon_ind, lat_ind, :][:, end, :])
-
-    # Create Observation objects for lhf and shf
-    lhf_ERA5 = EKP.Observation(
-                              Dict(
-                                   "samples" => lhf_loc,
-                                   "covariances" => cov(lhf_loc) * EKP.I,
-                                   "names" => "lhf_$(lon)_$(lat)",
-                                  ),
-                             )
-
-    shf_ERA5 = EKP.Observation(
-                              Dict(
-                                   "samples" => shf_loc,
-                                   "covariances" => cov(shf_loc) * EKP.I,
-                                   "names" => "shf_$(lon)_$(lat)",
-                                  ),
-                             )
-
-    # Add the observations to the target
-    push!(ERA5_target, lhf_ERA5)
-    push!(ERA5_target, shf_ERA5)
-
-end
-
-full_obs_era5 = EKP.combine_observations(ERA5_target)
+# function target_and_locations()
+    # Define the locations longitudes and latitudes
+    # Needs to be defined once, both for g(θ) and ERA5 target
+    t0 = 0.0
+    tf = 60 * 60.0 * 24 * 366
+    Δt = 900.0
+    nelements = (101, 7)
+    regridder_type = :InterpolationsRegridder
+    radius = FT(6378.1e3)
+    depth = FT(3.5)
+    domain = ClimaLand.Domains.SphericalShell(;
+                                              radius = radius,
+                                              depth = depth,
+                                              nelements = nelements,
+                                              npolynomial = 1,
+                                              dz_tuple = FT.((1.0, 0.05)),
+                                             )
+    surface_space = domain.space.surface
+    locations, validation_locations =
+    rand_locations(surface_space, regridder_type, 25)
+
+    #     (; κ_soil, ρc_soil, f_bucket, W_f, p, z_0m) = params
+    # The truth params = (;κ_soil = FT(1.5), ρc_soil = FT(2e6), f_bucket = FT(0.75), W_f = FT(0.2), p = FT(1), z_0m = FT(1e-2))
+    # Read in the era5 datafile
+    era5_ds = Dataset(joinpath(ClimaLand.Artifacts.era5_surface_data2008_path(),
+                               "era5_monthly_surface_fluxes_200801-200812.nc"))
+
+    # Make the ERA5 target
+    ERA5_target = []
+    close = (x, y) -> abs(x - y) < 5e-1
+    for (lon, lat) in locations
+        # Fetch slices of lhf and shf era5 data from the era5 dataset
+        lat_ind, lon_ind = findall((x) -> close(x, lat), era5_ds["latitude"][:])[1],
+        findall((x) -> close(x, lon + 180), era5_ds["longitude"][:])[1]
+        lhf_loc = vec(era5_ds["mslhf"][lon_ind, lat_ind, :][:, end, :])
+        shf_loc = vec(era5_ds["msshf"][lon_ind, lat_ind, :][:, end, :])
+
+        # Create Observation objects for lhf and shf
+        lhf_ERA5 = EKP.Observation(
+                                   Dict(
+                                        "samples" => lhf_loc,
+                                        "covariances" => cov(lhf_loc) * EKP.I,
+                                        "names" => "lhf_$(lon)_$(lat)",
+                                       ),
+                                  )
+
+        shf_ERA5 = EKP.Observation(
+                                   Dict(
+                                        "samples" => shf_loc,
+                                        "covariances" => cov(shf_loc) * EKP.I,
+                                        "names" => "shf_$(lon)_$(lat)",
+                                       ),
+                                  )
+
+        # Add the observations to the target
+        push!(ERA5_target, lhf_ERA5)
+        push!(ERA5_target, shf_ERA5)
+
+    end
+
+    full_obs_era5 = EKP.combine_observations(ERA5_target)
+    observations = EKP.get_obs(full_obs_era5)
+#    return training_locations, observations
+# end
 
 # Things to consider:
 # - masking out the ocean when comparing to data

diff --git a/experiments/calibration/global_bucket/climacalibrate_bucket/calibrate.jl b/experiments/calibration/global_bucket/climacalibrate_bucket/calibrate.jl
@@ -1,26 +1,44 @@
 # The script included below contains the following functions:
 # rand_locations - returns n random locations on land (lon, lat)
 # setup_prob - config for the bucket model
-# it also returns the target used for calibration, full_obs_era5, which is
-# latent and sensible heat at the rand_locations stacked as a vector
-include("bucket_target_script.jl")
-observations = full_obs_era5
+# target_and_locations - returns locations on land and ERA5 LHF SHF obs at those locations
+using ClimaLand
+dir = pkgdir(ClimaLand)
+# locations, observations = target_and_locations()
+# Should observations be a vector or EKP.Observation type?
 
 # Now, we need a forward_model for ClimaCalibrate.
 # forward_model runs the model and generates a ClimaDiagnostic output directory
 # that will be used to generate the observation_map.
 # note that forward_model needs the setup_prob function defined above.
-import ClimaCalibrate: forward_model, parameter_path
+# IMPORTANT: forward_model needs to be defined as CAL.forward_model, as it adds a method
+import ClimaCalibrate: forward_model, parameter_path, path_to_ensemble_member
 import ClimaCalibrate as CAL
-include("forward_model.jl")
+using Distributed
+ntasks = 4
+addprocs(CAL.SlurmManager(ntasks))
+
+@everywhere begin
+    import ClimaCalibrate: forward_model, parameter_path, path_to_ensemble_member
+    import ClimaCalibrate as CAL
+
+    using ClimaLand
+    caldir = "calibration_output"
+    dir = pkgdir(ClimaLand)
+
+    include(joinpath(dir,"experiments/calibration/global_bucket/climacalibrate_bucket/bucket_target_script.jl"))
+    include(joinpath(dir,"experiments/calibration/global_bucket/climacalibrate_bucket/forward_model.jl"))
+end
 
 # Now we include the script observation_map.jl, which contains two functions:
-# process_member_data - makes the loss function we want from the forward_model output
-# observation_map - makes our G_ensemble, the loss for all n_ensemble parameters
-include("observation_map.jl")
+# process_member_data - makes the parameter to data map we want from the forward_model output for one ensemble member
+# observation_map - makes our G_ensemble, the parameter to data map for all n_ensemble parameters
+# IMPORTANT: observation_map needs to be defined as CAL.observation_map, as it adds a method
+include(joinpath(dir,"experiments/calibration/global_bucket/climacalibrate_bucket/observation_map.jl"))
 
 # Now that our functions are defined, we still need some to specify some arguments they require.
 # Parameters prior
+# Note: this could be provided in a .toml file. Not sure if it has to.
 prior_κ_soil = EKP.constrained_gaussian("κ_soil", 2, 1, 0, Inf);
 prior_ρc_soil = EKP.constrained_gaussian("ρc_soil", 4e6, 2e6, 0, Inf);
 prior_f_bucket = EKP.constrained_gaussian("f_bucket", 0.5, 0.3, 0, 1);
@@ -38,12 +56,15 @@ prior = EKP.combine_distributions([
 
 ensemble_size = 10
 n_iterations = 5
-noise = EKP.I # not sure what to do here, I had noise embeded in my observation_map function
+noise = 1.0*EKP.I # Should work, but this should be covariance of each month from observation (ERA5)
 
-output_dir = "calibration_output" # Should I create this folder?
+# This folder will contains ClimaCalibrate outputs - parameters ensemble at each iterations, etc.
+caldir = "calibration_output"
 
+# Note: what is the best way to test this?
+# Should this script be run as a slurm job? (I expect it takes ~ 5 hours or so to complete)
 CAL.calibrate(
-              CAL.WorkerBackend, # not sure what to do here
+              CAL.WorkerBackend,
               ensemble_size,
               n_iterations,
               observations,

diff --git a/experiments/calibration/global_bucket/climacalibrate_bucket/forward_model.jl b/experiments/calibration/global_bucket/climacalibrate_bucket/forward_model.jl
@@ -1,12 +1,15 @@
+import TOML as toml
+
 # Explain things here - tutorial style
-# (Nat: why is it CAL.forward_model, and not CAL.parameter_path?
 """
 
 """
 function CAL.forward_model(iteration, member)
-    ensemble_member_path = parameter_path(caldir, iteration, member)
-    params = toml.parsefile(ensemble_member_path) # should load a Dict, that needs to be converted to namedtuple
+    ensemble_member_path = path_to_ensemble_member(caldir, iteration, member)
+    params_path = parameter_path(caldir, iteration, member)
+    params = toml.parsefile(params_path) # should load a Dict, that needs to be converted to namedtuple
 
+    @info ensemble_member_path
     diagnostics_dir = joinpath(ensemble_member_path, "global_diagnostics")
     diagdir = ClimaUtilities.OutputPathGenerator.generate_output_path(
         diagnostics_dir,
@@ -17,4 +20,5 @@ function CAL.forward_model(iteration, member)
     timestepper = CTS.RK4()
     ode_algo = CTS.ExplicitAlgorithm(timestepper)
     SciMLBase.solve(prob, ode_algo; dt = Δt, callback = cb, adaptive = false)
+    return nothing
 end
diff --git a/experiments/calibration/global_bucket/climacalibrate_bucket/observation_map.jl b/experiments/calibration/global_bucket/climacalibrate_bucket/observation_map.jl
@@ -1,16 +1,16 @@
+using ClimaAnalysis
+import EnsembleKalmanProcesses as EKP
+
 # explain things here - tutorial style
-function process_member_data(root_path)
+function process_member_data(simdir)
 
-    simdir = ClimaAnalysis.SimDir(
-        joinpath(root_path, "global_diagnostics", "output_active"),
-    )
     lhf = get(simdir; short_name = "lhf")
     shf = get(simdir; short_name = "shf")
 
     # Initialize an empty list to store observations
     obs_list = []
     # Loop over each location
-    for (lon, lat) in training_locations
+    for (lon, lat) in locations
         # Slice lhf and shf at the given longitude and latitude
         lhf_loc = ClimaAnalysis.slice(lhf, lon = lon, lat = lat)
         shf_loc = ClimaAnalysis.slice(shf, lon = lon, lat = lat)
@@ -42,12 +42,12 @@ function process_member_data(root_path)
     return obs
 end
 
-function observation_map(iteration)
+function CAL.observation_map(iteration)
     single_member_dims = (1,)
     G_ensemble = Array{Float64}(undef, single_member_dims..., ensemble_size)
 
     for m in 1:ensemble_size
-        member_path = CAL.path_to_ensemble_member(caldir, iteration, m)
+        member_path = path_to_ensemble_member(caldir, iteration, m)
         simdir_path = joinpath(member_path, "global_diagnostics", "output_active")
         if isdir(simdir_path)
             simdir = SimDir(simdir_path)

diff --git a/run_calibration.sh b/run_calibration.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+#SBATCH --job-name=calibrate_bucket
+#SBATCH --output=output_%j.txt        # Output file (%j expands to job ID)
+#SBATCH --error=error_%j.txt          # Error file
+#SBATCH --nodes=1                     # Number of nodes
+#SBATCH --ntasks=4                    # Number of tasks (processes)
+#SBATCH --time=10:00:00               # Time limit hrs:min:sec
+#SBATCH --gpus-per-task=1
+
+# Run your command
+export CLIMACOMMS_DEVICE="CUDA"
+export CLIMACOMMS_CONTEXT="SINGLETON"
+
+julia --project=.buildkite/ experiments/calibration/global_bucket/climacalibrate_bucket/calibrate.jl
diff --git a/src/Artifacts.jl b/src/Artifacts.jl
@@ -41,7 +41,7 @@ NOTE: We should be able to use @clima_artifact here, but the folders on central
 seem to be named differently than the artifact name
 """
 function era5_surface_data2008_path(; context = nothing)
-    return "/groups/esm/ClimaArtifacts/artifacts/era5_surface_fluxes_2008"
+    return @clima_artifact("era5_surface_fluxes", context)
 end
 
 """