From 5cd09f347a5122d56465c672ef9e8524071b2f1f Mon Sep 17 00:00:00 2001 From: "Bryn N. Ubald" <55503826+bnubald@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:05:40 +0000 Subject: [PATCH 01/44] Fixes #14: Dynamically generate seed block in yaml --- .gitignore | 1 + ENVS.example | 11 +++++++++++ ensemble/predict.tmpl.yaml | 4 +--- ensemble/train.tmpl.yaml | 11 +---------- run_predict_ensemble.sh | 36 +++++++++++++++++++++++++++++++++++- run_train_ensemble.sh | 37 ++++++++++++++++++++++++++++++++++++- 6 files changed, 85 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 0d668a7..b49d126 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ loader.*.json *.npy *.out tmp.* +*.swp *test* *.png diff --git a/ENVS.example b/ENVS.example index 58a711f..0850cca 100644 --- a/ENVS.example +++ b/ENVS.example @@ -61,6 +61,12 @@ DEMO_PIPELINE_VAL_END="2022-2-14" DEMO_PIPELINE_TEST_START="2022-2-15" DEMO_PIPELINE_TEST_END="2022-2-28" +## +# Training & Prediction ensemble run seeds +# +SMALL_DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" +SMALL_DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" + ## # The prefix to use for training date ranges # @@ -83,6 +89,9 @@ VAL_END_NAME="${PREFIX}_VAL_END" TEST_START_NAME="${PREFIX}_TEST_START" TEST_END_NAME="${PREFIX}_TEST_END" +ENSEMBLE_TRAIN_SEEDS_NAME="${PREFIX}_ENSEMBLE_TRAIN_SEEDS" +ENSEMBLE_PREDICT_SEEDS_NAME="${PREFIX}_ENSEMBLE_PREDICT_SEEDS" + # What are we exporting export TRAIN_START=${!TRAIN_START_NAME} @@ -92,3 +101,5 @@ export VAL_END=${!VAL_END_NAME} export TEST_START=${!TEST_START_NAME} export TEST_END=${!TEST_END_NAME} +export ENSEMBLE_TRAIN_SEEDS=${!ENSEMBLE_TRAIN_SEEDS_NAME} +export ENSEMBLE_PREDICT_SEEDS=${!ENSEMBLE_PREDICT_SEEDS_NAME} diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index 2916b3c..f960174 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -40,9 +40,7 @@ ensemble: cmd: /usr/bin/ln -s ../../data pre_run: [] runs: - - seed: 42 - - seed: 46 - 
- seed: 45 + - seed: SEEDS post_run: [] post_batch: - name: execute diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 4f8af88..0bed0bb 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -44,16 +44,7 @@ ensemble: pre_batch: [] pre_run: [] runs: - - seed: 42 - - seed: 46 - - seed: 45 - - seed: 17 - - seed: 24 - - seed: 84 - - seed: 83 - - seed: 16 - - seed: 5 - - seed: 3 + - seed: SEEDS post_run: [] post_batch: - name: execute diff --git a/run_predict_ensemble.sh b/run_predict_ensemble.sh index 5e834d8..280cce6 100755 --- a/run_predict_ensemble.sh +++ b/run_predict_ensemble.sh @@ -14,8 +14,9 @@ ENSEMBLE_TARGET="slurm" ENSEMBLE_SWITCH="" ENSEMBLE_ARGS="" TRAIN_IDENT="" +ENSEMBLE_SEEDS_DEFAULT=42,46,45 -while getopts ":b:df:i:lm:p:x" opt; do +while getopts ":b:df:i:lm:p:r:x" opt; do case "$opt" in b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";; d) ENSEMBLE_TARGET="dummy";; @@ -24,6 +25,7 @@ while getopts ":b:df:i:lm:p:x" opt; do l) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_testset=false ";; m) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}mem=$OPTARG ";; p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_prep=$OPTARG ";; + r) ENSEMBLE_RUNS=$OPTARG ;; # Ensemble member run seed values x) DO_NOT_EXECUTE=1 esac done @@ -52,11 +54,43 @@ ln -s `realpath ${DATEFILE}` ensemble/${NAME}/predict_dates.csv PREDICT_CONFIG=`mktemp -p . --suffix ".predict"` +## +# Dynamically generate seeds for ensemble run. +# + +IFS="," read -ra SEEDS <<< "$ENSEMBLE_RUNS" + +# Check if seeds defined as CLI args (e.g. 
`-r 42,46`) +if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_PREDICT_SEEDS" + # Check if seeds defined in ENVS exported variables (else use defaults) + if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_SEEDS_DEFAULT" + fi +fi + +# Generate seed lines for yaml output +ENSEMBLE_SEEDS="" +COUNTER=0 +for seed in ${SEEDS[@]} +do + ENSEMBLE_SEEDS+=" - seed: "$seed + if [ $COUNTER -lt $((${#SEEDS[@]}-1)) ]; then + ENSEMBLE_SEEDS+="\n" + fi + ((COUNTER++)) +done + +echo "No. of ensemble members: " "${#SEEDS[@]}" +printf -v joined '%s,' "${SEEDS[@]}" +echo "Ensemble members: " "${joined%,}" + sed -r \ -e "s/NETWORK/${NETWORK}/g" \ -e "s/DATASET/${DATASET}/g" \ -e "s/LOADER/${LOADER}/g" \ -e "s/NAME/${NAME}/g" \ + -e "/\bSEEDS$/s/.*/${ENSEMBLE_SEEDS}/g" \ ensemble/predict.tmpl.yaml >$PREDICT_CONFIG COMMAND="model_ensemble $PREDICT_CONFIG $ENSEMBLE_TARGET $ENSEMBLE_SWITCH $ENSEMBLE_ARGS" diff --git a/run_train_ensemble.sh b/run_train_ensemble.sh index 3de09bb..2c86b12 100755 --- a/run_train_ensemble.sh +++ b/run_train_ensemble.sh @@ -9,13 +9,15 @@ fi echo "ARGS: $@" +# Defaults if not specified ENSEMBLE_TARGET="slurm" ENSEMBLE_SWITCH="" ENSEMBLE_ARGS="" ENSEMBLE_JOBS=1 ENSEMBLE_NTASKS=4 +ENSEMBLE_SEEDS_DEFAULT=42,46,45,17,24,84,83,16,5,3 -while getopts ":b:c:de:f:g:j:l:m:n:p:q:s:t:" opt; do +while getopts ":b:c:de:f:g:j:l:m:n:p:q:r:s:t:" opt; do case "$opt" in b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";; c) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}cluster=$OPTARG ";; @@ -29,6 +31,7 @@ while getopts ":b:c:de:f:g:j:l:m:n:p:q:s:t:" opt; do n) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}nodelist=$OPTARG ";; p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_prep=$OPTARG ";; q) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_queue=$OPTARG ";; + r) ENSEMBLE_RUNS=$OPTARG ;; # Ensemble member run seed values s) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_strategy=$OPTARG ";; t) ENSEMBLE_NTASKS=$OPTARG ;; esac @@ -47,12 +50,44 @@ NAME="$3" TRAIN_CONFIG=`mktemp -p . 
--suffix ".train"` +## +# Dynamically generate seeds for ensemble run. +# + +IFS="," read -ra SEEDS <<< "$ENSEMBLE_RUNS" + +# Check if seeds defined as CLI args (e.g. `-r 42,46`) +if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_TRAIN_SEEDS" + # Check if seeds defined in ENVS exported variables (else use defaults) + if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_SEEDS_DEFAULT" + fi +fi + +# Generate seed lines for yaml output +ENSEMBLE_SEEDS="" +COUNTER=0 +for seed in ${SEEDS[@]} +do + ENSEMBLE_SEEDS+=" - seed: "$seed + if [ $COUNTER -lt $((${#SEEDS[@]}-1)) ]; then + ENSEMBLE_SEEDS+="\n" + fi + ((COUNTER++)) +done + +echo "No. of ensemble members: " "${#SEEDS[@]}" +printf -v joined '%s,' "${SEEDS[@]}" +echo "Ensemble members: " "${joined%,}" + sed -r \ -e "s/NAME/${NAME}/g" \ -e "s/LOADER/${LOADER}/g" \ -e "s/DATASET/${DATASET}/g" \ -e "s/MAXJOBS/${ENSEMBLE_JOBS}/g" \ -e "s/NTASKS/${ENSEMBLE_NTASKS}/g" \ + -e "/\bSEEDS$/s/.*/${ENSEMBLE_SEEDS}/g" \ ensemble/train.tmpl.yaml >$TRAIN_CONFIG COMMAND="model_ensemble $TRAIN_CONFIG $ENSEMBLE_TARGET $ENSEMBLE_SWITCH $ENSEMBLE_ARGS" From 5e86f996b0252af0b81064b79f5c7f66b47cb2c2 Mon Sep 17 00:00:00 2001 From: "Bryn N. 
Ubald" <55503826+bnubald@users.noreply.github.com> Date: Thu, 7 Mar 2024 11:25:32 +0000 Subject: [PATCH 02/44] Fixes #36: Rename seed variables in ENVS.example --- ENVS.example | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ENVS.example b/ENVS.example index 0850cca..715d973 100644 --- a/ENVS.example +++ b/ENVS.example @@ -64,8 +64,8 @@ DEMO_PIPELINE_TEST_END="2022-2-28" ## # Training & Prediction ensemble run seeds # -SMALL_DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" -SMALL_DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" +DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" +DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" ## # The prefix to use for training date ranges From d2098e9b62db5640f5a9a3fa42dcb389bce2b957 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 22 Mar 2024 16:57:18 +0000 Subject: [PATCH 03/44] Dev #38: adding support for incremental HPC environment installation from cloned tensorflow-gpu --- environment.dawn.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 environment.dawn.yml diff --git a/environment.dawn.yml b/environment.dawn.yml new file mode 100644 index 0000000..2713e21 --- /dev/null +++ b/environment.dawn.yml @@ -0,0 +1,11 @@ +channels: + - conda-forge + - defaults +dependencies: + - cartopy + - eccodes + - ffmpeg + - hdf5 + - netcdf4 + - openh264 + - xarray From bef3bd7f9c59b7c1218250f3d52ed0a47555d027 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Sat, 23 Mar 2024 07:22:46 +0000 Subject: [PATCH 04/44] Dev #38: sticking in some stubs for dawn use --- ensemble/predict.tmpl.yaml | 2 +- ensemble/template/dawn.sh | 5 +++++ ensemble/train.tmpl.yaml | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 ensemble/template/dawn.sh diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index f960174..55ad34d 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -14,6 +14,7 @@ ensemble: - ../../../processed - ../../../results mem: 224gb + cluster: pvc 
pre_process: [] post_process: [] @@ -24,7 +25,6 @@ ensemble: - icenet_predict.sh.j2 email: someone@example.com job_file: icenet_predict.sh - cluster: short nodes: 1 ntasks: 8 length: 00:30:00 diff --git a/ensemble/template/dawn.sh b/ensemble/template/dawn.sh new file mode 100644 index 0000000..e73154b --- /dev/null +++ b/ensemble/template/dawn.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +module purge +module load default-dawn +module load intelpython-conda diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 0bed0bb..0a7e1c0 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -17,6 +17,8 @@ ensemble: - ../../../results gpus: 1 mem: 128gb + cluster: pvc + nodes: 1 pre_process: - name: execute @@ -31,8 +33,6 @@ ensemble: - icenet_train.sh.j2 email: someone@example.com job_file: icenet_train.sh - cluster: gpu - nodes: 1 ntasks: NTASKS length: 4-00:00:00 maxruns: 5 From 2fd5ae4e99306fee0da5bb2d76a751663dcfe45a Mon Sep 17 00:00:00 2001 From: "Bryn N. Ubald" <55503826+bnubald@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:05:40 +0000 Subject: [PATCH 05/44] Fixes #14: Dynamically generate seed block in yaml --- .gitignore | 1 + ENVS.example | 11 +++++++++++ ensemble/predict.tmpl.yaml | 4 +--- ensemble/train.tmpl.yaml | 11 +---------- run_predict_ensemble.sh | 36 +++++++++++++++++++++++++++++++++++- run_train_ensemble.sh | 37 ++++++++++++++++++++++++++++++++++++- 6 files changed, 85 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 0d668a7..b49d126 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ loader.*.json *.npy *.out tmp.* +*.swp *test* *.png diff --git a/ENVS.example b/ENVS.example index 58a711f..0850cca 100644 --- a/ENVS.example +++ b/ENVS.example @@ -61,6 +61,12 @@ DEMO_PIPELINE_VAL_END="2022-2-14" DEMO_PIPELINE_TEST_START="2022-2-15" DEMO_PIPELINE_TEST_END="2022-2-28" +## +# Training & Prediction ensemble run seeds +# +SMALL_DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" 
+SMALL_DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" + ## # The prefix to use for training date ranges # @@ -83,6 +89,9 @@ VAL_END_NAME="${PREFIX}_VAL_END" TEST_START_NAME="${PREFIX}_TEST_START" TEST_END_NAME="${PREFIX}_TEST_END" +ENSEMBLE_TRAIN_SEEDS_NAME="${PREFIX}_ENSEMBLE_TRAIN_SEEDS" +ENSEMBLE_PREDICT_SEEDS_NAME="${PREFIX}_ENSEMBLE_PREDICT_SEEDS" + # What are we exporting export TRAIN_START=${!TRAIN_START_NAME} @@ -92,3 +101,5 @@ export VAL_END=${!VAL_END_NAME} export TEST_START=${!TEST_START_NAME} export TEST_END=${!TEST_END_NAME} +export ENSEMBLE_TRAIN_SEEDS=${!ENSEMBLE_TRAIN_SEEDS_NAME} +export ENSEMBLE_PREDICT_SEEDS=${!ENSEMBLE_PREDICT_SEEDS_NAME} diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index 2916b3c..f960174 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -40,9 +40,7 @@ ensemble: cmd: /usr/bin/ln -s ../../data pre_run: [] runs: - - seed: 42 - - seed: 46 - - seed: 45 + - seed: SEEDS post_run: [] post_batch: - name: execute diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 4f8af88..0bed0bb 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -44,16 +44,7 @@ ensemble: pre_batch: [] pre_run: [] runs: - - seed: 42 - - seed: 46 - - seed: 45 - - seed: 17 - - seed: 24 - - seed: 84 - - seed: 83 - - seed: 16 - - seed: 5 - - seed: 3 + - seed: SEEDS post_run: [] post_batch: - name: execute diff --git a/run_predict_ensemble.sh b/run_predict_ensemble.sh index 5e834d8..280cce6 100755 --- a/run_predict_ensemble.sh +++ b/run_predict_ensemble.sh @@ -14,8 +14,9 @@ ENSEMBLE_TARGET="slurm" ENSEMBLE_SWITCH="" ENSEMBLE_ARGS="" TRAIN_IDENT="" +ENSEMBLE_SEEDS_DEFAULT=42,46,45 -while getopts ":b:df:i:lm:p:x" opt; do +while getopts ":b:df:i:lm:p:r:x" opt; do case "$opt" in b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";; d) ENSEMBLE_TARGET="dummy";; @@ -24,6 +25,7 @@ while getopts ":b:df:i:lm:p:x" opt; do l) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_testset=false ";; m) 
ENSEMBLE_ARGS="${ENSEMBLE_ARGS}mem=$OPTARG ";; p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_prep=$OPTARG ";; + r) ENSEMBLE_RUNS=$OPTARG ;; # Ensemble member run seed values x) DO_NOT_EXECUTE=1 esac done @@ -52,11 +54,43 @@ ln -s `realpath ${DATEFILE}` ensemble/${NAME}/predict_dates.csv PREDICT_CONFIG=`mktemp -p . --suffix ".predict"` +## +# Dynamically generate seeds for ensemble run. +# + +IFS="," read -ra SEEDS <<< "$ENSEMBLE_RUNS" + +# Check if seeds defined as CLI args (e.g. `-r 42,46`) +if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_PREDICT_SEEDS" + # Check if seeds defined in ENVS exported variables (else use defaults) + if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_SEEDS_DEFAULT" + fi +fi + +# Generate seed lines for yaml output +ENSEMBLE_SEEDS="" +COUNTER=0 +for seed in ${SEEDS[@]} +do + ENSEMBLE_SEEDS+=" - seed: "$seed + if [ $COUNTER -lt $((${#SEEDS[@]}-1)) ]; then + ENSEMBLE_SEEDS+="\n" + fi + ((COUNTER++)) +done + +echo "No. of ensemble members: " "${#SEEDS[@]}" +printf -v joined '%s,' "${SEEDS[@]}" +echo "Ensemble members: " "${joined%,}" + sed -r \ -e "s/NETWORK/${NETWORK}/g" \ -e "s/DATASET/${DATASET}/g" \ -e "s/LOADER/${LOADER}/g" \ -e "s/NAME/${NAME}/g" \ + -e "/\bSEEDS$/s/.*/${ENSEMBLE_SEEDS}/g" \ ensemble/predict.tmpl.yaml >$PREDICT_CONFIG COMMAND="model_ensemble $PREDICT_CONFIG $ENSEMBLE_TARGET $ENSEMBLE_SWITCH $ENSEMBLE_ARGS" diff --git a/run_train_ensemble.sh b/run_train_ensemble.sh index 3de09bb..2c86b12 100755 --- a/run_train_ensemble.sh +++ b/run_train_ensemble.sh @@ -9,13 +9,15 @@ fi echo "ARGS: $@" +# Defaults if not specified ENSEMBLE_TARGET="slurm" ENSEMBLE_SWITCH="" ENSEMBLE_ARGS="" ENSEMBLE_JOBS=1 ENSEMBLE_NTASKS=4 +ENSEMBLE_SEEDS_DEFAULT=42,46,45,17,24,84,83,16,5,3 -while getopts ":b:c:de:f:g:j:l:m:n:p:q:s:t:" opt; do +while getopts ":b:c:de:f:g:j:l:m:n:p:q:r:s:t:" opt; do case "$opt" in b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";; c) 
ENSEMBLE_ARGS="${ENSEMBLE_ARGS}cluster=$OPTARG ";; @@ -29,6 +31,7 @@ while getopts ":b:c:de:f:g:j:l:m:n:p:q:s:t:" opt; do n) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}nodelist=$OPTARG ";; p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_prep=$OPTARG ";; q) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_queue=$OPTARG ";; + r) ENSEMBLE_RUNS=$OPTARG ;; # Ensemble member run seed values s) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_strategy=$OPTARG ";; t) ENSEMBLE_NTASKS=$OPTARG ;; esac @@ -47,12 +50,44 @@ NAME="$3" TRAIN_CONFIG=`mktemp -p . --suffix ".train"` +## +# Dynamically generate seeds for ensemble run. +# + +IFS="," read -ra SEEDS <<< "$ENSEMBLE_RUNS" + +# Check if seeds defined as CLI args (e.g. `-r 42,46`) +if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_TRAIN_SEEDS" + # Check if seeds defined in ENVS exported variables (else use defaults) + if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_SEEDS_DEFAULT" + fi +fi + +# Generate seed lines for yaml output +ENSEMBLE_SEEDS="" +COUNTER=0 +for seed in ${SEEDS[@]} +do + ENSEMBLE_SEEDS+=" - seed: "$seed + if [ $COUNTER -lt $((${#SEEDS[@]}-1)) ]; then + ENSEMBLE_SEEDS+="\n" + fi + ((COUNTER++)) +done + +echo "No. of ensemble members: " "${#SEEDS[@]}" +printf -v joined '%s,' "${SEEDS[@]}" +echo "Ensemble members: " "${joined%,}" + sed -r \ -e "s/NAME/${NAME}/g" \ -e "s/LOADER/${LOADER}/g" \ -e "s/DATASET/${DATASET}/g" \ -e "s/MAXJOBS/${ENSEMBLE_JOBS}/g" \ -e "s/NTASKS/${ENSEMBLE_NTASKS}/g" \ + -e "/\bSEEDS$/s/.*/${ENSEMBLE_SEEDS}/g" \ ensemble/train.tmpl.yaml >$TRAIN_CONFIG COMMAND="model_ensemble $TRAIN_CONFIG $ENSEMBLE_TARGET $ENSEMBLE_SWITCH $ENSEMBLE_ARGS" From 100d399a2541b5db387f6196376783796187841c Mon Sep 17 00:00:00 2001 From: "Bryn N. 
Ubald" <55503826+bnubald@users.noreply.github.com> Date: Thu, 7 Mar 2024 11:25:32 +0000 Subject: [PATCH 06/44] Fixes #36: Rename seed variables in ENVS.example --- ENVS.example | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ENVS.example b/ENVS.example index 0850cca..715d973 100644 --- a/ENVS.example +++ b/ENVS.example @@ -64,8 +64,8 @@ DEMO_PIPELINE_TEST_END="2022-2-28" ## # Training & Prediction ensemble run seeds # -SMALL_DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" -SMALL_DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" +DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" +DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" ## # The prefix to use for training date ranges From 3a15c62548fd58095517c8edf3089f4a77465bcd Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 22 Mar 2024 16:57:18 +0000 Subject: [PATCH 07/44] Dev #38: adding support for incremental HPC environment installation from cloned tensorflow-gpu --- environment.dawn.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 environment.dawn.yml diff --git a/environment.dawn.yml b/environment.dawn.yml new file mode 100644 index 0000000..2713e21 --- /dev/null +++ b/environment.dawn.yml @@ -0,0 +1,11 @@ +channels: + - conda-forge + - defaults +dependencies: + - cartopy + - eccodes + - ffmpeg + - hdf5 + - netcdf4 + - openh264 + - xarray From a8f1193477e2c7c7949db51ee55d772b78ee605f Mon Sep 17 00:00:00 2001 From: James Byrne Date: Sat, 23 Mar 2024 07:22:46 +0000 Subject: [PATCH 08/44] Dev #38: sticking in some stubs for dawn use --- ensemble/predict.tmpl.yaml | 2 +- ensemble/template/dawn.sh | 5 +++++ ensemble/train.tmpl.yaml | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 ensemble/template/dawn.sh diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index f960174..55ad34d 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -14,6 +14,7 @@ ensemble: - ../../../processed - ../../../results mem: 224gb + cluster: pvc 
pre_process: [] post_process: [] @@ -24,7 +25,6 @@ ensemble: - icenet_predict.sh.j2 email: someone@example.com job_file: icenet_predict.sh - cluster: short nodes: 1 ntasks: 8 length: 00:30:00 diff --git a/ensemble/template/dawn.sh b/ensemble/template/dawn.sh new file mode 100644 index 0000000..e73154b --- /dev/null +++ b/ensemble/template/dawn.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +module purge +module load default-dawn +module load intelpython-conda diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 0bed0bb..0a7e1c0 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -17,6 +17,8 @@ ensemble: - ../../../results gpus: 1 mem: 128gb + cluster: pvc + nodes: 1 pre_process: - name: execute @@ -31,8 +33,6 @@ ensemble: - icenet_train.sh.j2 email: someone@example.com job_file: icenet_train.sh - cluster: gpu - nodes: 1 ntasks: NTASKS length: 4-00:00:00 maxruns: 5 From adb234f6b64f03fa35b1d72f04bc96de0430b782 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 28 Mar 2024 15:14:15 +0000 Subject: [PATCH 09/44] Dev #39: highlighting what the intention is for specifying basic pip dependencies --- environment.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/environment.yml b/environment.yml index 17bb914..937f0e6 100644 --- a/environment.yml +++ b/environment.yml @@ -13,3 +13,6 @@ dependencies: - openh264 - python=3.8 - xarray + pip: + - icenet==0.2.7 + - model-ensembler From 3e4f9280ba997993a5530177b461e6db712ed226 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 12 Apr 2024 16:12:03 +0100 Subject: [PATCH 10/44] Removing explict icenet dependency, that's not necessary under pip (and certainly shouldn't be pinned --- environment.dawn.yml | 2 ++ environment.yml | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.dawn.yml b/environment.dawn.yml index 2713e21..29ea4b7 100644 --- a/environment.dawn.yml +++ b/environment.dawn.yml @@ -9,3 +9,5 @@ dependencies: - netcdf4 - openh264 - xarray + pip: + - 
model-ensembler diff --git a/environment.yml b/environment.yml index 937f0e6..9a67f24 100644 --- a/environment.yml +++ b/environment.yml @@ -14,5 +14,4 @@ dependencies: - python=3.8 - xarray pip: - - icenet==0.2.7 - model-ensembler From 0d5bf55fe0bf2606523728f7a1ab39418a9df149 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 12 Apr 2024 17:15:52 +0100 Subject: [PATCH 11/44] Fixes #39: sorted this out properly --- environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 9a67f24..2e454cb 100644 --- a/environment.yml +++ b/environment.yml @@ -12,6 +12,7 @@ dependencies: - netcdf4 - openh264 - python=3.8 + - pip - xarray - pip: + - pip: - model-ensembler From 81ca63ca996a8a98da4cf86b9aac872c8544181e Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 19 Jun 2024 14:38:43 +0100 Subject: [PATCH 12/44] Version of python bump --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 2e454cb..b7912b6 100644 --- a/environment.yml +++ b/environment.yml @@ -11,7 +11,7 @@ dependencies: - ipykernel - netcdf4 - openh264 - - python=3.8 + - python=3.9 - pip - xarray - pip: From fa29ce53d0a69bbc627b7e724a9c59f328f1fafe Mon Sep 17 00:00:00 2001 From: James Byrne Date: Mon, 22 Jul 2024 20:46:19 +0100 Subject: [PATCH 13/44] Development rationalisation to support 0.4 development --- clean_pipeline.sh => script.backup/clean_pipeline.sh | 0 loader_test_dates.sh => script.backup/loader_test_dates.sh | 0 produce_op_assets.sh => script.backup/produce_op_assets.sh | 0 run_check.sh => script.backup/run_check.sh | 0 run_daily.sh => script.backup/run_daily.sh | 0 run_era5_forecast.sh => script.backup/run_era5_forecast.sh | 0 run_forecast_plots.sh => script.backup/run_forecast_plots.sh | 0 run_input_plots.sh => script.backup/run_input_plots.sh | 0 run_predict_ensemble.sh => script.backup/run_predict_ensemble.sh | 0 run_prediction.sh => 
script.backup/run_prediction.sh | 0 run_train_ensemble.sh => script.backup/run_train_ensemble.sh | 0 run_validation.sh => script.backup/run_validation.sh | 0 train_analysis.sh => script.backup/train_analysis.sh | 0 13 files changed, 0 insertions(+), 0 deletions(-) rename clean_pipeline.sh => script.backup/clean_pipeline.sh (100%) rename loader_test_dates.sh => script.backup/loader_test_dates.sh (100%) rename produce_op_assets.sh => script.backup/produce_op_assets.sh (100%) rename run_check.sh => script.backup/run_check.sh (100%) rename run_daily.sh => script.backup/run_daily.sh (100%) rename run_era5_forecast.sh => script.backup/run_era5_forecast.sh (100%) rename run_forecast_plots.sh => script.backup/run_forecast_plots.sh (100%) rename run_input_plots.sh => script.backup/run_input_plots.sh (100%) rename run_predict_ensemble.sh => script.backup/run_predict_ensemble.sh (100%) rename run_prediction.sh => script.backup/run_prediction.sh (100%) rename run_train_ensemble.sh => script.backup/run_train_ensemble.sh (100%) rename run_validation.sh => script.backup/run_validation.sh (100%) rename train_analysis.sh => script.backup/train_analysis.sh (100%) diff --git a/clean_pipeline.sh b/script.backup/clean_pipeline.sh similarity index 100% rename from clean_pipeline.sh rename to script.backup/clean_pipeline.sh diff --git a/loader_test_dates.sh b/script.backup/loader_test_dates.sh similarity index 100% rename from loader_test_dates.sh rename to script.backup/loader_test_dates.sh diff --git a/produce_op_assets.sh b/script.backup/produce_op_assets.sh similarity index 100% rename from produce_op_assets.sh rename to script.backup/produce_op_assets.sh diff --git a/run_check.sh b/script.backup/run_check.sh similarity index 100% rename from run_check.sh rename to script.backup/run_check.sh diff --git a/run_daily.sh b/script.backup/run_daily.sh similarity index 100% rename from run_daily.sh rename to script.backup/run_daily.sh diff --git a/run_era5_forecast.sh 
b/script.backup/run_era5_forecast.sh similarity index 100% rename from run_era5_forecast.sh rename to script.backup/run_era5_forecast.sh diff --git a/run_forecast_plots.sh b/script.backup/run_forecast_plots.sh similarity index 100% rename from run_forecast_plots.sh rename to script.backup/run_forecast_plots.sh diff --git a/run_input_plots.sh b/script.backup/run_input_plots.sh similarity index 100% rename from run_input_plots.sh rename to script.backup/run_input_plots.sh diff --git a/run_predict_ensemble.sh b/script.backup/run_predict_ensemble.sh similarity index 100% rename from run_predict_ensemble.sh rename to script.backup/run_predict_ensemble.sh diff --git a/run_prediction.sh b/script.backup/run_prediction.sh similarity index 100% rename from run_prediction.sh rename to script.backup/run_prediction.sh diff --git a/run_train_ensemble.sh b/script.backup/run_train_ensemble.sh similarity index 100% rename from run_train_ensemble.sh rename to script.backup/run_train_ensemble.sh diff --git a/run_validation.sh b/script.backup/run_validation.sh similarity index 100% rename from run_validation.sh rename to script.backup/run_validation.sh diff --git a/train_analysis.sh b/script.backup/train_analysis.sh similarity index 100% rename from train_analysis.sh rename to script.backup/train_analysis.sh From 515333de168ac57e279f84d7cd5250449b7870c9 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Sat, 17 Aug 2024 00:20:57 +0100 Subject: [PATCH 14/44] Removing unnecessary pinning --- environment.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index b7912b6..80cc4cf 100644 --- a/environment.yml +++ b/environment.yml @@ -1,10 +1,9 @@ channels: - conda-forge - - defaults dependencies: - cartopy - - cudatoolkit=11.2 - - cudnn=8.1.0 + - cudatoolkit + - cudnn - eccodes - ffmpeg - hdf5 From 0d91837c953f303eda929d4a4f72393f79831d96 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Sat, 17 Aug 2024 00:22:27 +0100 Subject: [PATCH 
15/44] Sorting out new preprocess config gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b49d126..0faae3c 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ __pycache__/ /wandb/ *test*.json dataset_config.*.json +processed.*.json loader.*.json *.csv *.err From 735cbf7efbd167c43d20fb2eb17be0f49804d0a9 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Tue, 20 Aug 2024 21:10:53 +0100 Subject: [PATCH 16/44] Dev #53: reorganising structure of scripts --- .gitignore | 1 + ENVS | 1 - condense.slurm.sh | 10 -- .../run_check.sh => dataset_check.sh | 0 .../run_forecast_plots.sh => plot_forecast.sh | 0 .../run_input_plots.sh => plot_inputs.sh | 0 .../run_validation.sh => plot_validations.sh | 16 +-- prep_prediction_data.sh | 11 ++ prep_training_data.sh | 102 ++++++++++++++++++ ...oduce_op_assets.sh => process_op_assets.sh | 0 run_data.sh | 40 ------- ...ict_ensemble.sh => run_predict_ensemble.sh | 0 ...train_ensemble.sh => run_train_ensemble.sh | 0 script.backup/create_masks_plots.txt | 19 ++++ 14 files changed, 141 insertions(+), 59 deletions(-) delete mode 120000 ENVS delete mode 100755 condense.slurm.sh rename script.backup/run_check.sh => dataset_check.sh (100%) rename script.backup/run_forecast_plots.sh => plot_forecast.sh (100%) rename script.backup/run_input_plots.sh => plot_inputs.sh (100%) rename script.backup/run_validation.sh => plot_validations.sh (83%) create mode 100755 prep_prediction_data.sh create mode 100755 prep_training_data.sh rename script.backup/produce_op_assets.sh => process_op_assets.sh (100%) delete mode 100755 run_data.sh rename script.backup/run_predict_ensemble.sh => run_predict_ensemble.sh (100%) rename script.backup/run_train_ensemble.sh => run_train_ensemble.sh (100%) create mode 100644 script.backup/create_masks_plots.txt diff --git a/.gitignore b/.gitignore index 0faae3c..f89f39b 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ tmp.* *test* *.png +!ENVS !ENVS.example 
ENVS.* diff --git a/ENVS b/ENVS deleted file mode 120000 index 73248d3..0000000 --- a/ENVS +++ /dev/null @@ -1 +0,0 @@ -ENVS.example \ No newline at end of file diff --git a/condense.slurm.sh b/condense.slurm.sh deleted file mode 100755 index 99f021e..0000000 --- a/condense.slurm.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -. ENVS - -conda activate $ICENET_CONDA - -echo "START $1 $2 $3: `date +%T`" -icenet_process_condense -v $1 $2 $3 >$ICENET_HOME/logs/condense.$1.$2.$3.log 2>&1 -echo "END $1 $2 $3 `date +%T`" - diff --git a/script.backup/run_check.sh b/dataset_check.sh similarity index 100% rename from script.backup/run_check.sh rename to dataset_check.sh diff --git a/script.backup/run_forecast_plots.sh b/plot_forecast.sh similarity index 100% rename from script.backup/run_forecast_plots.sh rename to plot_forecast.sh diff --git a/script.backup/run_input_plots.sh b/plot_inputs.sh similarity index 100% rename from script.backup/run_input_plots.sh rename to plot_inputs.sh diff --git a/script.backup/run_validation.sh b/plot_validations.sh similarity index 83% rename from script.backup/run_validation.sh rename to plot_validations.sh index a0ad829..17c6d36 100755 --- a/script.backup/run_validation.sh +++ b/plot_validations.sh @@ -16,7 +16,7 @@ if [ $# -lt 2 ] || [ "$1" == "-h" ]; then echo -e "\nThe script will generate several plots which can be used to validate the forecast (and also to compare with ECMWF)" echo "The plots to analyse the performance of the forecasts will be saved to " echo "and the plots to compare performance with ECMWF will be saved to /ECMWF_comp" - echo "Run \"run_forecast_plots.sh -h\" for more details of what the plots generated are" + echo "Run \"plot_forecast.sh -h\" for more details of what the plots generated are" exit 1 fi @@ -74,20 +74,20 @@ for element in "${METRICS[@]}" do if [ "${element}" == "binacc" ]; then for THRESH in ${THRESHOLDS[@]}; do - ./run_forecast_plots.sh -m ${element} $REGION -v -l -t $THRESH \ + ./plot_forecast.sh 
-m ${element} $REGION -v -l -t $THRESH \ -o $OUTPUT_DIR $FORECAST $HEMI - ./run_forecast_plots.sh -m ${element} $REGION -e -v -l -t $THRESH \ + ./plot_forecast.sh -m ${element} $REGION -e -v -l -t $THRESH \ -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI done elif [ "${element}" == "sie" ]; then for THRESH in ${THRESHOLDS[@]}; do - ./run_forecast_plots.sh -m ${element} $REGION -v -l -t $THRESH $GRID_AREA_SIZE \ + ./plot_forecast.sh -m ${element} $REGION -v -l -t $THRESH $GRID_AREA_SIZE \ -o $OUTPUT_DIR $FORECAST $HEMI - ./run_forecast_plots.sh -m ${element} $REGION -e -v -l -t $THRESH $GRID_AREA_SIZE \ + ./plot_forecast.sh -m ${element} $REGION -e -v -l -t $THRESH $GRID_AREA_SIZE \ -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI done elif [ "${element}" == "sic" ]; then - ./run_forecast_plots.sh -m ${element} $REGION -v \ + ./plot_forecast.sh -m ${element} $REGION -v \ -o $OUTPUT_DIR $FORECAST $HEMI else if [ "${element}" == "mae" ]; then @@ -97,9 +97,9 @@ for element in "${METRICS[@]}" elif [ "${element}" == "rmse" ]; then LOGFILE="${RMSE_LOG}" fi - ./run_forecast_plots.sh -m ${element} $REGION -v -l \ + ./plot_forecast.sh -m ${element} $REGION -v -l \ -o $OUTPUT_DIR $FORECAST $HEMI - ./run_forecast_plots.sh -m ${element} $REGION -e -v -l \ + ./plot_forecast.sh -m ${element} $REGION -e -v -l \ -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI fi done diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh new file mode 100755 index 0000000..bf0ca5f --- /dev/null +++ b/prep_prediction_data.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +source ENVS + +conda activate $ICENET_CONDA + +set -o pipefail +set -eu + + + diff --git a/prep_training_data.sh b/prep_training_data.sh new file mode 100755 index 0000000..c71a52d --- /dev/null +++ b/prep_training_data.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +source ENVS + +conda activate $ICENET_CONDA + +set -o pipefail +set -eu + +if [ $# -lt 1 ] || [ "$1" == "-h" ]; then + echo "Usage $0 " +fi + +HEMI="$1" + +export 
OSISAF_DATASET="data/osisaf/dataset_config.month.hemi.north.json" # Persistent dataset +export ERA5_DATASET="data/era5/dataset_config.month.hemi.north.json" # Persistent dataset +export GROUND_TRUTH_SIC="osi_sic" # Ephemeral dataset +export GROUND_TRUTH_SIC_DSC="data/$GROUND_TRUTH_SIC/dataset_config.month.hemi.north.json" +export ATMOS_PROC="era5_osi" # Ephemeral dataset +export ATMOS_PROC_DSC="data/$ATMOS_PROC/dataset_config.month.hemi.north.json" +export PROCESSED_DATASET="test" +export LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" +export DATASET_NAME="test_net_ds" + + +source ENVS + + + + + +( + for HEMI in north south; do echo download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS; done + for HEMI in north south; do echo download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS; done + for HEMI in north south; do echo download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS; done + + for HEMI in north south; do echo download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS; done +) + + +source ENVS + +## Process + +preprocess_loader_init -v $PROCESSED_DATASET + +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" + * TODO: masks is not compatible with dual hemisphere in this form! +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" + +preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC +# TODO: didn't seemingly detect missing months? data/osi_sic/month/hemi.north/siconca.missing_days.csv +# TODO: undoubtedly need to include the known invalid dates - added these to the osisaf downloader +preprocess_missing_spatial -m processed.masks.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC +# TODO: Interpolation failing in all cases? 
+# TODO: this undoubtedly explains the stray nans present in dataset generation + +preprocess_dataset $PROC_ARGS_SIC -v \ + -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ + -i "icenet.data.processors.osisaf:SICPreProcessor" \ + $GROUND_TRUTH_SIC_DSC ${PROCESSED_DATASET}_osisaf +# TODO: plenty of nans contained in here, we need better assesments + +# TODO: icenet_osisaf_ref -v data/osisaf/hemi.north/siconca/2012.nc ref.osisaf.north.nc +# this needs to: +# - ds = xr.open_dataset("./data/osisaf/month/hemi.north/siconca/1978.nc") +# - ds = ds.drop_vars(["raw_ice_conc_values", "smearing_standard_error", "algorithm_standard_error"]) +# - cube = ds.siconca.to_iris() +# - cube.coord('projection_x_coordinate').convert_units('meters') +# - cube.coord('projection_y_coordinate').convert_units('meters') +# - iris.save("ref.osisaf.nc") + + +preprocess_regrid -v $ERA5_DATASET ref.osisaf.nc $ATMOS_PROC +# TODO: get the batcher back in place for multiprocessing this +# TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in +preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.nc + * TODO: get the batcher back in place for multiprocessing this + +preprocess_dataset $PROC_ARGS_ERA5 -v \ + -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ + -i "icenet.data.processors.cds:ERA5PreProcessor" \ + $ATMOS_PROC_DSC ${PROCESSED_DATASET}_era5 + * TODO: naive copy of "./data/era5_osi/month/hemi.north/uas/2024.nc" results in mistaken loading - not regridded + * TODO: dask multiprocessing cluster with task batcher across multiple variables would be sensible + +preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_osisaf.json processed.${PROCESSED_DATASET}_era5.json + +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icenet.data.meta:SinProcessor" +preprocess_add_channel 
-v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.osisaf:Masks" + +icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME + * TODO: FIXME in here to override the creation of nan containing sets + +icenet_plot_input -p -v dataset_config.test_net_ds.json 2021-04-30 ./plot/input.png +icenet_plot_input --outputs -v dataset_config.test_net_ds.json 2021-04-30 ./plot/outputs.png +icenet_plot_input --weights -v dataset_config.test_net_ds.json 2021-04-30 ./plot/weights.png + +icenet_train_tensorflow -b 1 -e 5 -f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 \ No newline at end of file diff --git a/script.backup/produce_op_assets.sh b/process_op_assets.sh similarity index 100% rename from script.backup/produce_op_assets.sh rename to process_op_assets.sh diff --git a/run_data.sh b/run_data.sh deleted file mode 100755 index 1c90194..0000000 --- a/run_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -source ENVS - -conda activate $ICENET_CONDA - -set -o pipefail -set -eu - -if [ $# -lt 1 ] || [ "$1" == "-h" ]; then - echo "Usage $0 [batch_size] [workers]" -fi - -DATANAME="$TRAIN_DATA_NAME" -HEMI="$1" -BATCH_SIZE=${2:-2} -WORKERS=${3:-8} - -if [ ! -f loader.${DATANAME}_${HEMI}.json ]; then - [ ! -z "$PROC_ARGS_ERA5" ] && icenet_process_era5 -v -l $LAG \ - $PROC_ARGS_ERA5 \ - -ns $TRAIN_START -ne $TRAIN_END -vs $VAL_START -ve $VAL_END -ts $TEST_START -te $TEST_END \ - ${DATANAME}_${HEMI} $HEMI - - [ ! -z "$PROC_ARGS_ORAS5" ] && icenet_process_oras5 -v -l $LAG \ - $PROC_ARGS_ORAS5 \ - -ns $TRAIN_START -ne $TRAIN_END -vs $VAL_START -ve $VAL_END -ts $TEST_START -te $TEST_END \ - ${DATANAME}_${HEMI} $HEMI - - [ ! 
-z "$PROC_ARGS_SIC" ] && icenet_process_sic -v -l $LAG \ - $PROC_ARGS_SIC \ - -ns $TRAIN_START -ne $TRAIN_END -vs $VAL_START -ve $VAL_END -ts $TEST_START -te $TEST_END \ - ${DATANAME}_${HEMI} $HEMI - - icenet_process_metadata ${DATANAME}_${HEMI} $HEMI -else - echo "Skipping preprocessing as loader.${DATANAME}_${HEMI}.json already exists..." -fi - -icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fd $FORECAST_DAYS -l $LAG ${DATANAME}_${HEMI} $HEMI diff --git a/script.backup/run_predict_ensemble.sh b/run_predict_ensemble.sh similarity index 100% rename from script.backup/run_predict_ensemble.sh rename to run_predict_ensemble.sh diff --git a/script.backup/run_train_ensemble.sh b/run_train_ensemble.sh similarity index 100% rename from script.backup/run_train_ensemble.sh rename to run_train_ensemble.sh diff --git a/script.backup/create_masks_plots.txt b/script.backup/create_masks_plots.txt new file mode 100644 index 0000000..25d6939 --- /dev/null +++ b/script.backup/create_masks_plots.txt @@ -0,0 +1,19 @@ +import pandas as pd +import matplotlib.pyplot as plt +from icenet.data.masks.osisaf import Masks +from download_toolbox.interface import get_dataset_config_implementation +dsc = get_dataset_config_implementation("data/osi_sic/dataset_config.month.hemi.north.json") +m = Masks(dsc) +m.polarhole_filename + +for i in range(1,13): + plt.contourf(m.active_grid_cell(pd.Timestamp("2020-{}-1".format(i)))) + plt.savefig("agcm{}.png".format(i)) + + +plt.contourf(m.land()) +plt.savefig("land.png") +for i in range(1975, 2026, 10): + plt.contourf(m.polarhole(pd.Timestamp("{}-1-1".format(i)))) + plt.savefig("polarhole.{}.png".format(i)) + From 1316e6a2201069745a7ea4f2bb942dea1d77c278 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Tue, 20 Aug 2024 21:12:02 +0100 Subject: [PATCH 17/44] Messed up gitignore, getting rid of ENVS --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f89f39b..0c5445a 100644 --- a/.gitignore 
+++ b/.gitignore @@ -34,7 +34,7 @@ tmp.* *test* *.png -!ENVS +ENVS !ENVS.example ENVS.* From 8d39b50edb2d5d3a5d7ccd048c4e97f0b592c549 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 21 Aug 2024 09:02:45 +0100 Subject: [PATCH 18/44] Dev #53: Adapted for new structure of environmental-forecasting training data preparation --- prep_training_data.sh | 104 ++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 49 deletions(-) diff --git a/prep_training_data.sh b/prep_training_data.sh index c71a52d..dd63262 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -8,62 +8,67 @@ set -o pipefail set -eu if [ $# -lt 1 ] || [ "$1" == "-h" ]; then - echo "Usage $0 " + echo "Usage $0 [download=0|1]" fi HEMI="$1" - -export OSISAF_DATASET="data/osisaf/dataset_config.month.hemi.north.json" # Persistent dataset -export ERA5_DATASET="data/era5/dataset_config.month.hemi.north.json" # Persistent dataset -export GROUND_TRUTH_SIC="osi_sic" # Ephemeral dataset -export GROUND_TRUTH_SIC_DSC="data/$GROUND_TRUTH_SIC/dataset_config.month.hemi.north.json" -export ATMOS_PROC="era5_osi" # Ephemeral dataset -export ATMOS_PROC_DSC="data/$ATMOS_PROC/dataset_config.month.hemi.north.json" -export PROCESSED_DATASET="test" -export LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" -export DATASET_NAME="test_net_ds" - - -source ENVS - - - - - -( - for HEMI in north south; do echo download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS; done - for HEMI in north south; do echo download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS; done - for HEMI in north south; do echo download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS; done - - for HEMI in north south; do echo download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS; done -) - - -source ENVS - -## Process - +DOWNLOAD=$2 + +# download-toolbox integration +# This updates our source +if [ $DOWNLOAD -eq 1 ]; then + download_amsr2 $DATA_ARGS $HEMI 
$AMSR2_DATES $AMSR2_VAR_ARGS + download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS + download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS + download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS +fi 2>&1 | tee logs/download.log + +DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" + +# preprocess-toolbox integration +# Persistent datasets from the source data store, wherever that is +OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" +ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" + +# Create links to the central data store datasets for easier "mapping" +[ [ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf +[ [ ! -e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 +# TODO: AMSR +# TODO: CMIP + +GROUND_TRUTH_SIC="osi_sic" +ATMOS_PROC="era5_osi" + +# Our processed dataset configurations, we localise data when regridding and reprojecting +GROUND_TRUTH_SIC_DSC="data/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" +ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" + +PROCESSED_DATASET="training" +LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" +DATASET_NAME="tfdata_cache" + +## Workflow preprocess_loader_init -v $PROCESSED_DATASET preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" - * TODO: masks is not compatible with dual hemisphere in this form! + # TODO: masks is not compatible with dual hemisphere in this form! preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC -# TODO: didn't seemingly detect missing months? 
data/osi_sic/month/hemi.north/siconca.missing_days.csv -# TODO: undoubtedly need to include the known invalid dates - added these to the osisaf downloader + # TODO: didn't seemingly detect missing months? data/osi_sic/month/hemi.north/siconca.missing_days.csv + # TODO: undoubtedly need to include the known invalid dates - added these to the osisaf downloader preprocess_missing_spatial -m processed.masks.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC -# TODO: Interpolation failing in all cases? -# TODO: this undoubtedly explains the stray nans present in dataset generation + # TODO: Interpolation failing in all cases? + # TODO: this undoubtedly explains the stray nans present in dataset generation preprocess_dataset $PROC_ARGS_SIC -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ $GROUND_TRUTH_SIC_DSC ${PROCESSED_DATASET}_osisaf -# TODO: plenty of nans contained in here, we need better assesments + # TODO: plenty of nans contained in here due to failing spatial interpolation - needs investigation -# TODO: icenet_osisaf_ref -v data/osisaf/hemi.north/siconca/2012.nc ref.osisaf.north.nc +# TODO: icenet_osisaf_ref for geospatial metadata -v data/osisaf/hemi.north/siconca/???.nc ref.osisaf.north.nc # this needs to: # - ds = xr.open_dataset("./data/osisaf/month/hemi.north/siconca/1978.nc") # - ds = ds.drop_vars(["raw_ice_conc_values", "smearing_standard_error", "algorithm_standard_error"]) @@ -72,19 +77,18 @@ preprocess_dataset $PROC_ARGS_SIC -v \ # - cube.coord('projection_y_coordinate').convert_units('meters') # - iris.save("ref.osisaf.nc") - preprocess_regrid -v $ERA5_DATASET ref.osisaf.nc $ATMOS_PROC -# TODO: get the batcher back in place for multiprocessing this -# TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in + # TODO: get the batcher back in place for 
multiprocessing this + # TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.nc - * TODO: get the batcher back in place for multiprocessing this + # TODO: get the batcher back in place for multiprocessing this preprocess_dataset $PROC_ARGS_ERA5 -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ $ATMOS_PROC_DSC ${PROCESSED_DATASET}_era5 - * TODO: naive copy of "./data/era5_osi/month/hemi.north/uas/2024.nc" results in mistaken loading - not regridded - * TODO: dask multiprocessing cluster with task batcher across multiple variables would be sensible + # TODO: naive copy of "./data/era5_osi/month/hemi.north/uas/2024.nc" earlier is not regridded? + # TODO: dask multiprocessing cluster with task batcher across multiple variables would be sensible preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_osisaf.json processed.${PROCESSED_DATASET}_era5.json @@ -92,11 +96,13 @@ preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icene preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.osisaf:Masks" -icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME - * TODO: FIXME in here to override the creation of nan containing sets - +# TODO: select a random date from the training set, plot and log so user can double check outputs icenet_plot_input -p -v dataset_config.test_net_ds.json 2021-04-30 ./plot/input.png icenet_plot_input --outputs -v dataset_config.test_net_ds.json 2021-04-30 ./plot/outputs.png icenet_plot_input --weights -v dataset_config.test_net_ds.json 2021-04-30 ./plot/weights.png 
-icenet_train_tensorflow -b 1 -e 5 -f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 \ No newline at end of file +icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME + # TODO: FIXME in here to override the creation of nan containing sets due to earlier issues + + +# icenet_train_tensorflow -b 1 -e 5 -f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 \ No newline at end of file From 7679464ca480b88996ddd3fedd39a5cc0b53854e Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 21 Aug 2024 12:55:38 +0100 Subject: [PATCH 19/44] Dev #53: finalised scripting of prep_training_data --- prep_training_data.sh | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/prep_training_data.sh b/prep_training_data.sh index dd63262..2cafab2 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -1,7 +1,6 @@ -#!/bin/bash +#!/bin/bash -l source ENVS - conda activate $ICENET_CONDA set -o pipefail @@ -9,18 +8,19 @@ set -eu if [ $# -lt 1 ] || [ "$1" == "-h" ]; then echo "Usage $0 [download=0|1]" + exit 1 fi HEMI="$1" -DOWNLOAD=$2 +DOWNLOAD=${2:-0} # download-toolbox integration # This updates our source if [ $DOWNLOAD -eq 1 ]; then - download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS + # download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS - download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS + # download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS fi 2>&1 | tee logs/download.log DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" @@ -31,8 +31,8 @@ OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" # Create links 
to the central data store datasets for easier "mapping" -[ [ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf -[ [ ! -e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 +[ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf +[ ! -e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 # TODO: AMSR # TODO: CMIP @@ -96,13 +96,15 @@ preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icene preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.osisaf:Masks" +icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME + # TODO: select a random date from the training set, plot and log so user can double check outputs -icenet_plot_input -p -v dataset_config.test_net_ds.json 2021-04-30 ./plot/input.png -icenet_plot_input --outputs -v dataset_config.test_net_ds.json 2021-04-30 ./plot/outputs.png -icenet_plot_input --weights -v dataset_config.test_net_ds.json 2021-04-30 ./plot/weights.png +icenet_plot_input -p -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/input.png +icenet_plot_input --outputs -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/outputs.png +icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/weights.png icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME # TODO: FIXME in here to override the creation of nan containing sets due to earlier issues -# icenet_train_tensorflow -b 1 -e 5 -f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 \ No newline at end of file +# icenet_train_tensorflow -b 1 -e 5 -f 1 -n 
0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 From e6673fed108b626d045303b04b7f92befcff3312 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 21 Aug 2024 16:55:56 +0100 Subject: [PATCH 20/44] Dev #53: implementation for new structure of training runs --- ensemble/template/icenet_train.sh.j2 | 24 ++++++++++++++++++--- ensemble/train.tmpl.yaml | 32 ++++++++++++++-------------- prep_training_data.sh | 1 + run_train_ensemble.sh | 31 +++++++++++++-------------- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/ensemble/template/icenet_train.sh.j2 b/ensemble/template/icenet_train.sh.j2 index 478ce92..73962d0 100755 --- a/ensemble/template/icenet_train.sh.j2 +++ b/ensemble/template/icenet_train.sh.j2 @@ -1,4 +1,5 @@ -#!/bin/bash +#!/bin/bash -l +{% if run.cluster != "test" %} #SBATCH --output={{ run.dir }}/train.%j.%N.{{ run.seed }}.out #SBATCH --error={{ run.dir }}/train.%j.%N.{{ run.seed }}.err #SBATCH --chdir={{ run.dir }} @@ -15,6 +16,7 @@ #SBATCH --cpus-per-task={{ run.ntasks }} #SBATCH --mem={{ run.mem }} {% if run.nodelist %}#SBATCH --nodelist={{ run.nodelist }}{% endif %} +{% endif %} cd {{ run.dir }} @@ -36,8 +38,24 @@ echo "START `date +%F\ %T`" source $PREP_SCRIPT conda activate $ICENET_CONDA -# TODO: run.arg_filter_factor comes from ENVS now -COMMAND="icenet_train -v {{ run.arg_dataset }} {{ run.name }} {{ run.seed }} $TRAIN_STATIC_ARGS -b {{ run.arg_batch }} -e {{ run.arg_epochs }} -m -qs {{ run.arg_queue }} -w {{ run.ntasks }} -s {{ run.arg_strategy }} {% if run.arg_preload %} -p results/networks/{{ run.name }}/{{ run.name }}.network_{{ run.arg_preload }}.{{ run.seed }}.h5 {% endif %}{% if run.arg_filter_factor %} -n {{ run.arg_filter_factor }}{% endif %}" +PRELOAD="" +FINAL_WEIGHTS="results/networks/{{ run.name }}/{{ run.name }}.network_{{ run.preload }}.{{ run.seed }}.h5" +CHECKPOINT_WEIGHTS="`ls results/networks/{{ run.name }}/checkpoint.{{ run.name }}.network_{{ run.preload }}.{{ run.seed }}.*.keras 2>/dev/null`" + +# 
TODO: do we have keras / h5 weight multi-handling in place in library? +if [ -f $FINAL_WEIGHTS ]; then + echo "Preloading from previously trained network $FINAL_WEIGHTS" + PRELOAD="-p $FINAL_WEIGHTS" +elif [ ! -z "$CHECKPOINT_WEIGHTS" ]; then + CHECKPOINT_FILE=`echo "$CHECKPOINT_WEIGHTS" | sort | head -n 1` + echo "Preloading from checkpoint file $CHECKPOINT_FILE" + PRELOAD="-p $CHECKPOINT_FILE" +fi + +COMMAND="icenet_train_tensorflow -v \ + $TRAIN_STATIC_ARGS \ + -b {{ run.batch }} -e {{ run.epochs }} -n $FILTER_FACTOR -s {{ run.strategy }} \ + $PRELOAD {{ run.dataset }} {{ run.name }} {{ run.seed }} " echo "Running $COMMAND" eval $COMMAND diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 0a7e1c0..423cfb2 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -1,24 +1,27 @@ --- ensemble: vars: - arg_batch: 4 - arg_dataset: DATASET - arg_epochs: 100 - arg_filter_factor: 1 - arg_queue: 2 - arg_strategy: default + batch: 4 + cluster: dummy + dataset: DATASET + email: someone@example.com + epochs: 100 + filter_factor: 1 + gpus: 1 + length: 1-00:00:00 + mem: 128gb + nodes: 1 + ntasks: 2 + preload: DATASET + strategy: default symlinks: - ../../../data - ../../../ENVS* - - ../../../loader.LOADER.json - - ../../../dataset_config.DATASET.json + - ../../../LOADER + - ../../../DATASET - ../../../network_datasets - ../../../processed - ../../../results - gpus: 1 - mem: 128gb - cluster: pvc - nodes: 1 pre_process: - name: execute @@ -31,11 +34,8 @@ ensemble: templatedir: ../template templates: - icenet_train.sh.j2 - email: someone@example.com job_file: icenet_train.sh - ntasks: NTASKS - length: 4-00:00:00 - maxruns: 5 + maxruns: MAXJOBS maxjobs: MAXJOBS batches: diff --git a/prep_training_data.sh b/prep_training_data.sh index 2cafab2..a794de9 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -108,3 +108,4 @@ icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LO # icenet_train_tensorflow -b 1 -e 5 
-f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 +# run_train_ensemble \ No newline at end of file diff --git a/run_train_ensemble.sh b/run_train_ensemble.sh index 2c86b12..2ceba8e 100755 --- a/run_train_ensemble.sh +++ b/run_train_ensemble.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash -if [[ $# -lt 3 ]]; then - echo "Usage $0 LOADER DATASET NAME" +if [[ $# -lt 2 ]]; then + echo "Usage $0 DATASET NAME" exit 1 fi @@ -14,26 +14,26 @@ ENSEMBLE_TARGET="slurm" ENSEMBLE_SWITCH="" ENSEMBLE_ARGS="" ENSEMBLE_JOBS=1 -ENSEMBLE_NTASKS=4 ENSEMBLE_SEEDS_DEFAULT=42,46,45,17,24,84,83,16,5,3 -while getopts ":b:c:de:f:g:j:l:m:n:p:q:r:s:t:" opt; do +while getopts ":b:c:de:f:g:j:l:m:n:o:p:q:r:s:t:x:" opt; do case "$opt" in - b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";; + b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}batch=$OPTARG ";; c) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}cluster=$OPTARG ";; d) ENSEMBLE_TARGET="dummy";; - e) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_epochs=$OPTARG ";; - f) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_filter_factor=$OPTARG ";; + e) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}epochs=$OPTARG ";; + f) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}filter_factor=$OPTARG ";; g) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}gpus=$OPTARG ";; j) ENSEMBLE_JOBS=$OPTARG ;; - l) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_preload=$OPTARG ";; + l) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}preload=$OPTARG ";; m) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}mem=$OPTARG ";; n) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}nodelist=$OPTARG ";; - p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_prep=$OPTARG ";; - q) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_queue=$OPTARG ";; + o) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}nodes=$OPTARG ";; + p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}prep=$OPTARG ";; r) ENSEMBLE_RUNS=$OPTARG ;; # Ensemble member run seed values - s) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_strategy=$OPTARG ";; - t) ENSEMBLE_NTASKS=$OPTARG ;; + s) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}strategy=$OPTARG ";; + t) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}ntasks=$OPTARG ";; + x) 
ENSEMBLE_ARGS="${ENSEMBLE_ARGS}email=$OPTARG ";; esac done @@ -44,10 +44,10 @@ shift $((OPTIND-1)) echo "ARGS = $ENSEMBLE_SWITCH $ENSEMBLE_ARGS, Leftovers: $@" -LOADER="$1" -DATASET="$2" -NAME="$3" +DATASET="$1" +NAME="$2" +LOADER=`basename $( cat dataset_config.${DATASET}.json | jq '.loader_config' | tr -d '"' )` TRAIN_CONFIG=`mktemp -p . --suffix ".train"` ## @@ -86,7 +86,6 @@ sed -r \ -e "s/LOADER/${LOADER}/g" \ -e "s/DATASET/${DATASET}/g" \ -e "s/MAXJOBS/${ENSEMBLE_JOBS}/g" \ - -e "s/NTASKS/${ENSEMBLE_NTASKS}/g" \ -e "/\bSEEDS$/s/.*/${ENSEMBLE_SEEDS}/g" \ ensemble/train.tmpl.yaml >$TRAIN_CONFIG From 13204a236cbca6e143ec683639c49aa4f4a3c549 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 21 Aug 2024 21:37:36 +0100 Subject: [PATCH 21/44] Updating refs for creation of links --- ensemble/train.tmpl.yaml | 4 ++-- prep_training_data.sh | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 423cfb2..3ef8d00 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -8,7 +8,7 @@ ensemble: epochs: 100 filter_factor: 1 gpus: 1 - length: 1-00:00:00 + length: "1-00:00:00" mem: 128gb nodes: 1 ntasks: 2 @@ -18,7 +18,7 @@ ensemble: - ../../../data - ../../../ENVS* - ../../../LOADER - - ../../../DATASET + - ../../../dataset_config.DATASET.json - ../../../network_datasets - ../../../processed - ../../../results diff --git a/prep_training_data.sh b/prep_training_data.sh index a794de9..68431af 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -105,7 +105,3 @@ icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./ icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME # TODO: FIXME in here to override the creation of nan containing sets due to earlier issues - - -# icenet_train_tensorflow -b 1 -e 5 -f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 -# run_train_ensemble \ 
No newline at end of file From f2bd7d74d7ba7bfbfbd6d497ca7a061e0fb21f65 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 22 Aug 2024 15:08:09 +0100 Subject: [PATCH 22/44] Training data working for both hemispheres --- prep_training_data.sh | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/prep_training_data.sh b/prep_training_data.sh index 68431af..dcf64f6 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -43,22 +43,21 @@ ATMOS_PROC="era5_osi" GROUND_TRUTH_SIC_DSC="data/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" -PROCESSED_DATASET="training" +PROCESSED_DATASET="training.${HEMI}" LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" -DATASET_NAME="tfdata_cache" +DATASET_NAME="tfdata_${HEMI}" ## Workflow preprocess_loader_init -v $PROCESSED_DATASET preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" - # TODO: masks is not compatible with dual hemisphere in this form! preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC # TODO: didn't seemingly detect missing months? data/osi_sic/month/hemi.north/siconca.missing_days.csv # TODO: undoubtedly need to include the known invalid dates - added these to the osisaf downloader -preprocess_missing_spatial -m processed.masks.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC +preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC # TODO: Interpolation failing in all cases? 
# TODO: this undoubtedly explains the stray nans present in dataset generation @@ -68,16 +67,13 @@ preprocess_dataset $PROC_ARGS_SIC -v \ $GROUND_TRUTH_SIC_DSC ${PROCESSED_DATASET}_osisaf # TODO: plenty of nans contained in here due to failing spatial interpolation - needs investigation -# TODO: icenet_osisaf_ref for geospatial metadata -v data/osisaf/hemi.north/siconca/???.nc ref.osisaf.north.nc -# this needs to: -# - ds = xr.open_dataset("./data/osisaf/month/hemi.north/siconca/1978.nc") -# - ds = ds.drop_vars(["raw_ice_conc_values", "smearing_standard_error", "algorithm_standard_error"]) -# - cube = ds.siconca.to_iris() -# - cube.coord('projection_x_coordinate').convert_units('meters') -# - cube.coord('projection_y_coordinate').convert_units('meters') -# - iris.save("ref.osisaf.nc") +HEMI_SHORT="nh" +[ $HEMI == "south" ] && HEMI_SHORT="sh" +# TODO: we should be able to preseve data during download-toolbox processing for this, but +# alas this needs some investigation to achieve, so this will work for the moment +icenet_generate_ref_osisaf -v data/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc -preprocess_regrid -v $ERA5_DATASET ref.osisaf.nc $ATMOS_PROC +preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC # TODO: get the batcher back in place for multiprocessing this # TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.nc @@ -98,10 +94,10 @@ preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map " icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME -# TODO: select a random date from the training set, plot and log so user can double check outputs -icenet_plot_input -p -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/input.png -icenet_plot_input --outputs -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/outputs.png 
-icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/weights.png +FIRST_DATE=`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.train[0]' | tr -d '"'` +icenet_plot_input -p -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --outputs -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/outputs.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/weights.${HEMI}.${FIRST_DATE}.png icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME # TODO: FIXME in here to override the creation of nan containing sets due to earlier issues From 3016b5d5b647482dd7fd25ad775b3b085205d9ca Mon Sep 17 00:00:00 2001 From: James Byrne Date: Tue, 27 Aug 2024 23:33:50 +0100 Subject: [PATCH 23/44] Dev #53: implementing prediction and more comprehensive lifecycle, BUT with significant issues around download-toobox --- ensemble/predict.tmpl.yaml | 9 ++ ensemble/template/icenet_predict.sh.j2 | 2 +- prep_prediction_data.sh | 111 ++++++++++++++++++++++++- prep_training_data.sh | 20 ++--- run_prediction.sh | 50 +++++++++++ script.backup/loader_test_dates.sh | 14 ---- script.backup/run_prediction.sh | 82 ------------------ 7 files changed, 174 insertions(+), 114 deletions(-) create mode 100755 run_prediction.sh delete mode 100755 script.backup/loader_test_dates.sh delete mode 100755 script.backup/run_prediction.sh diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index 55ad34d..9b84fef 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -38,6 +38,15 @@ ensemble: - name: execute args: cmd: /usr/bin/ln -s ../../data + - name: execute + args: + cmd: /usr/bin/ln -s ../../processed + - name: execute + args: + cmd: /usr/bin/ln -s ../../ref.osisaf.north.nc + - name: execute + args: + cmd: /usr/bin/ln -s ../../ref.osisaf.south.nc 
pre_run: [] runs: - seed: SEEDS diff --git a/ensemble/template/icenet_predict.sh.j2 b/ensemble/template/icenet_predict.sh.j2 index ba69559..a190cd5 100755 --- a/ensemble/template/icenet_predict.sh.j2 +++ b/ensemble/template/icenet_predict.sh.j2 @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -l #SBATCH --output={{ run.dir }}/predict.%j.%N.{{ run.seed }}.out #SBATCH --error={{ run.dir }}/predict.%j.%N.{{ run.seed }}.err #SBATCH --chdir={{ run.dir }} diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh index bf0ca5f..884a6e3 100755 --- a/prep_prediction_data.sh +++ b/prep_prediction_data.sh @@ -1,11 +1,114 @@ -#!/bin/bash +#!/usr/bin/bash -l -source ENVS +set -e -o pipefail + +. ENVS conda activate $ICENET_CONDA -set -o pipefail -set -eu +if [ $# -lt 2 ] || [ "$1" == "-h" ]; then + echo "Usage $0 [date_vars] [train_data_name]" + echo " name of prediction dataset" + echo " hemisphere to use" + echo "[date_vars] variables for defining start and end dates to forecast" + echo "[train_data_name] name of data used to train the model" + echo "Options: none" + exit 1 +fi + +# obtaining any arguments that should be passed onto run_forecast_plots.sh +OPTIND=1 +while getopts "" opt; do + case "$opt" in + esac +done + +shift $((OPTIND-1)) + +echo "Leftovers from getopt: $@" + +PREDICTION_NAME="$1" +HEMI="$2" +DATE_VARS="${3:-$PREDICTION_NAME}" +DATA_PROC="${4:-${TRAIN_DATA_NAME}}.${HEMI}" + +NAME_START="${DATE_VARS^^}_START" +NAME_END="${DATE_VARS^^}_END" +echo "Dates from ENVS: $NAME_START and $NAME_END" +PREDICTION_START=${!NAME_START} +PREDICTION_END=${!NAME_END} + +if [ -z $PREDICTION_START ] || [ -z $PREDICTION_END ]; then + echo "Prediction date args not set correctly: \"$PREDICTION_START\" to \"$PREDICTION_END\"" + exit 1 +else + echo "Prediction start arg: $PREDICTION_START" + echo "Prediction end arg: $PREDICTION_END" +fi + +PREDICTION_DATASET="prediction.${PREDICTION_NAME}.${HEMI}" +LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" + +PRED_DATA_START=`date 
--date "$PREDICTION_START - $LAG ${DATA_FREQUENCY}s" +%Y-%m-%d` +# download-toolbox integration +( + # download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS + download_osisaf $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $OSISAF_VAR_ARGS + download_era5 $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $ERA5_VAR_ARGS + # download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS + + # TODO: this overwrites the ./data/osisaf/dataset_config.month.hemi.north.json files, which is unacceptable - localise + # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly +) 2>&1 | tee logs/download.${PREDICTION_DATASET}.log + +DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" + +# preprocess-toolbox integration +# Persistent datasets from the source data store, wherever that is +OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" +ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" +ATMOS_PROC="era5_osi.$PREDICTION_DATASET" +ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" + +# Create links to the central data store datasets for easier "mapping" +[ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf +[ ! 
-e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 +# TODO: AMSR +# TODO: CMIP + +LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" + +preprocess_loader_init -v $PREDICTION_DATASET +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" + +preprocess_dataset $PROC_ARGS_SIC -v \ + -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ + -r processed/${DATA_PROC}_osisaf/ \ + -i "icenet.data.processors.osisaf:SICPreProcessor" \ + $OSISAF_DATASET ${PREDICTION_DATASET}_osisaf + +if [ ! -f ref.osisaf.${HEMI}.nc ]; then + echo "Reference OSISAF for regrid should still be available, bailing for the mo" + exit 1 +fi + +preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC +preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc + +preprocess_dataset $PROC_ARGS_ERA5 -v \ + -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ + -r processed/${DATA_PROC}_era5/ \ + -i "icenet.data.processors.cds:ERA5PreProcessor" \ + $ATMOS_PROC_DSC ${PREDICTION_DATASET}_era5 +preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PREDICTION_DATASET}_osisaf.json processed.${PREDICTION_DATASET}_era5.json +preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET sin "icenet.data.meta:SinProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET cos "icenet.data.meta:CosProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET land_map "icenet.data.masks.osisaf:Masks" +icenet_dataset_create -v -c -p -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $PREDICTION_DATASET +FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.prediction[0]' | tr -d '"'`} 
+icenet_plot_input -p -v dataset_config.${PREDICTION_DATASET}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png diff --git a/prep_training_data.sh b/prep_training_data.sh index dcf64f6..6106ed7 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -21,7 +21,7 @@ if [ $DOWNLOAD -eq 1 ]; then download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS # download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS -fi 2>&1 | tee logs/download.log +fi 2>&1 | tee logs/download.training.log DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" @@ -36,14 +36,14 @@ ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" # TODO: AMSR # TODO: CMIP -GROUND_TRUTH_SIC="osi_sic" -ATMOS_PROC="era5_osi" +GROUND_TRUTH_SIC="osi_sic.$TRAIN_DATA_NAME" +ATMOS_PROC="era5_osi.$TRAIN_DATA_NAME" # Our processed dataset configurations, we localise data when regridding and reprojecting GROUND_TRUTH_SIC_DSC="data/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" -PROCESSED_DATASET="training.${HEMI}" +PROCESSED_DATASET="${TRAIN_DATA_NAME}.${HEMI}" LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" DATASET_NAME="tfdata_${HEMI}" @@ -55,28 +55,23 @@ preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.d preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC - # TODO: didn't seemingly detect missing months? data/osi_sic/month/hemi.north/siconca.missing_days.csv - # TODO: undoubtedly need to include the known invalid dates - added these to the osisaf downloader preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC - # TODO: Interpolation failing in all cases? 
- # TODO: this undoubtedly explains the stray nans present in dataset generation preprocess_dataset $PROC_ARGS_SIC -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ $GROUND_TRUTH_SIC_DSC ${PROCESSED_DATASET}_osisaf - # TODO: plenty of nans contained in here due to failing spatial interpolation - needs investigation HEMI_SHORT="nh" [ $HEMI == "south" ] && HEMI_SHORT="sh" -# TODO: we should be able to preseve data during download-toolbox processing for this, but +# TODO: we should be able to preserve data during download-toolbox processing for this, but # alas this needs some investigation to achieve, so this will work for the moment icenet_generate_ref_osisaf -v data/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC # TODO: get the batcher back in place for multiprocessing this # TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in -preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.nc +preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc # TODO: get the batcher back in place for multiprocessing this preprocess_dataset $PROC_ARGS_ERA5 -v \ @@ -94,10 +89,9 @@ preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map " icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME -FIRST_DATE=`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.train[0]' | tr -d '"'` +FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.train[0]' | tr -d '"'`} icenet_plot_input -p -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png icenet_plot_input --outputs -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/outputs.${HEMI}.${FIRST_DATE}.png 
icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/weights.${HEMI}.${FIRST_DATE}.png icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME - # TODO: FIXME in here to override the creation of nan containing sets due to earlier issues diff --git a/run_prediction.sh b/run_prediction.sh new file mode 100755 index 0000000..321a1ba --- /dev/null +++ b/run_prediction.sh @@ -0,0 +1,50 @@ +#!/usr/bin/bash -l + +set -e -o pipefail + +. ENVS + +conda activate $ICENET_CONDA + +if [ $# -lt 3 ] || [ "$1" == "-h" ]; then + echo "Usage $0 [date_vars] [train_data_name]" + echo " name of prediction]" + echo " model name" + echo " hemisphere to use" + echo "Options: none" + exit 1 +fi + +# obtaining any arguments that should be passed onto run_forecast_plots.sh +OPTIND=1 +while getopts "" opt; do + case "$opt" in + esac +done + +shift $((OPTIND-1)) + +echo "Leftovers from getopt: $@" + +PREDICTION_NAME="prediction.$1" +MODEL="$2" +HEMI="$3" + +# This assumes you're not retraining using the same model name, eek +if [ -d results/networks/${MODEL}_${HEMI} ]; then + SAVEFILE=`ls results/networks/${MODEL}_${HEMI}/${MODEL}_${HEMI}.*.h5 | head -n 1` + DATASET=`echo $SAVEFILE | perl -lpe's/.+\.network_(.+)\.[0-9]+\.h5/$1/'` + echo "First model file: $SAVEFILE" + echo "Dataset model was trained on: $DATASET" +else + echo "Model $MODEL doesn't exist" + exit 1 +fi + +LOADER_NAME="loader.${PREDICTION_NAME}.${HEMI}.json" +jq -c '.sources[].splits["prediction"][]' $LOADER_NAME | sort | uniq | sed -r \ + -e 's/"//g' \ + -e 's/([0-9]{4})_([0-9]{2})_([0-9]{2})/\1-\2-\3/' >${PREDICTION_NAME}.${HEMI}.csv + +./run_predict_ensemble.sh -d -i $DATASET -f $FILTER_FACTOR -p $PREP_SCRIPT \ + ${MODEL}_${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI}.csv diff --git a/script.backup/loader_test_dates.sh b/script.backup/loader_test_dates.sh deleted file mode 100755 index 
3b78579..0000000 --- a/script.backup/loader_test_dates.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -if [ $# -ne 1 ]; then - echo "Usage $0 " - exit 1 -fi - -LOADER_NAME="loader.${1}.json" - -jq -c '.sources[].dates["test"][]' $LOADER_NAME | sort | uniq | sed -r \ - -e 's/"//g' \ - -e 's/([0-9]{4})_([0-9]{2})_([0-9]{2})/\1-\2-\3/' - -exit 0 diff --git a/script.backup/run_prediction.sh b/script.backup/run_prediction.sh deleted file mode 100755 index 5d31bca..0000000 --- a/script.backup/run_prediction.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/bash - -set -e -o pipefail - -. ENVS - -conda activate $ICENET_CONDA - -if [ $# -lt 3 ] || [ "$1" == "-h" ]; then - echo "Usage $0 [date_vars] [train_data_name]" - echo " name of forecast" - echo " model name" - echo " hemisphere to use" - echo "[date_vars] variables for defining start and end dates to forecast" - echo "[train_data_name] name of data used to train the model" - echo "Options: none" - exit 1 -fi - -# obtaining any arguments that should be passed onto run_forecast_plots.sh -OPTIND=1 -while getopts "" opt; do - case "$opt" in - esac -done - -shift $((OPTIND-1)) - -echo "Leftovers from getopt: $@" - -FORECAST="$1" -MODEL="$2" -HEMI="$3" -DATE_VARS="${4:-$FORECAST}" -DATA_PROC="${5:-${TRAIN_DATA_NAME}}_${HEMI}" - -# This assumes you're not retraining using the same model name, eek -if [ -d results/networks/$MODEL ]; then - SAVEFILE=`ls results/networks/${MODEL}/${MODEL}.*.h5 | head -n 1` - DATASET=`echo $SAVEFILE | perl -lpe's/.+\.network_(.+)\.[0-9]+\.h5/$1/'` - echo "First model file: $SAVEFILE" - echo "Dataset model was trained on: $DATASET" -else - echo "Model $MODEL doesn't exist" - exit 1 -fi - -NAME_START="${DATE_VARS^^}_START" -NAME_END="${DATE_VARS^^}_END" -echo "Dates from ENVS: $NAME_START and $NAME_END" -PREDICTION_START=${!NAME_START} -PREDICTION_END=${!NAME_END} - -if [ -z $PREDICTION_START ] || [ -z $PREDICTION_END ]; then - echo "Prediction date args not set correctly: 
\"$PREDICTION_START\" to \"$PREDICTION_END\"" - exit 1 -else - echo "Prediction start arg: $PREDICTION_START" - echo "Prediction end arg: $PREDICTION_END" -fi - -[ ! -z "$PROC_ARGS_ERA5" ] && \ - icenet_process_era5 -r processed/$DATA_PROC/era5/$HEMI \ - $PROC_ARGS_ERA5 \ - -v -l $LAG -ts $PREDICTION_START -te $PREDICTION_END ${FORECAST}_${HEMI} $HEMI - -[ ! -z "$PROC_ARGS_ORAS5" ] && \ - icenet_process_oras5 -r processed/$DATA_PROC/oras5/$HEMI \ - $PROC_ARGS_ORAS5 \ - -v -l $LAG -ts $PREDICTION_START -te $PREDICTION_END ${FORECAST}_${HEMI} $HEMI - -[ ! -z "$PROC_ARGS_SIC" ] && \ - icenet_process_sic -r processed/$DATA_PROC/osisaf/$HEMI \ - $PROC_ARGS_SIC \ - -v -l $LAG -ts $PREDICTION_START -te $PREDICTION_END ${FORECAST}_${HEMI} $HEMI - -icenet_process_metadata ${FORECAST}_${HEMI} $HEMI -icenet_dataset_create -l $LAG -c ${FORECAST}_${HEMI} $HEMI -./loader_test_dates.sh ${FORECAST}_${HEMI} >${FORECAST}_${HEMI}.csv - -./run_predict_ensemble.sh -i $DATASET -f $FILTER_FACTOR -p $PREP_SCRIPT \ - $MODEL ${FORECAST}_${HEMI} ${FORECAST}_${HEMI} ${FORECAST}_${HEMI}.csv From ddc593c2c65e88f39324833b9b3ddac2e40270f0 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 29 Aug 2024 11:46:21 +0100 Subject: [PATCH 24/44] Remapping lag and lead to the forecasting processing --- prep_training_data.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prep_training_data.sh b/prep_training_data.sh index 6106ed7..07a1d89 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -60,6 +60,7 @@ preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,active_grid_ preprocess_dataset $PROC_ARGS_SIC -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ $GROUND_TRUTH_SIC_DSC ${PROCESSED_DATASET}_osisaf HEMI_SHORT="nh" @@ -77,6 +78,7 @@ preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc 
preprocess_dataset $PROC_ARGS_ERA5 -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ $ATMOS_PROC_DSC ${PROCESSED_DATASET}_era5 # TODO: naive copy of "./data/era5_osi/month/hemi.north/uas/2024.nc" earlier is not regridded? # TODO: dask multiprocessing cluster with task batcher across multiple variables would be sensible From c9bfb382ca592445565bf7a3db2230b058c767d6 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 29 Aug 2024 12:25:25 +0100 Subject: [PATCH 25/44] Correcting for localised processed data store --- prep_prediction_data.sh | 3 ++- prep_training_data.sh | 21 ++++++++------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh index 884a6e3..af30826 100755 --- a/prep_prediction_data.sh +++ b/prep_prediction_data.sh @@ -58,7 +58,6 @@ PRED_DATA_START=`date --date "$PREDICTION_START - $LAG ${DATA_FREQUENCY}s" +%Y-% # download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS # TODO: this overwrites the ./data/osisaf/dataset_config.month.hemi.north.json files, which is unacceptable - localise - # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly ) 2>&1 | tee logs/download.${PREDICTION_DATASET}.log DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" @@ -88,6 +87,7 @@ preprocess_dataset $PROC_ARGS_SIC -v \ -r processed/${DATA_PROC}_osisaf/ \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ $OSISAF_DATASET ${PREDICTION_DATASET}_osisaf + # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly if [ ! 
-f ref.osisaf.${HEMI}.nc ]; then echo "Reference OSISAF for regrid should still be available, bailing for the mo" @@ -102,6 +102,7 @@ preprocess_dataset $PROC_ARGS_ERA5 -v \ -r processed/${DATA_PROC}_era5/ \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ $ATMOS_PROC_DSC ${PREDICTION_DATASET}_era5 + # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PREDICTION_DATASET}_osisaf.json processed.${PREDICTION_DATASET}_era5.json preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET sin "icenet.data.meta:SinProcessor" diff --git a/prep_training_data.sh b/prep_training_data.sh index 07a1d89..ed818f3 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -40,8 +40,8 @@ GROUND_TRUTH_SIC="osi_sic.$TRAIN_DATA_NAME" ATMOS_PROC="era5_osi.$TRAIN_DATA_NAME" # Our processed dataset configurations, we localise data when regridding and reprojecting -GROUND_TRUTH_SIC_DSC="data/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" -ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" +GROUND_TRUTH_SIC_DSC="${PROCESSED_DATA_STORE}/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" +ATMOS_PROC_DSC="${PROCESSED_DATA_STORE}/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" PROCESSED_DATASET="${TRAIN_DATA_NAME}.${HEMI}" LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" @@ -50,11 +50,12 @@ DATASET_NAME="tfdata_${HEMI}" ## Workflow preprocess_loader_init -v $PROCESSED_DATASET -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" - preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC + +preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land 
"icenet.data.masks.osisaf:Masks" +preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC polarhole "icenet.data.masks.osisaf:Masks" +preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC active_grid_cell "icenet.data.masks.osisaf:Masks" + preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC preprocess_dataset $PROC_ARGS_SIC -v \ @@ -65,23 +66,17 @@ preprocess_dataset $PROC_ARGS_SIC -v \ HEMI_SHORT="nh" [ $HEMI == "south" ] && HEMI_SHORT="sh" -# TODO: we should be able to preserve data during download-toolbox processing for this, but -# alas this needs some investigation to achieve, so this will work for the moment + icenet_generate_ref_osisaf -v data/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC - # TODO: get the batcher back in place for multiprocessing this - # TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc - # TODO: get the batcher back in place for multiprocessing this preprocess_dataset $PROC_ARGS_ERA5 -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ -sh $LAG -st $FORECAST_LENGTH \ $ATMOS_PROC_DSC ${PROCESSED_DATASET}_era5 - # TODO: naive copy of "./data/era5_osi/month/hemi.north/uas/2024.nc" earlier is not regridded? 
- # TODO: dask multiprocessing cluster with task batcher across multiple variables would be sensible preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_osisaf.json processed.${PROCESSED_DATASET}_era5.json From a9a34c7db71146a98f41826c03133298be7648a0 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 29 Aug 2024 12:47:16 +0100 Subject: [PATCH 26/44] Mask data ref was missing --- prep_training_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prep_training_data.sh b/prep_training_data.sh index ed818f3..9de3f77 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -67,7 +67,7 @@ preprocess_dataset $PROC_ARGS_SIC -v \ HEMI_SHORT="nh" [ $HEMI == "south" ] && HEMI_SHORT="sh" -icenet_generate_ref_osisaf -v data/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc +icenet_generate_ref_osisaf -v ${PROCESSED_DATA_STORE}/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc From 2c2ee7ae0fb24036226270b55f26650ea49ba365 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 30 Aug 2024 17:07:23 +0100 Subject: [PATCH 27/44] Updated for much more efficient copying and processing of prediction datasets --- prep_prediction_data.sh | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh index af30826..4dae82a 100755 --- a/prep_prediction_data.sh +++ b/prep_prediction_data.sh @@ -56,8 +56,6 @@ PRED_DATA_START=`date --date "$PREDICTION_START - $LAG ${DATA_FREQUENCY}s" +%Y-% download_osisaf $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $OSISAF_VAR_ARGS download_era5 $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $ERA5_VAR_ARGS # download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS - - # TODO: this overwrites the 
./data/osisaf/dataset_config.month.hemi.north.json files, which is unacceptable - localise ) 2>&1 | tee logs/download.${PREDICTION_DATASET}.log DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" @@ -66,8 +64,9 @@ DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" # Persistent datasets from the source data store, wherever that is OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" + ATMOS_PROC="era5_osi.$PREDICTION_DATASET" -ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" +ATMOS_PROC_DSC="${PROCESSED_DATA_STORE}/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" # Create links to the central data store datasets for easier "mapping" [ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf @@ -76,16 +75,16 @@ ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" # TODO: CMIP LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" +TRAIN_LOADER_CONFIGURATION="loader.${TRAIN_DATA_NAME}.${HEMI}.json" preprocess_loader_init -v $PREDICTION_DATASET -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" +preprocess_loader_copy $TRAIN_LOADER_CONFIGURATION $PREDICTION_DATASET masks channels preprocess_dataset $PROC_ARGS_SIC -v \ -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ -r processed/${DATA_PROC}_osisaf/ \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ $OSISAF_DATASET ${PREDICTION_DATASET}_osisaf # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly @@ -94,20 +93,22 @@ if [ ! 
-f ref.osisaf.${HEMI}.nc ]; then exit 1 fi -preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC -preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc +preprocess_regrid -v \ + -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ + -sh $LAG -st $FORECAST_LENGTH \ + $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC +preprocess_rotate -v \ + -n uas,vas $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc preprocess_dataset $PROC_ARGS_ERA5 -v \ -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ -r processed/${DATA_PROC}_era5/ \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ $ATMOS_PROC_DSC ${PREDICTION_DATASET}_era5 # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PREDICTION_DATASET}_osisaf.json processed.${PREDICTION_DATASET}_era5.json -preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET sin "icenet.data.meta:SinProcessor" -preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET cos "icenet.data.meta:CosProcessor" -preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET land_map "icenet.data.masks.osisaf:Masks" icenet_dataset_create -v -c -p -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $PREDICTION_DATASET From 0dcdf7b1c91f932a48512e0ea2a95e79a0fe74e1 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 30 Aug 2024 17:15:18 +0100 Subject: [PATCH 28/44] Fixes #53: last amendment for patching model name correctly? 
--- run_prediction.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_prediction.sh b/run_prediction.sh index 321a1ba..87ebf6e 100755 --- a/run_prediction.sh +++ b/run_prediction.sh @@ -47,4 +47,4 @@ jq -c '.sources[].splits["prediction"][]' $LOADER_NAME | sort | uniq | sed -r \ -e 's/([0-9]{4})_([0-9]{2})_([0-9]{2})/\1-\2-\3/' >${PREDICTION_NAME}.${HEMI}.csv ./run_predict_ensemble.sh -d -i $DATASET -f $FILTER_FACTOR -p $PREP_SCRIPT \ - ${MODEL}_${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI}.csv + ${MODEL}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI}.csv From 461e18e8942950ef725d759172d515d950a7e42b Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 30 Aug 2024 17:24:21 +0100 Subject: [PATCH 29/44] Forgot to complete the full configuration naming in prediction copies --- prep_prediction_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh index 4dae82a..03564e9 100755 --- a/prep_prediction_data.sh +++ b/prep_prediction_data.sh @@ -78,7 +78,7 @@ LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" TRAIN_LOADER_CONFIGURATION="loader.${TRAIN_DATA_NAME}.${HEMI}.json" preprocess_loader_init -v $PREDICTION_DATASET -preprocess_loader_copy $TRAIN_LOADER_CONFIGURATION $PREDICTION_DATASET masks channels +preprocess_loader_copy $TRAIN_LOADER_CONFIGURATION loader.${PREDICTION_DATASET}.json masks channels preprocess_dataset $PROC_ARGS_SIC -v \ -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ From 15fa9491781ab0deb98b0572448b218c332a50cb Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 30 Aug 2024 17:41:57 +0100 Subject: [PATCH 30/44] Further fixing of changes to model delimiters --- run_prediction.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run_prediction.sh b/run_prediction.sh index 87ebf6e..8097f9c 100755 --- a/run_prediction.sh +++ 
b/run_prediction.sh @@ -31,8 +31,8 @@ MODEL="$2" HEMI="$3" # This assumes you're not retraining using the same model name, eek -if [ -d results/networks/${MODEL}_${HEMI} ]; then - SAVEFILE=`ls results/networks/${MODEL}_${HEMI}/${MODEL}_${HEMI}.*.h5 | head -n 1` +if [ -d results/networks/${MODEL}.${HEMI} ]; then + SAVEFILE=`ls results/networks/${MODEL}.${HEMI}/${MODEL}.${HEMI}.*.h5 | head -n 1` DATASET=`echo $SAVEFILE | perl -lpe's/.+\.network_(.+)\.[0-9]+\.h5/$1/'` echo "First model file: $SAVEFILE" echo "Dataset model was trained on: $DATASET" From a183f4165232ffe44ad2b024655f03da576367ee Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 30 Aug 2024 17:44:31 +0100 Subject: [PATCH 31/44] Clearing up some cruft and giving the option to supply extra args --- run_prediction.sh | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/run_prediction.sh b/run_prediction.sh index 8097f9c..9d3e7e9 100755 --- a/run_prediction.sh +++ b/run_prediction.sh @@ -15,20 +15,10 @@ if [ $# -lt 3 ] || [ "$1" == "-h" ]; then exit 1 fi -# obtaining any arguments that should be passed onto run_forecast_plots.sh -OPTIND=1 -while getopts "" opt; do - case "$opt" in - esac -done - -shift $((OPTIND-1)) - -echo "Leftovers from getopt: $@" - PREDICTION_NAME="prediction.$1" MODEL="$2" HEMI="$3" +EXTRA_ARGS="${4:-""}" # This assumes you're not retraining using the same model name, eek if [ -d results/networks/${MODEL}.${HEMI} ]; then @@ -46,5 +36,5 @@ jq -c '.sources[].splits["prediction"][]' $LOADER_NAME | sort | uniq | sed -r \ -e 's/"//g' \ -e 's/([0-9]{4})_([0-9]{2})_([0-9]{2})/\1-\2-\3/' >${PREDICTION_NAME}.${HEMI}.csv -./run_predict_ensemble.sh -d -i $DATASET -f $FILTER_FACTOR -p $PREP_SCRIPT \ +./run_predict_ensemble.sh $EXTRA_ARGS -i $DATASET -f $FILTER_FACTOR -p $PREP_SCRIPT \ ${MODEL}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI}.csv From 7a6d2b1a67d8d54cb1569b17fcc9d66f8c1ce7df Mon Sep 17 00:00:00 2001 From: James 
Byrne Date: Mon, 2 Sep 2024 10:17:35 +0100 Subject: [PATCH 32/44] Updating for variable temporal lengths and resolutions --- ensemble/predict.tmpl.yaml | 3 +++ process_op_assets.sh | 11 ++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index 9b84fef..cfe61cf 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -41,6 +41,9 @@ ensemble: - name: execute args: cmd: /usr/bin/ln -s ../../processed + - name: execute + args: + cmd: /usr/bin/ln -s ../../processed_data - name: execute args: cmd: /usr/bin/ln -s ../../ref.osisaf.north.nc diff --git a/process_op_assets.sh b/process_op_assets.sh index 46b8a44..72f8446 100755 --- a/process_op_assets.sh +++ b/process_op_assets.sh @@ -9,6 +9,7 @@ OUTPUT_DIR="results/forecasts/$FORECAST_NAME" LOG_DIR="log/forecasts/$FORECAST_NAME" FORECAST_FILE="results/predict/${FORECAST_NAME}.nc" +FORECAST_LENGTH=`python -c 'import xarray as xr; print(int(xr.open_dataset("'$FORECAST_FILE'").leadtime.max()))'` HEMI=`echo $FORECAST_NAME | sed -r 's/^.+_(north|south)$/\1/'` if [ $# -lt 1 ] || [ "$1" == "-h" ]; then @@ -62,23 +63,23 @@ for DATE_FORECAST in $( cat ${FORECAST_NAME}.csv ); do python -c 'import xarray; xarray.open_dataset("'$FORECAST_FILE'").sel(time=slice("'$DATE_FORECAST'", "'$DATE_FORECAST'")).to_netcdf("'$DATE_DIR'/'$DATE_FORECAST'.nc")' echo "Producing geotiffs from that file" - icenet_output_geotiff -o $DATE_DIR $FORECAST_FILE $DATE_FORECAST 1..93 + icenet_output_geotiff -o $DATE_DIR $FORECAST_FILE $DATE_FORECAST 1..$FORECAST_LENGTH rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.tiff' echo "Producing movie file of raw video" - icenet_plot_forecast $REGION -o $DATE_DIR -l 1..93 -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." 
'*.mp4' echo "Producing stills for manual composition (with coastlines)" - icenet_plot_forecast $REGION -o $DATE_DIR -l 1..93 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH $HEMI $FORECAST_FILE $DATE_FORECAST ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.png' -c:v libx264 ${DATE_DIR}/${FORECAST_NAME}.mp4 rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.png' echo "Producing movie and stills of ensemble standard deviation in predictions" - icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..93 -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.stddev.mp4' - icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..93 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH $HEMI $FORECAST_FILE $DATE_FORECAST ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.stddev.png' -c:v libx264 ${DATE_DIR}/${FORECAST_NAME}.stddev.mp4 rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." 
'*.stddev.png' From 97f137bc6ecd34899d7414ab6181013e7b5c8cb3 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Mon, 2 Sep 2024 10:19:13 +0100 Subject: [PATCH 33/44] Updating hemi regex --- process_op_assets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/process_op_assets.sh b/process_op_assets.sh index 72f8446..f21acf3 100755 --- a/process_op_assets.sh +++ b/process_op_assets.sh @@ -10,7 +10,7 @@ LOG_DIR="log/forecasts/$FORECAST_NAME" FORECAST_FILE="results/predict/${FORECAST_NAME}.nc" FORECAST_LENGTH=`python -c 'import xarray as xr; print(int(xr.open_dataset("'$FORECAST_FILE'").leadtime.max()))'` -HEMI=`echo $FORECAST_NAME | sed -r 's/^.+_(north|south)$/\1/'` +HEMI=`echo $FORECAST_NAME | sed -r 's/^.+\.(north|south)$/\1/'` if [ $# -lt 1 ] || [ "$1" == "-h" ]; then echo "$0 [region]" From 8344fdd1e75a578d4389f6a293f4dac2c849adb6 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Tue, 3 Sep 2024 17:18:39 +0100 Subject: [PATCH 34/44] Updated for producing op assets with environmental forecasting --- process_op_assets.sh | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/process_op_assets.sh b/process_op_assets.sh index f21acf3..5ea098c 100755 --- a/process_op_assets.sh +++ b/process_op_assets.sh @@ -12,6 +12,9 @@ FORECAST_FILE="results/predict/${FORECAST_NAME}.nc" FORECAST_LENGTH=`python -c 'import xarray as xr; print(int(xr.open_dataset("'$FORECAST_FILE'").leadtime.max()))'` HEMI=`echo $FORECAST_NAME | sed -r 's/^.+\.(north|south)$/\1/'` +GROUND_TRUTH_DS=`jq -r 'first(.sources[]).dataset_config' loader.${FORECAST_NAME}.json` +GROUND_TRUTH_DIR=`dirname $( jq -r '.data._var_files.siconca[0]' $GROUND_TRUTH_DS )` + if [ $# -lt 1 ] || [ "$1" == "-h" ]; then echo "$0 [region]" exit 1 @@ -24,7 +27,6 @@ if ! 
[[ $HEMI =~ ^(north|south)$ ]]; then exit 1 fi - function produce_docs { local DIR=$1 @@ -43,7 +45,6 @@ function rename_gfx { done } - for WORKING_DIR in "$OUTPUT_DIR" "$LOG_DIR"; do if [ -d $WORKING_DIR ]; then echo "Output directory $WORKING_DIR already exists, removing" @@ -67,20 +68,21 @@ for DATE_FORECAST in $( cat ${FORECAST_NAME}.csv ); do rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.tiff' echo "Producing movie file of raw video" - icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.mp4' echo "Producing stills for manual composition (with coastlines)" - icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH $HEMI $FORECAST_FILE $DATE_FORECAST - ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.png' -c:v libx264 ${DATE_DIR}/${FORECAST_NAME}.mp4 + icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST + # Removed -c:v libx264 + ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.png' ${DATE_DIR}/${FORECAST_NAME}.mp4 rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.png' echo "Producing movie and stills of ensemble standard deviation in predictions" - icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." 
'*.stddev.mp4' - icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH $HEMI $FORECAST_FILE $DATE_FORECAST - ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.stddev.png' -c:v libx264 ${DATE_DIR}/${FORECAST_NAME}.stddev.mp4 + icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST + ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.stddev.png' ${DATE_DIR}/${FORECAST_NAME}.stddev.mp4 rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.stddev.png' produce_docs $DATE_DIR @@ -90,7 +92,7 @@ for DATE_FORECAST in $( cat ${FORECAST_NAME}.csv ); do echo "Producing binary accuracy plots (these are meaningless forecasting into the future w.r.t the OSISAF data)" - SIC_FILENAME="./data/osisaf/${HEMI}/siconca/`date +%Y`.nc" + SIC_FILENAME="${GROUND_TRUTH_DIR}/`date +%Y`.nc" # Get the most recent day, sorry for ignoring all timezone information SIC_LATEST=`python -c 'import xarray; print(str(xarray.open_dataset("'$SIC_FILENAME'").time.values[-1])[0:10])'` @@ -100,20 +102,20 @@ for DATE_FORECAST in $( cat ${FORECAST_NAME}.csv ); do for THRESHOLD in 0.15 0.5 0.8 0.9; do icenet_plot_bin_accuracy $REGION -e -b -t $THRESHOLD \ -o ${DATE_DIR}/bin_accuracy.${THRESHOLD}.png \ - $HEMI $FORECAST_FILE $DATE_FORECAST + $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST done icenet_plot_metrics $REGION -e -b -s \ -o ${DATE_DIR}/ \ - $HEMI $FORECAST_FILE $DATE_FORECAST + $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST icenet_plot_sic_error $REGION \ -o ${DATE_DIR}/${DATE_FORECAST}.sic_error.mp4 \ - $HEMI $FORECAST_FILE $DATE_FORECAST + $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST icenet_plot_sie_error $REGION -e -b \ -o ${DATE_DIR}/${DATE_FORECAST}.sie_error.25.png \ - $HEMI $FORECAST_FILE $DATE_FORECAST + $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST else echo "We do not have observational SIC data ($SIC_LATEST) for plotting \ forecast date $DATE_FORECAST" From 
0a5960b693f22d30c139a43624542fc831cf7fe2 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 4 Sep 2024 08:18:22 +0100 Subject: [PATCH 35/44] Updating plotting commands --- plot_forecast.sh | 54 ++++++++++++++++++++++----------------------- plot_validations.sh | 22 +++++++++--------- 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/plot_forecast.sh b/plot_forecast.sh index 3b40954..c877587 100755 --- a/plot_forecast.sh +++ b/plot_forecast.sh @@ -2,11 +2,10 @@ source ENVS -if [ $# -lt 2 ] || [ "$1" == "-h" ]; then - echo -e "\nUsage $0 " +if [ $# -lt 1 ] || [ "$1" == "-h" ]; then + echo -e "\nUsage $0 " echo -e "\nArguments" echo " name of forecast" - echo " hemisphere to use" echo -e "\nOptions" echo "-m string of metrics separated by commas, by default \"binacc,sie,mae,rmse,sic\". Options: \"binacc\", \"sie\", \"mae\", \"mse\", \"rmse\", \"sic\"" echo "-r region arguments, by default uses full hemisphere" @@ -17,7 +16,7 @@ if [ $# -lt 2 ] || [ "$1" == "-h" ]; then echo "-g grid area resolution to use - i.e. the length of the sides in km, by default 25 (i.e. 
25km^2)" echo "-o output directory path to store plots, by default \"plot/\"" echo -e "\nList of outputs generated" - echo "* If \"binacc\" is included in the requested metrics, will generate all binary accuracy plots for dates in _.csv" + echo "* If \"binacc\" is included in the requested metrics, will generate all binary accuracy plots for dates in .csv" echo "- these will be saved in the format \"/binacc.t_..png\"" echo "If \"-l\" is passed, leadtime averaged plots for binary accuracy will be generated too:" echo " - averaging over all: \"/binacc.t__leadtime_avg_all.png\"" @@ -26,7 +25,7 @@ if [ $# -lt 2 ] || [ "$1" == "-h" ]; then echo " - averaging by month and for target date: \"/binacc.t__leadtime_avg_target_month.png\"" echo " - averaging by day and for target date: \"/binacc.t__leadtime_avg_target_day.png\"" echo "If \"-v\" is passed, a video will be produced to stitch all these plots together and saved in \"/binacc.t_.mp4\"" - echo "* If \"sie\" is included in the requested metrics, will generate all SIE error plots for dates in _.csv" + echo "* If \"sie\" is included in the requested metrics, will generate all SIE error plots for dates in .csv" echo "(these will be saved in the format \"/sie.t_.g_..png\")" echo "If \"-l\" is passed, leadtime averaged plots for SIE error will be generated too:" echo " - averaging over all: \"/sie.t_.g__leadtime_avg_all.png\"" @@ -35,7 +34,7 @@ if [ $# -lt 2 ] || [ "$1" == "-h" ]; then echo " - averaging by month and for target date: \"/sie.t_.g__leadtime_avg_target_month.png\"" echo " - averaging by day and for target date: \"/sie.t_.g__leadtime_avg_target_day.png\"" echo "If \"-v\" is passed, a video will be produced to stitch all these plots together and saved in \"/sie.t_.g_.mp4\"" - echo "* If \"mae\", \"mse\", or \"rmse\" is included in the requested metrics, will generate all MAE, MSE, or RMSE plots for dates in _.csv" + echo "* If \"mae\", \"mse\", or \"rmse\" is included in the requested metrics, will generate all 
MAE, MSE, or RMSE plots for dates in .csv" echo "the names for the plots follow a similar convention as above but without the threshold or grid-area-size being saved in the name..." echo "for instance, for a given , these will be saved in the format \"/..png\"" echo "If \"-l\" is passed, leadtime averaged plots for will be generated too:" @@ -47,7 +46,7 @@ if [ $# -lt 2 ] || [ "$1" == "-h" ]; then echo "Note that if $\"-e\" is passed, all of these will have \"_comp\" after \"avg\"" echo "The plot of the standard deviation of the metric for each forecast will also be generated" echo "If \"-v\" is passed, a video will be produced to stitch all these plots together and saved in \"/.mp4\"" - echo "* If \"sic\" is included in the requested metrics, will generate all SIC error videos for dates in _.csv" + echo "* If \"sic\" is included in the requested metrics, will generate all SIC error videos for dates in .csv" echo "(these will be saved in the format \"/sic..mp4\")" exit 1 fi @@ -106,18 +105,17 @@ shift $((OPTIND-1)) # echo "Leftovers from getopt: $@" -FORECAST="$1" -HEMI="$2" - -FORECAST_NAME=${FORECAST}_${HEMI} +FORECAST_NAME="$1" FORECAST_FILE="results/predict/${FORECAST_NAME}.nc" LOG_PREFIX="logs/${FORECAST_NAME}" -BINACC_LOG="${LOG_PREFIX}_binacc.log" -SIE_LOG="${LOG_PREFIX}_sie.log" -MAE_LOG="${LOG_PREFIX}_mae.log" -MSE_LOG="${LOG_PREFIX}_mse.log" -RMSE_LOG="${LOG_PREFIX}_rmse.log" -SICERR_LOG="${LOG_PREFIX}_sic.log" +BINACC_LOG="${LOG_PREFIX}.binacc.log" +SIE_LOG="${LOG_PREFIX}.sie.log" +MAE_LOG="${LOG_PREFIX}.mae.log" +MSE_LOG="${LOG_PREFIX}.mse.log" +RMSE_LOG="${LOG_PREFIX}.rmse.log" +SICERR_LOG="${LOG_PREFIX}.sic.log" + +GROUND_TRUTH_DS=`jq -r 'first(.sources[]).dataset_config' loader.${FORECAST_NAME}.json` if [ "${REQUESTED_OUTPUT_DIR}" == "" ]; then OUTPUT_DIR="plot/${FORECAST_NAME}" @@ -143,29 +141,29 @@ cat ${FORECAST_NAME}.csv | while read -r FORECAST_DATE; do OUTPUT="${OUTPUT_DIR}/${element}.t_${THRESHOLD:3}.${FORECAST_DATE}.png" echo "Producing 
binary accuracy plot for $FORECAST_DATE (${OUTPUT})" icenet_plot_bin_accuracy -b $E_FLAG -v $REGION -o $OUTPUT $THRESHOLD \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $BINACC_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $BINACC_LOG 2>&1 elif [ "${element}" == "sie" ]; then OUTPUT="${OUTPUT_DIR}/${element}.t_${THRESHOLD:3}.ga_${GRID_AREA_SIZE:4}.${FORECAST_DATE}.png" echo "Producing sea ice extent error plot for $FORECAST_DATE (${OUTPUT})" icenet_plot_sie_error -b $E_FLAG -v $REGION -o $OUTPUT $THRESHOLD $GRID_AREA_SIZE \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $SIE_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $SIE_LOG 2>&1 elif [ "${element}" == "mae" ]; then echo "Producing MAE plot for $FORECAST_DATE (${OUTPUT})" icenet_plot_metrics -b $E_FLAG -v $REGION -m $element -o $OUTPUT \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $MAE_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $MAE_LOG 2>&1 elif [ "${element}" == "mse" ]; then echo "Producing MSE plot for $FORECAST_DATE (${OUTPUT})" icenet_plot_metrics -b $E_FLAG -v $REGION -m $element -o $OUTPUT \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $MSE_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $MSE_LOG 2>&1 elif [ "${element}" == "rmse" ]; then echo "Producing RMSE plot for $FORECAST_DATE (${OUTPUT})" icenet_plot_metrics -b $E_FLAG -v $REGION -m $element -o $OUTPUT \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $RMSE_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $RMSE_LOG 2>&1 elif [ "${element}" == "sic" ]; then OUTPUT="${OUTPUT_DIR}/${element}.${FORECAST_DATE}.mp4" echo "Producing SIC error video for $FORECAST_DATE (${OUTPUT})" icenet_plot_sic_error -v $REGION -o $OUTPUT \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $SICERR_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $SICERR_LOG 2>&1 fi done done @@ -208,33 +206,33 @@ if [[ "${LEADTIME_AVG}" == true ]]; then echo "Plots produced:" # averaging over all OUTPUT="${OUTPUT_PATH_START}_all.png" 
- icenet_plot_leadtime_avg $HEMI $FORECAST_FILE $REGION \ + icenet_plot_leadtime_avg $GROUND_TRUTH_DS $FORECAST_FILE $REGION \ -m $element -ao "all" -s -sm 1 $E_FLAG $THRESHOLD $GRID_AREA_SIZE \ -dp $DATA_PATH -o $OUTPUT >> $LOGFILE 2>&1 echo "* ${OUTPUT}" ##### initialisation day # averaging over monthly OUTPUT="${OUTPUT_PATH_START}_init_month.png" - icenet_plot_leadtime_avg $HEMI $FORECAST_FILE $REGION \ + icenet_plot_leadtime_avg $GROUND_TRUTH_DS $FORECAST_FILE $REGION \ -m $element -ao "month" -s $E_FLAG $THRESHOLD $GRID_AREA_SIZE \ -dp $DATA_PATH -o $OUTPUT >> $LOGFILE 2>&1 echo "* ${OUTPUT}" # averaging over daily OUTPUT="${OUTPUT_PATH_START}_init_day.png" - icenet_plot_leadtime_avg $HEMI $FORECAST_FILE $REGION \ + icenet_plot_leadtime_avg $GROUND_TRUTH_DS $FORECAST_FILE $REGION \ -m $element -ao "day" -s $E_FLAG $THRESHOLD $GRID_AREA_SIZE \ -dp $DATA_PATH -o $OUTPUT >> $LOGFILE 2>&1 echo "* ${OUTPUT}" ##### target day # averaging over monthly OUTPUT="${OUTPUT_PATH_START}_target_month.png" - icenet_plot_leadtime_avg $HEMI $FORECAST_FILE $REGION \ + icenet_plot_leadtime_avg $GROUND_TRUTH_DS $FORECAST_FILE $REGION \ -m $element -ao "month" -s -td $E_FLAG $THRESHOLD $GRID_AREA_SIZE \ -dp $DATA_PATH -o $OUTPUT >> $LOGFILE 2>&1 echo "* ${OUTPUT}" # averaging over daily OUTPUT="${OUTPUT_PATH_START}_target_day.png" - icenet_plot_leadtime_avg $HEMI $FORECAST_FILE $REGION \ + icenet_plot_leadtime_avg $GROUND_TRUTH_DS $FORECAST_FILE $REGION \ -m $element -ao "day" -s -td $E_FLAG $THRESHOLD $GRID_AREA_SIZE \ -dp $DATA_PATH -o $OUTPUT >> $LOGFILE 2>&1 echo "* ${OUTPUT}" diff --git a/plot_validations.sh b/plot_validations.sh index 17c6d36..fcf56eb 100755 --- a/plot_validations.sh +++ b/plot_validations.sh @@ -2,11 +2,10 @@ source ENVS -if [ $# -lt 2 ] || [ "$1" == "-h" ]; then +if [ $# -lt 1 ] || [ "$1" == "-h" ]; then echo -e "\nUsage $0 " echo -e "\nArguments" echo " name of forecast" - echo " hemisphere to use" echo -e "\nOptions" echo "-m string of metrics separated 
by commas, by default \"binacc,sie,mae,rmse,sic\"" echo "-r region arguments, by default uses full hemisphere" @@ -27,6 +26,7 @@ THRESHOLDS=(0.15, 0.8) GRID_AREA_SIZE="-g 25" REQUESTED_OUTPUT_DIR="" OPTIND=1 + while getopts "m:r:t:g:o:" opt; do case "$opt" in m) METRICS=${OPTARG} ;; @@ -56,9 +56,7 @@ shift $((OPTIND-1)) # echo "Leftovers from getopt: $@" -FORECAST="$1" -HEMI="$2" -FORECAST_NAME=${FORECAST}_${HEMI} +FORECAST_NAME=${1} if [ "${REQUESTED_OUTPUT_DIR}" == "" ]; then OUTPUT_DIR="plot/validation/${FORECAST_NAME}" @@ -75,20 +73,20 @@ for element in "${METRICS[@]}" if [ "${element}" == "binacc" ]; then for THRESH in ${THRESHOLDS[@]}; do ./plot_forecast.sh -m ${element} $REGION -v -l -t $THRESH \ - -o $OUTPUT_DIR $FORECAST $HEMI + -o $OUTPUT_DIR $FORECAST_NAME ./plot_forecast.sh -m ${element} $REGION -e -v -l -t $THRESH \ - -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI + -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST_NAME done elif [ "${element}" == "sie" ]; then for THRESH in ${THRESHOLDS[@]}; do ./plot_forecast.sh -m ${element} $REGION -v -l -t $THRESH $GRID_AREA_SIZE \ - -o $OUTPUT_DIR $FORECAST $HEMI + -o $OUTPUT_DIR $FORECAST_NAME ./plot_forecast.sh -m ${element} $REGION -e -v -l -t $THRESH $GRID_AREA_SIZE \ - -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI + -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST_NAME done elif [ "${element}" == "sic" ]; then ./plot_forecast.sh -m ${element} $REGION -v \ - -o $OUTPUT_DIR $FORECAST $HEMI + -o $OUTPUT_DIR $FORECAST_NAME else if [ "${element}" == "mae" ]; then LOGFILE="${MAE_LOG}" @@ -98,8 +96,8 @@ for element in "${METRICS[@]}" LOGFILE="${RMSE_LOG}" fi ./plot_forecast.sh -m ${element} $REGION -v -l \ - -o $OUTPUT_DIR $FORECAST $HEMI + -o $OUTPUT_DIR $FORECAST_NAME ./plot_forecast.sh -m ${element} $REGION -e -v -l \ - -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI + -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST_NAME fi done From 4923cf92f70fce8fc546d5c9f744270c3d10d1c1 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 4 Sep 2024 
11:23:27 +0100 Subject: [PATCH 36/44] Updating template dates --- template_LICENSE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template_LICENSE.md b/template_LICENSE.md index 6154e7c..84828f7 100644 --- a/template_LICENSE.md +++ b/template_LICENSE.md @@ -1 +1 @@ -Unless otherwise stated, all content is © British Antarctic Survey and The Alan Turing Institute 2023 and made available via the [Open Government License](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) which is compatible with the [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/) +Unless otherwise stated, all content is © British Antarctic Survey and The Alan Turing Institute 2024 and made available via the [Open Government License](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) which is compatible with the [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/) From 1e63b0198b4b8f8c06c04f1e41562948e768d376 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 5 Sep 2024 17:27:01 +0100 Subject: [PATCH 37/44] Validating and sorting out spatial interpolation --- prep_training_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prep_training_data.sh b/prep_training_data.sh index 9de3f77..bb0d3c1 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -56,7 +56,7 @@ preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land "icenet. 
preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC polarhole "icenet.data.masks.osisaf:Masks" preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC active_grid_cell "icenet.data.masks.osisaf:Masks" -preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC +preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,inactive_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC preprocess_dataset $PROC_ARGS_SIC -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ From 795e9c7dc8bbb3a039845cf1c46148ee1b31dc5c Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 6 Sep 2024 09:34:24 +0100 Subject: [PATCH 38/44] Clearing some comments and TODOs --- prep_prediction_data.sh | 5 +---- prep_training_data.sh | 13 +++++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh index 03564e9..91de790 100755 --- a/prep_prediction_data.sh +++ b/prep_prediction_data.sh @@ -52,6 +52,7 @@ LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" PRED_DATA_START=`date --date "$PREDICTION_START - $LAG ${DATA_FREQUENCY}s" +%Y-%m-%d` # download-toolbox integration ( + # We don't do AMSR2 and CMIP as part of this, but everything is similar if you want to ;) # download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS download_osisaf $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $OSISAF_VAR_ARGS download_era5 $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $ERA5_VAR_ARGS @@ -71,8 +72,6 @@ ATMOS_PROC_DSC="${PROCESSED_DATA_STORE}/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" # Create links to the central data store datasets for easier "mapping" [ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf [ ! 
-e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 -# TODO: AMSR -# TODO: CMIP LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" TRAIN_LOADER_CONFIGURATION="loader.${TRAIN_DATA_NAME}.${HEMI}.json" @@ -86,7 +85,6 @@ preprocess_dataset $PROC_ARGS_SIC -v \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ -sh $LAG -st $FORECAST_LENGTH \ $OSISAF_DATASET ${PREDICTION_DATASET}_osisaf - # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly if [ ! -f ref.osisaf.${HEMI}.nc ]; then echo "Reference OSISAF for regrid should still be available, bailing for the mo" @@ -106,7 +104,6 @@ preprocess_dataset $PROC_ARGS_ERA5 -v \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ -sh $LAG -st $FORECAST_LENGTH \ $ATMOS_PROC_DSC ${PREDICTION_DATASET}_era5 - # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PREDICTION_DATASET}_osisaf.json processed.${PREDICTION_DATASET}_era5.json diff --git a/prep_training_data.sh b/prep_training_data.sh index bb0d3c1..188c39d 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -33,8 +33,6 @@ ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" # Create links to the central data store datasets for easier "mapping" [ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf [ ! 
-e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 -# TODO: AMSR -# TODO: CMIP GROUND_TRUTH_SIC="osi_sic.$TRAIN_DATA_NAME" ATMOS_PROC="era5_osi.$TRAIN_DATA_NAME" @@ -50,13 +48,20 @@ DATASET_NAME="tfdata_${HEMI}" ## Workflow preprocess_loader_init -v $PROCESSED_DATASET -preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC +# We CAN supply splits and lead / lag to prevent unnecessarily large copies of datasets +# or interpolation of time across huge spans +# TODO: temporal interpolation limiting +preprocess_missing_time \ +# -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ +# -sh $LAG -st $FORECAST_LENGTH \ + -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land "icenet.data.masks.osisaf:Masks" preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC polarhole "icenet.data.masks.osisaf:Masks" preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC active_grid_cell "icenet.data.masks.osisaf:Masks" -preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,inactive_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC +preprocess_missing_spatial \ + -m processed.masks.${HEMI}.json -mp land,inactive_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC preprocess_dataset $PROC_ARGS_SIC -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ From 2d3ad5e75bb7b084249bb35f71146d2d4c2eebb3 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 10 Jan 2025 12:35:13 +0000 Subject: [PATCH 39/44] AMSR2 dataset generation now working --- prep_amsr_training_data.sh | 90 +++++++++++++++++++ ...ng_data.sh => prep_osisaf_training_data.sh | 0 2 files changed, 90 insertions(+) create mode 100755 prep_amsr_training_data.sh rename prep_training_data.sh => prep_osisaf_training_data.sh (100%) diff 
--git a/prep_amsr_training_data.sh b/prep_amsr_training_data.sh new file mode 100755 index 0000000..901fe29 --- /dev/null +++ b/prep_amsr_training_data.sh @@ -0,0 +1,90 @@ +HEMI="$1" +DOWNLOAD=${2:-0} + +# download-toolbox integration +# This updates our source +if [ $DOWNLOAD -eq 1 ]; then + download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS + download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS + download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS + download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS +fi 2>&1 | tee logs/download.training.log + +### TODO: + +DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" + +# preprocess-toolbox integration +# Persistent datasets from the source data store, wherever that is +AMSR2_DATASET="${SOURCE_DATA_STORE}/amsr2_6250/${DATASET_CONFIG_NAME}" +CMIP6_DATASET="${SOURCE_DATA_STORE}/cmip6.MRI-ESM2-0.r1i1p1f1/${DATASET_CONFIG_NAME}" +ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" +OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" + +# Create links to the central data store datasets for easier "mapping" +[ ! -e data/amsr2_6250 ] && [ -d ${SOURCE_DATA_STORE}/amsr2_6250 ] && ln -s ${SOURCE_DATA_STORE}/amsr2_6250 ./data/amsr2_6250 +[ ! -e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 +[ ! -e data/cmip6.MRI-ESM2-0.r1i1p1f1 ] && [ -d ${SOURCE_DATA_STORE}/cmip6.MRI-ESM2-0.r1i1p1f1 ] && ln -s ${SOURCE_DATA_STORE}/cmip6.MRI-ESM2-0.r1i1p1f1 ./data/cmip6.MRI-ESM2-0.r1i1p1f1 +[ ! 
-e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf + +PROCESSED_DATASET="${TRAIN_DATA_NAME}.${HEMI}" +LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" +DATASET_NAME="tfdata_${HEMI}" + +ATMOS_PROC="era5_amsr.$TRAIN_DATA_NAME" +ATMOS_PROC_DSC="${PROCESSED_DATA_STORE}/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" +GROUND_TRUTH_SIC="amsr2_sic.$TRAIN_DATA_NAME" +GROUND_TRUTH_SIC_DSC="${PROCESSED_DATA_STORE}/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" + +### +# Three stage training +# + +## +# Stage #1: CMIP6 ground truth with ERA5 +# + +## +# Stage #2: OSISAF ground truth with ERA5 +# + +## +# Stage #3: AMSR2 ground truth with ERA5 +# + +preprocess_loader_init -v $PROCESSED_DATASET +preprocess_add_mask -v $LOADER_CONFIGURATION $AMSR2_DATASET land "icenet.data.masks.nsidc:Masks" + +preprocess_missing_time -n siconca -v $AMSR2_DATASET $GROUND_TRUTH_SIC + +preprocess_dataset $PROC_ARGS_SIC -v \ + -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ + -i "icenet.data.processors.amsr:AMSR2PreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ + $AMSR2_DATASET ${PROCESSED_DATASET}_amsr + +# IS THIS NEEDED? 
icenet_generate_ref_amsr -v ${PROCESSED_DATA_STORE}/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc +# ln -s data/amsr2_6250/siconca/2014/asi-AMSR2-s6250-20140630-v5.4.nc ref.amsr.${HEMI}.nc + +preprocess_regrid -v $ERA5_DATASET ref.amsr.${HEMI}.nc $ATMOS_PROC + +preprocess_dataset $PROC_ARGS_ERA5 -v \ + -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ + -i "icenet.data.processors.cds:ERA5PreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ + $ATMOS_PROC_DSC ${PROCESSED_DATASET}_era5 + +preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_amsr.json processed.${PROCESSED_DATASET}_era5.json + +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icenet.data.meta:SinProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.nsidc:Masks" + +icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME + +FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.train[0]' | tr -d '"'`} +icenet_plot_input -p -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --outputs -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/outputs.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/weights.${HEMI}.${FIRST_DATE}.png + +icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME diff --git a/prep_training_data.sh b/prep_osisaf_training_data.sh similarity index 100% rename from prep_training_data.sh rename to prep_osisaf_training_data.sh From 11c4b20d11d0c99d3fbf545bc1d36c3819763a98 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 
10 Jan 2025 23:36:55 +0000 Subject: [PATCH 40/44] Restrict the amount of copying on regrid for AMSR --- prep_amsr_training_data.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/prep_amsr_training_data.sh b/prep_amsr_training_data.sh index 901fe29..97fa89d 100755 --- a/prep_amsr_training_data.sh +++ b/prep_amsr_training_data.sh @@ -64,9 +64,11 @@ preprocess_dataset $PROC_ARGS_SIC -v \ $AMSR2_DATASET ${PROCESSED_DATASET}_amsr # IS THIS NEEDED? icenet_generate_ref_amsr -v ${PROCESSED_DATA_STORE}/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc -# ln -s data/amsr2_6250/siconca/2014/asi-AMSR2-s6250-20140630-v5.4.nc ref.amsr.${HEMI}.nc +[ ! -f ref.amsr.${HEMI}.nc ] && ln -s data/amsr2_6250/siconca/2014/asi-AMSR2-s6250-20140630-v5.4.nc ref.amsr.${HEMI}.nc -preprocess_regrid -v $ERA5_DATASET ref.amsr.${HEMI}.nc $ATMOS_PROC +preprocess_regrid -v \ + -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ + $ERA5_DATASET ref.amsr.${HEMI}.nc $ATMOS_PROC preprocess_dataset $PROC_ARGS_ERA5 -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ From 8ae686a1e2246bbbb5afe29d224f67d19a54003b Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 10 Jan 2025 23:45:50 +0000 Subject: [PATCH 41/44] Changing name --- prep_amsr_training_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prep_amsr_training_data.sh b/prep_amsr_training_data.sh index 97fa89d..20613d3 100755 --- a/prep_amsr_training_data.sh +++ b/prep_amsr_training_data.sh @@ -29,7 +29,7 @@ OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" PROCESSED_DATASET="${TRAIN_DATA_NAME}.${HEMI}" LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" -DATASET_NAME="tfdata_${HEMI}" +DATASET_NAME="tfamsr_${HEMI}" ATMOS_PROC="era5_amsr.$TRAIN_DATA_NAME" ATMOS_PROC_DSC="${PROCESSED_DATA_STORE}/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" From 
bf97ad6ed08812e379adae2db33f8cda3e44f1ef Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 10 Jan 2025 23:48:12 +0000 Subject: [PATCH 42/44] ENVS --- prep_amsr_training_data.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prep_amsr_training_data.sh b/prep_amsr_training_data.sh index 20613d3..71989d4 100755 --- a/prep_amsr_training_data.sh +++ b/prep_amsr_training_data.sh @@ -1,6 +1,8 @@ HEMI="$1" DOWNLOAD=${2:-0} +source ENVS + # download-toolbox integration # This updates our source if [ $DOWNLOAD -eq 1 ]; then From 735d96e2b4f61b771513f4a58a2a6ea28d28d672 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Tue, 14 Jan 2025 08:48:33 +0000 Subject: [PATCH 43/44] Adding comments for transfer --- prep_amsr_prediction_data.sh | 75 ++++++++++++++++++++++++++++++++++++ prep_amsr_training_data.sh | 6 ++- 2 files changed, 79 insertions(+), 2 deletions(-) create mode 100755 prep_amsr_prediction_data.sh diff --git a/prep_amsr_prediction_data.sh b/prep_amsr_prediction_data.sh new file mode 100755 index 0000000..4d66381 --- /dev/null +++ b/prep_amsr_prediction_data.sh @@ -0,0 +1,75 @@ +HEMI="$1" +DOWNLOAD=${2:-0} + +source ENVS + +## +# TODO: Usable as is for training, but for prediction we need to restrict this to relevant activities and dates +# ./run_prediction.sh fc.09_12.2024 amsr_6k_6m_120125.south south + +# TODO: assuming monthly? 
+# TODO: shift the FORECAST_START into the past for LAG +export FORECAST_START="2024-09-01" +export FORECAST_END="2024-12-31" +export HEMI=south +export FORECAST_NAME="fc.09_12.2024" + +# download-toolbox integration +# This updates our source +if [ $DOWNLOAD -eq 1 ]; then + download_amsr2 $DATA_ARGS $HEMI $FORECAST_START $FORECAST_END $AMSR2_VAR_ARGS + download_era5 $DATA_ARGS $HEMI $FORECAST_START $FORECAST_END $ERA5_VAR_ARGS +fi 2>&1 | tee logs/download.prediction.log + +SOURCE_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" + +AMSR2_DATASET="${SOURCE_DATA_STORE}/amsr2_6250/${SOURCE_CONFIG_NAME}" +ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${SOURCE_CONFIG_NAME}" +AMSR2_PROCESSED="processed.${TRAIN_DATA_NAME}.${HEMI}_amsr.json" +ERA5_PROCESSED="processed.${TRAIN_DATA_NAME}.${HEMI}_era5.json" + +# preprocess-toolbox integration +# Persistent datasets from the source data store, wherever that is +FORECAST_DATASET="prediction.${FORECAST_NAME}.${HEMI}" +LOADER_CONFIGURATION="loader.${FORECAST_DATASET}.json" + +ATMOS_PROC="${TRAIN_DATA_NAME}.${HEMI}_era5" +ATMOS_PROC_DIR="processed/${ATMOS_PROC}" +GROUND_TRUTH_SIC="${TRAIN_DATA_NAME}.${HEMI}_amsr" +GROUND_TRUTH_SIC_DIR="processed/${GROUND_TRUTH_SIC}" + + +preprocess_loader_init -v $FORECAST_DATASET +preprocess_add_mask -v $LOADER_CONFIGURATION $AMSR2_DATASET land "icenet.data.masks.nsidc:Masks" + +preprocess_dataset $PROC_ARGS_SIC -v \ + -r $GROUND_TRUTH_SIC_DIR \ + -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + -i "icenet.data.processors.amsr:AMSR2PreProcessor" \ + -sh $LAG \ + $AMSR2_DATASET ${FORECAST_NAME}_amsr + +preprocess_regrid -v \ + -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + $ERA5_DATASET ref.amsr.${HEMI}.nc ${FORECAST_NAME}_era5 + +preprocess_dataset $PROC_ARGS_ERA5 -v \ + -r $ATMOS_PROC_DIR \ + -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + -i "icenet.data.processors.cds:ERA5PreProcessor" \ + -sh $LAG \ + 
${PROCESSED_DATA_STORE}/${FORECAST_NAME}_era5/${SOURCE_CONFIG_NAME} ${FORECAST_NAME}_era5 + +preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_amsr.json processed.${PROCESSED_DATASET}_era5.json + +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icenet.data.meta:SinProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.nsidc:Masks" + +icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $FORECAST_DATASET + +FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.test[0]' | tr -d '"'`} +icenet_plot_input -p -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --outputs -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/outputs.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --weights -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/weights.${HEMI}.${FIRST_DATE}.png + diff --git a/prep_amsr_training_data.sh b/prep_amsr_training_data.sh index 71989d4..7dc03c2 100755 --- a/prep_amsr_training_data.sh +++ b/prep_amsr_training_data.sh @@ -3,6 +3,10 @@ DOWNLOAD=${2:-0} source ENVS +## +# TODO: Usable as is for training, but for prediction we need to restrict this to relevant activities and dates +# ./run_prediction.sh amsr_fc.09_12.2024 amsr_6k_6m_120125.south south + # download-toolbox integration # This updates our source if [ $DOWNLOAD -eq 1 ]; then @@ -12,8 +16,6 @@ if [ $DOWNLOAD -eq 1 ]; then download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS fi 2>&1 | tee logs/download.training.log -### TODO: - DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" # preprocess-toolbox integration From 
acf37e64c94d5d7504aa9b78b6f44eb30905ff28 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 15 Jan 2025 16:41:08 +0000 Subject: [PATCH 44/44] Updating for revised split names --- prep_amsr_prediction_data.sh | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/prep_amsr_prediction_data.sh b/prep_amsr_prediction_data.sh index 4d66381..f2d764a 100755 --- a/prep_amsr_prediction_data.sh +++ b/prep_amsr_prediction_data.sh @@ -44,32 +44,30 @@ preprocess_add_mask -v $LOADER_CONFIGURATION $AMSR2_DATASET land "icenet.data.ma preprocess_dataset $PROC_ARGS_SIC -v \ -r $GROUND_TRUTH_SIC_DIR \ - -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + -sn "prediction" -ss "$FORECAST_START" -se "$FORECAST_END" \ -i "icenet.data.processors.amsr:AMSR2PreProcessor" \ -sh $LAG \ $AMSR2_DATASET ${FORECAST_NAME}_amsr preprocess_regrid -v \ - -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + -sn "prediction" -ss "$FORECAST_START" -se "$FORECAST_END" \ $ERA5_DATASET ref.amsr.${HEMI}.nc ${FORECAST_NAME}_era5 preprocess_dataset $PROC_ARGS_ERA5 -v \ -r $ATMOS_PROC_DIR \ - -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + -sn "prediction" -ss "$FORECAST_START" -se "$FORECAST_END" \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ -sh $LAG \ ${PROCESSED_DATA_STORE}/${FORECAST_NAME}_era5/${SOURCE_CONFIG_NAME} ${FORECAST_NAME}_era5 -preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_amsr.json processed.${PROCESSED_DATASET}_era5.json +preprocess_add_processed -v $LOADER_CONFIGURATION processed.${FORECAST_NAME}_amsr.json processed.${FORECAST_NAME}_era5.json -preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icenet.data.meta:SinProcessor" -preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" -preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.nsidc:Masks" +preprocess_add_channel -v 
$LOADER_CONFIGURATION $AMSR2_DATASET sin "icenet.data.meta:SinProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $AMSR2_DATASET cos "icenet.data.meta:CosProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $AMSR2_DATASET land_map "icenet.data.masks.nsidc:Masks" icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $FORECAST_DATASET -FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.test[0]' | tr -d '"'`} +FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.prediction[0]' | tr -d '"'`} icenet_plot_input -p -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png -icenet_plot_input --outputs -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/outputs.${HEMI}.${FIRST_DATE}.png -icenet_plot_input --weights -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/weights.${HEMI}.${FIRST_DATE}.png