From 5cd09f347a5122d56465c672ef9e8524071b2f1f Mon Sep 17 00:00:00 2001 From: "Bryn N. Ubald" <55503826+bnubald@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:05:40 +0000 Subject: [PATCH 01/44] Fixes #14: Dynamically generate seed block in yaml --- .gitignore | 1 + ENVS.example | 11 +++++++++++ ensemble/predict.tmpl.yaml | 4 +--- ensemble/train.tmpl.yaml | 11 +---------- run_predict_ensemble.sh | 36 +++++++++++++++++++++++++++++++++++- run_train_ensemble.sh | 37 ++++++++++++++++++++++++++++++++++++- 6 files changed, 85 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 0d668a7..b49d126 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ loader.*.json *.npy *.out tmp.* +*.swp *test* *.png diff --git a/ENVS.example b/ENVS.example index 58a711f..0850cca 100644 --- a/ENVS.example +++ b/ENVS.example @@ -61,6 +61,12 @@ DEMO_PIPELINE_VAL_END="2022-2-14" DEMO_PIPELINE_TEST_START="2022-2-15" DEMO_PIPELINE_TEST_END="2022-2-28" +## +# Training & Prediction ensemble run seeds +# +SMALL_DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" +SMALL_DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" + ## # The prefix to use for training date ranges # @@ -83,6 +89,9 @@ VAL_END_NAME="${PREFIX}_VAL_END" TEST_START_NAME="${PREFIX}_TEST_START" TEST_END_NAME="${PREFIX}_TEST_END" +ENSEMBLE_TRAIN_SEEDS_NAME="${PREFIX}_ENSEMBLE_TRAIN_SEEDS" +ENSEMBLE_PREDICT_SEEDS_NAME="${PREFIX}_ENSEMBLE_PREDICT_SEEDS" + # What are we exporting export TRAIN_START=${!TRAIN_START_NAME} @@ -92,3 +101,5 @@ export VAL_END=${!VAL_END_NAME} export TEST_START=${!TEST_START_NAME} export TEST_END=${!TEST_END_NAME} +export ENSEMBLE_TRAIN_SEEDS=${!ENSEMBLE_TRAIN_SEEDS_NAME} +export ENSEMBLE_PREDICT_SEEDS=${!ENSEMBLE_PREDICT_SEEDS_NAME} diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index 2916b3c..f960174 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -40,9 +40,7 @@ ensemble: cmd: /usr/bin/ln -s ../../data pre_run: [] runs: - - seed: 42 - - seed: 46 - 
- seed: 45 + - seed: SEEDS post_run: [] post_batch: - name: execute diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 4f8af88..0bed0bb 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -44,16 +44,7 @@ ensemble: pre_batch: [] pre_run: [] runs: - - seed: 42 - - seed: 46 - - seed: 45 - - seed: 17 - - seed: 24 - - seed: 84 - - seed: 83 - - seed: 16 - - seed: 5 - - seed: 3 + - seed: SEEDS post_run: [] post_batch: - name: execute diff --git a/run_predict_ensemble.sh b/run_predict_ensemble.sh index 5e834d8..280cce6 100755 --- a/run_predict_ensemble.sh +++ b/run_predict_ensemble.sh @@ -14,8 +14,9 @@ ENSEMBLE_TARGET="slurm" ENSEMBLE_SWITCH="" ENSEMBLE_ARGS="" TRAIN_IDENT="" +ENSEMBLE_SEEDS_DEFAULT=42,46,45 -while getopts ":b:df:i:lm:p:x" opt; do +while getopts ":b:df:i:lm:p:r:x" opt; do case "$opt" in b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";; d) ENSEMBLE_TARGET="dummy";; @@ -24,6 +25,7 @@ while getopts ":b:df:i:lm:p:x" opt; do l) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_testset=false ";; m) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}mem=$OPTARG ";; p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_prep=$OPTARG ";; + r) ENSEMBLE_RUNS=$OPTARG ;; # Ensemble member run seed values x) DO_NOT_EXECUTE=1 esac done @@ -52,11 +54,43 @@ ln -s `realpath ${DATEFILE}` ensemble/${NAME}/predict_dates.csv PREDICT_CONFIG=`mktemp -p . --suffix ".predict"` +## +# Dynamically generate seeds for ensemble run. +# + +IFS="," read -ra SEEDS <<< "$ENSEMBLE_RUNS" + +# Check if seeds defined as CLI args (e.g. 
`-r 42,46`) +if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_PREDICT_SEEDS" + # Check if seeds defined in ENVS exported variables (else use defaults) + if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_SEEDS_DEFAULT" + fi +fi + +# Generate seed lines for yaml output +ENSEMBLE_SEEDS="" +COUNTER=0 +for seed in ${SEEDS[@]} +do + ENSEMBLE_SEEDS+=" - seed: "$seed + if [ $COUNTER -lt $((${#SEEDS[@]}-1)) ]; then + ENSEMBLE_SEEDS+="\n" + fi + ((COUNTER++)) +done + +echo "No. of ensemble members: " "${#SEEDS[@]}" +printf -v joined '%s,' "${SEEDS[@]}" +echo "Ensemble members: " "${joined%,}" + sed -r \ -e "s/NETWORK/${NETWORK}/g" \ -e "s/DATASET/${DATASET}/g" \ -e "s/LOADER/${LOADER}/g" \ -e "s/NAME/${NAME}/g" \ + -e "/\bSEEDS$/s/.*/${ENSEMBLE_SEEDS}/g" \ ensemble/predict.tmpl.yaml >$PREDICT_CONFIG COMMAND="model_ensemble $PREDICT_CONFIG $ENSEMBLE_TARGET $ENSEMBLE_SWITCH $ENSEMBLE_ARGS" diff --git a/run_train_ensemble.sh b/run_train_ensemble.sh index 3de09bb..2c86b12 100755 --- a/run_train_ensemble.sh +++ b/run_train_ensemble.sh @@ -9,13 +9,15 @@ fi echo "ARGS: $@" +# Defaults if not specified ENSEMBLE_TARGET="slurm" ENSEMBLE_SWITCH="" ENSEMBLE_ARGS="" ENSEMBLE_JOBS=1 ENSEMBLE_NTASKS=4 +ENSEMBLE_SEEDS_DEFAULT=42,46,45,17,24,84,83,16,5,3 -while getopts ":b:c:de:f:g:j:l:m:n:p:q:s:t:" opt; do +while getopts ":b:c:de:f:g:j:l:m:n:p:q:r:s:t:" opt; do case "$opt" in b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";; c) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}cluster=$OPTARG ";; @@ -29,6 +31,7 @@ while getopts ":b:c:de:f:g:j:l:m:n:p:q:s:t:" opt; do n) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}nodelist=$OPTARG ";; p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_prep=$OPTARG ";; q) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_queue=$OPTARG ";; + r) ENSEMBLE_RUNS=$OPTARG ;; # Ensemble member run seed values s) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_strategy=$OPTARG ";; t) ENSEMBLE_NTASKS=$OPTARG ;; esac @@ -47,12 +50,44 @@ NAME="$3" TRAIN_CONFIG=`mktemp -p . 
--suffix ".train"` +## +# Dynamically generate seeds for ensemble run. +# + +IFS="," read -ra SEEDS <<< "$ENSEMBLE_RUNS" + +# Check if seeds defined as CLI args (e.g. `-r 42,46`) +if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_TRAIN_SEEDS" + # Check if seeds defined in ENVS exported variables (else use defaults) + if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_SEEDS_DEFAULT" + fi +fi + +# Generate seed lines for yaml output +ENSEMBLE_SEEDS="" +COUNTER=0 +for seed in ${SEEDS[@]} +do + ENSEMBLE_SEEDS+=" - seed: "$seed + if [ $COUNTER -lt $((${#SEEDS[@]}-1)) ]; then + ENSEMBLE_SEEDS+="\n" + fi + ((COUNTER++)) +done + +echo "No. of ensemble members: " "${#SEEDS[@]}" +printf -v joined '%s,' "${SEEDS[@]}" +echo "Ensemble members: " "${joined%,}" + sed -r \ -e "s/NAME/${NAME}/g" \ -e "s/LOADER/${LOADER}/g" \ -e "s/DATASET/${DATASET}/g" \ -e "s/MAXJOBS/${ENSEMBLE_JOBS}/g" \ -e "s/NTASKS/${ENSEMBLE_NTASKS}/g" \ + -e "/\bSEEDS$/s/.*/${ENSEMBLE_SEEDS}/g" \ ensemble/train.tmpl.yaml >$TRAIN_CONFIG COMMAND="model_ensemble $TRAIN_CONFIG $ENSEMBLE_TARGET $ENSEMBLE_SWITCH $ENSEMBLE_ARGS" From 5e86f996b0252af0b81064b79f5c7f66b47cb2c2 Mon Sep 17 00:00:00 2001 From: "Bryn N. 
Ubald" <55503826+bnubald@users.noreply.github.com> Date: Thu, 7 Mar 2024 11:25:32 +0000 Subject: [PATCH 02/44] Fixes #36: Rename seed variables in ENVS.example --- ENVS.example | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ENVS.example b/ENVS.example index 0850cca..715d973 100644 --- a/ENVS.example +++ b/ENVS.example @@ -64,8 +64,8 @@ DEMO_PIPELINE_TEST_END="2022-2-28" ## # Training & Prediction ensemble run seeds # -SMALL_DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" -SMALL_DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" +DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" +DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" ## # The prefix to use for training date ranges From d2098e9b62db5640f5a9a3fa42dcb389bce2b957 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 22 Mar 2024 16:57:18 +0000 Subject: [PATCH 03/44] Dev #38: adding support for incremental HPC environment installation from cloned tensorflow-gpu --- environment.dawn.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 environment.dawn.yml diff --git a/environment.dawn.yml b/environment.dawn.yml new file mode 100644 index 0000000..2713e21 --- /dev/null +++ b/environment.dawn.yml @@ -0,0 +1,11 @@ +channels: + - conda-forge + - defaults +dependencies: + - cartopy + - eccodes + - ffmpeg + - hdf5 + - netcdf4 + - openh264 + - xarray From bef3bd7f9c59b7c1218250f3d52ed0a47555d027 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Sat, 23 Mar 2024 07:22:46 +0000 Subject: [PATCH 04/44] Dev #38: sticking in some stubs for dawn use --- ensemble/predict.tmpl.yaml | 2 +- ensemble/template/dawn.sh | 5 +++++ ensemble/train.tmpl.yaml | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 ensemble/template/dawn.sh diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index f960174..55ad34d 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -14,6 +14,7 @@ ensemble: - ../../../processed - ../../../results mem: 224gb + cluster: pvc 
pre_process: [] post_process: [] @@ -24,7 +25,6 @@ ensemble: - icenet_predict.sh.j2 email: someone@example.com job_file: icenet_predict.sh - cluster: short nodes: 1 ntasks: 8 length: 00:30:00 diff --git a/ensemble/template/dawn.sh b/ensemble/template/dawn.sh new file mode 100644 index 0000000..e73154b --- /dev/null +++ b/ensemble/template/dawn.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +module purge +module load default-dawn +module load intelpython-conda diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 0bed0bb..0a7e1c0 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -17,6 +17,8 @@ ensemble: - ../../../results gpus: 1 mem: 128gb + cluster: pvc + nodes: 1 pre_process: - name: execute @@ -31,8 +33,6 @@ ensemble: - icenet_train.sh.j2 email: someone@example.com job_file: icenet_train.sh - cluster: gpu - nodes: 1 ntasks: NTASKS length: 4-00:00:00 maxruns: 5 From 2fd5ae4e99306fee0da5bb2d76a751663dcfe45a Mon Sep 17 00:00:00 2001 From: "Bryn N. Ubald" <55503826+bnubald@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:05:40 +0000 Subject: [PATCH 05/44] Fixes #14: Dynamically generate seed block in yaml --- .gitignore | 1 + ENVS.example | 11 +++++++++++ ensemble/predict.tmpl.yaml | 4 +--- ensemble/train.tmpl.yaml | 11 +---------- run_predict_ensemble.sh | 36 +++++++++++++++++++++++++++++++++++- run_train_ensemble.sh | 37 ++++++++++++++++++++++++++++++++++++- 6 files changed, 85 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 0d668a7..b49d126 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ loader.*.json *.npy *.out tmp.* +*.swp *test* *.png diff --git a/ENVS.example b/ENVS.example index 58a711f..0850cca 100644 --- a/ENVS.example +++ b/ENVS.example @@ -61,6 +61,12 @@ DEMO_PIPELINE_VAL_END="2022-2-14" DEMO_PIPELINE_TEST_START="2022-2-15" DEMO_PIPELINE_TEST_END="2022-2-28" +## +# Training & Prediction ensemble run seeds +# +SMALL_DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" 
+SMALL_DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" + ## # The prefix to use for training date ranges # @@ -83,6 +89,9 @@ VAL_END_NAME="${PREFIX}_VAL_END" TEST_START_NAME="${PREFIX}_TEST_START" TEST_END_NAME="${PREFIX}_TEST_END" +ENSEMBLE_TRAIN_SEEDS_NAME="${PREFIX}_ENSEMBLE_TRAIN_SEEDS" +ENSEMBLE_PREDICT_SEEDS_NAME="${PREFIX}_ENSEMBLE_PREDICT_SEEDS" + # What are we exporting export TRAIN_START=${!TRAIN_START_NAME} @@ -92,3 +101,5 @@ export VAL_END=${!VAL_END_NAME} export TEST_START=${!TEST_START_NAME} export TEST_END=${!TEST_END_NAME} +export ENSEMBLE_TRAIN_SEEDS=${!ENSEMBLE_TRAIN_SEEDS_NAME} +export ENSEMBLE_PREDICT_SEEDS=${!ENSEMBLE_PREDICT_SEEDS_NAME} diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index 2916b3c..f960174 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -40,9 +40,7 @@ ensemble: cmd: /usr/bin/ln -s ../../data pre_run: [] runs: - - seed: 42 - - seed: 46 - - seed: 45 + - seed: SEEDS post_run: [] post_batch: - name: execute diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 4f8af88..0bed0bb 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -44,16 +44,7 @@ ensemble: pre_batch: [] pre_run: [] runs: - - seed: 42 - - seed: 46 - - seed: 45 - - seed: 17 - - seed: 24 - - seed: 84 - - seed: 83 - - seed: 16 - - seed: 5 - - seed: 3 + - seed: SEEDS post_run: [] post_batch: - name: execute diff --git a/run_predict_ensemble.sh b/run_predict_ensemble.sh index 5e834d8..280cce6 100755 --- a/run_predict_ensemble.sh +++ b/run_predict_ensemble.sh @@ -14,8 +14,9 @@ ENSEMBLE_TARGET="slurm" ENSEMBLE_SWITCH="" ENSEMBLE_ARGS="" TRAIN_IDENT="" +ENSEMBLE_SEEDS_DEFAULT=42,46,45 -while getopts ":b:df:i:lm:p:x" opt; do +while getopts ":b:df:i:lm:p:r:x" opt; do case "$opt" in b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";; d) ENSEMBLE_TARGET="dummy";; @@ -24,6 +25,7 @@ while getopts ":b:df:i:lm:p:x" opt; do l) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_testset=false ";; m) 
ENSEMBLE_ARGS="${ENSEMBLE_ARGS}mem=$OPTARG ";; p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_prep=$OPTARG ";; + r) ENSEMBLE_RUNS=$OPTARG ;; # Ensemble member run seed values x) DO_NOT_EXECUTE=1 esac done @@ -52,11 +54,43 @@ ln -s `realpath ${DATEFILE}` ensemble/${NAME}/predict_dates.csv PREDICT_CONFIG=`mktemp -p . --suffix ".predict"` +## +# Dynamically generate seeds for ensemble run. +# + +IFS="," read -ra SEEDS <<< "$ENSEMBLE_RUNS" + +# Check if seeds defined as CLI args (e.g. `-r 42,46`) +if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_PREDICT_SEEDS" + # Check if seeds defined in ENVS exported variables (else use defaults) + if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_SEEDS_DEFAULT" + fi +fi + +# Generate seed lines for yaml output +ENSEMBLE_SEEDS="" +COUNTER=0 +for seed in ${SEEDS[@]} +do + ENSEMBLE_SEEDS+=" - seed: "$seed + if [ $COUNTER -lt $((${#SEEDS[@]}-1)) ]; then + ENSEMBLE_SEEDS+="\n" + fi + ((COUNTER++)) +done + +echo "No. of ensemble members: " "${#SEEDS[@]}" +printf -v joined '%s,' "${SEEDS[@]}" +echo "Ensemble members: " "${joined%,}" + sed -r \ -e "s/NETWORK/${NETWORK}/g" \ -e "s/DATASET/${DATASET}/g" \ -e "s/LOADER/${LOADER}/g" \ -e "s/NAME/${NAME}/g" \ + -e "/\bSEEDS$/s/.*/${ENSEMBLE_SEEDS}/g" \ ensemble/predict.tmpl.yaml >$PREDICT_CONFIG COMMAND="model_ensemble $PREDICT_CONFIG $ENSEMBLE_TARGET $ENSEMBLE_SWITCH $ENSEMBLE_ARGS" diff --git a/run_train_ensemble.sh b/run_train_ensemble.sh index 3de09bb..2c86b12 100755 --- a/run_train_ensemble.sh +++ b/run_train_ensemble.sh @@ -9,13 +9,15 @@ fi echo "ARGS: $@" +# Defaults if not specified ENSEMBLE_TARGET="slurm" ENSEMBLE_SWITCH="" ENSEMBLE_ARGS="" ENSEMBLE_JOBS=1 ENSEMBLE_NTASKS=4 +ENSEMBLE_SEEDS_DEFAULT=42,46,45,17,24,84,83,16,5,3 -while getopts ":b:c:de:f:g:j:l:m:n:p:q:s:t:" opt; do +while getopts ":b:c:de:f:g:j:l:m:n:p:q:r:s:t:" opt; do case "$opt" in b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";; c) 
ENSEMBLE_ARGS="${ENSEMBLE_ARGS}cluster=$OPTARG ";; @@ -29,6 +31,7 @@ while getopts ":b:c:de:f:g:j:l:m:n:p:q:s:t:" opt; do n) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}nodelist=$OPTARG ";; p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_prep=$OPTARG ";; q) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_queue=$OPTARG ";; + r) ENSEMBLE_RUNS=$OPTARG ;; # Ensemble member run seed values s) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_strategy=$OPTARG ";; t) ENSEMBLE_NTASKS=$OPTARG ;; esac @@ -47,12 +50,44 @@ NAME="$3" TRAIN_CONFIG=`mktemp -p . --suffix ".train"` +## +# Dynamically generate seeds for ensemble run. +# + +IFS="," read -ra SEEDS <<< "$ENSEMBLE_RUNS" + +# Check if seeds defined as CLI args (e.g. `-r 42,46`) +if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_TRAIN_SEEDS" + # Check if seeds defined in ENVS exported variables (else use defaults) + if [ ${#SEEDS[@]} -eq 0 ]; then + IFS="," read -ra SEEDS <<< "$ENSEMBLE_SEEDS_DEFAULT" + fi +fi + +# Generate seed lines for yaml output +ENSEMBLE_SEEDS="" +COUNTER=0 +for seed in ${SEEDS[@]} +do + ENSEMBLE_SEEDS+=" - seed: "$seed + if [ $COUNTER -lt $((${#SEEDS[@]}-1)) ]; then + ENSEMBLE_SEEDS+="\n" + fi + ((COUNTER++)) +done + +echo "No. of ensemble members: " "${#SEEDS[@]}" +printf -v joined '%s,' "${SEEDS[@]}" +echo "Ensemble members: " "${joined%,}" + sed -r \ -e "s/NAME/${NAME}/g" \ -e "s/LOADER/${LOADER}/g" \ -e "s/DATASET/${DATASET}/g" \ -e "s/MAXJOBS/${ENSEMBLE_JOBS}/g" \ -e "s/NTASKS/${ENSEMBLE_NTASKS}/g" \ + -e "/\bSEEDS$/s/.*/${ENSEMBLE_SEEDS}/g" \ ensemble/train.tmpl.yaml >$TRAIN_CONFIG COMMAND="model_ensemble $TRAIN_CONFIG $ENSEMBLE_TARGET $ENSEMBLE_SWITCH $ENSEMBLE_ARGS" From 100d399a2541b5db387f6196376783796187841c Mon Sep 17 00:00:00 2001 From: "Bryn N. 
Ubald" <55503826+bnubald@users.noreply.github.com> Date: Thu, 7 Mar 2024 11:25:32 +0000 Subject: [PATCH 06/44] Fixes #36: Rename seed variables in ENVS.example --- ENVS.example | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ENVS.example b/ENVS.example index 0850cca..715d973 100644 --- a/ENVS.example +++ b/ENVS.example @@ -64,8 +64,8 @@ DEMO_PIPELINE_TEST_END="2022-2-28" ## # Training & Prediction ensemble run seeds # -SMALL_DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" -SMALL_DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" +DEMO_PIPELINE_ENSEMBLE_TRAIN_SEEDS="42,46" +DEMO_PIPELINE_ENSEMBLE_PREDICT_SEEDS="42,46" ## # The prefix to use for training date ranges From 3a15c62548fd58095517c8edf3089f4a77465bcd Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 22 Mar 2024 16:57:18 +0000 Subject: [PATCH 07/44] Dev #38: adding support for incremental HPC environment installation from cloned tensorflow-gpu --- environment.dawn.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 environment.dawn.yml diff --git a/environment.dawn.yml b/environment.dawn.yml new file mode 100644 index 0000000..2713e21 --- /dev/null +++ b/environment.dawn.yml @@ -0,0 +1,11 @@ +channels: + - conda-forge + - defaults +dependencies: + - cartopy + - eccodes + - ffmpeg + - hdf5 + - netcdf4 + - openh264 + - xarray From a8f1193477e2c7c7949db51ee55d772b78ee605f Mon Sep 17 00:00:00 2001 From: James Byrne Date: Sat, 23 Mar 2024 07:22:46 +0000 Subject: [PATCH 08/44] Dev #38: sticking in some stubs for dawn use --- ensemble/predict.tmpl.yaml | 2 +- ensemble/template/dawn.sh | 5 +++++ ensemble/train.tmpl.yaml | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 ensemble/template/dawn.sh diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index f960174..55ad34d 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -14,6 +14,7 @@ ensemble: - ../../../processed - ../../../results mem: 224gb + cluster: pvc 
pre_process: [] post_process: [] @@ -24,7 +25,6 @@ ensemble: - icenet_predict.sh.j2 email: someone@example.com job_file: icenet_predict.sh - cluster: short nodes: 1 ntasks: 8 length: 00:30:00 diff --git a/ensemble/template/dawn.sh b/ensemble/template/dawn.sh new file mode 100644 index 0000000..e73154b --- /dev/null +++ b/ensemble/template/dawn.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +module purge +module load default-dawn +module load intelpython-conda diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 0bed0bb..0a7e1c0 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -17,6 +17,8 @@ ensemble: - ../../../results gpus: 1 mem: 128gb + cluster: pvc + nodes: 1 pre_process: - name: execute @@ -31,8 +33,6 @@ ensemble: - icenet_train.sh.j2 email: someone@example.com job_file: icenet_train.sh - cluster: gpu - nodes: 1 ntasks: NTASKS length: 4-00:00:00 maxruns: 5 From adb234f6b64f03fa35b1d72f04bc96de0430b782 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 28 Mar 2024 15:14:15 +0000 Subject: [PATCH 09/44] Dev #39: highlighting what the intention is for specifying basic pip dependencies --- environment.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/environment.yml b/environment.yml index 17bb914..937f0e6 100644 --- a/environment.yml +++ b/environment.yml @@ -13,3 +13,6 @@ dependencies: - openh264 - python=3.8 - xarray + pip: + - icenet==0.2.7 + - model-ensembler From 3e4f9280ba997993a5530177b461e6db712ed226 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 12 Apr 2024 16:12:03 +0100 Subject: [PATCH 10/44] Removing explict icenet dependency, that's not necessary under pip (and certainly shouldn't be pinned --- environment.dawn.yml | 2 ++ environment.yml | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.dawn.yml b/environment.dawn.yml index 2713e21..29ea4b7 100644 --- a/environment.dawn.yml +++ b/environment.dawn.yml @@ -9,3 +9,5 @@ dependencies: - netcdf4 - openh264 - xarray + pip: + - 
model-ensembler diff --git a/environment.yml b/environment.yml index 937f0e6..9a67f24 100644 --- a/environment.yml +++ b/environment.yml @@ -14,5 +14,4 @@ dependencies: - python=3.8 - xarray pip: - - icenet==0.2.7 - model-ensembler From 0d5bf55fe0bf2606523728f7a1ab39418a9df149 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 12 Apr 2024 17:15:52 +0100 Subject: [PATCH 11/44] Fixes #39: sorted this out properly --- environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 9a67f24..2e454cb 100644 --- a/environment.yml +++ b/environment.yml @@ -12,6 +12,7 @@ dependencies: - netcdf4 - openh264 - python=3.8 + - pip - xarray - pip: + - pip: - model-ensembler From 81ca63ca996a8a98da4cf86b9aac872c8544181e Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 19 Jun 2024 14:38:43 +0100 Subject: [PATCH 12/44] Version of python bump --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 2e454cb..b7912b6 100644 --- a/environment.yml +++ b/environment.yml @@ -11,7 +11,7 @@ dependencies: - ipykernel - netcdf4 - openh264 - - python=3.8 + - python=3.9 - pip - xarray - pip: From fa29ce53d0a69bbc627b7e724a9c59f328f1fafe Mon Sep 17 00:00:00 2001 From: James Byrne Date: Mon, 22 Jul 2024 20:46:19 +0100 Subject: [PATCH 13/44] Development rationalisation to support 0.4 development --- clean_pipeline.sh => script.backup/clean_pipeline.sh | 0 loader_test_dates.sh => script.backup/loader_test_dates.sh | 0 produce_op_assets.sh => script.backup/produce_op_assets.sh | 0 run_check.sh => script.backup/run_check.sh | 0 run_daily.sh => script.backup/run_daily.sh | 0 run_era5_forecast.sh => script.backup/run_era5_forecast.sh | 0 run_forecast_plots.sh => script.backup/run_forecast_plots.sh | 0 run_input_plots.sh => script.backup/run_input_plots.sh | 0 run_predict_ensemble.sh => script.backup/run_predict_ensemble.sh | 0 run_prediction.sh => 
script.backup/run_prediction.sh | 0 run_train_ensemble.sh => script.backup/run_train_ensemble.sh | 0 run_validation.sh => script.backup/run_validation.sh | 0 train_analysis.sh => script.backup/train_analysis.sh | 0 13 files changed, 0 insertions(+), 0 deletions(-) rename clean_pipeline.sh => script.backup/clean_pipeline.sh (100%) rename loader_test_dates.sh => script.backup/loader_test_dates.sh (100%) rename produce_op_assets.sh => script.backup/produce_op_assets.sh (100%) rename run_check.sh => script.backup/run_check.sh (100%) rename run_daily.sh => script.backup/run_daily.sh (100%) rename run_era5_forecast.sh => script.backup/run_era5_forecast.sh (100%) rename run_forecast_plots.sh => script.backup/run_forecast_plots.sh (100%) rename run_input_plots.sh => script.backup/run_input_plots.sh (100%) rename run_predict_ensemble.sh => script.backup/run_predict_ensemble.sh (100%) rename run_prediction.sh => script.backup/run_prediction.sh (100%) rename run_train_ensemble.sh => script.backup/run_train_ensemble.sh (100%) rename run_validation.sh => script.backup/run_validation.sh (100%) rename train_analysis.sh => script.backup/train_analysis.sh (100%) diff --git a/clean_pipeline.sh b/script.backup/clean_pipeline.sh similarity index 100% rename from clean_pipeline.sh rename to script.backup/clean_pipeline.sh diff --git a/loader_test_dates.sh b/script.backup/loader_test_dates.sh similarity index 100% rename from loader_test_dates.sh rename to script.backup/loader_test_dates.sh diff --git a/produce_op_assets.sh b/script.backup/produce_op_assets.sh similarity index 100% rename from produce_op_assets.sh rename to script.backup/produce_op_assets.sh diff --git a/run_check.sh b/script.backup/run_check.sh similarity index 100% rename from run_check.sh rename to script.backup/run_check.sh diff --git a/run_daily.sh b/script.backup/run_daily.sh similarity index 100% rename from run_daily.sh rename to script.backup/run_daily.sh diff --git a/run_era5_forecast.sh 
b/script.backup/run_era5_forecast.sh similarity index 100% rename from run_era5_forecast.sh rename to script.backup/run_era5_forecast.sh diff --git a/run_forecast_plots.sh b/script.backup/run_forecast_plots.sh similarity index 100% rename from run_forecast_plots.sh rename to script.backup/run_forecast_plots.sh diff --git a/run_input_plots.sh b/script.backup/run_input_plots.sh similarity index 100% rename from run_input_plots.sh rename to script.backup/run_input_plots.sh diff --git a/run_predict_ensemble.sh b/script.backup/run_predict_ensemble.sh similarity index 100% rename from run_predict_ensemble.sh rename to script.backup/run_predict_ensemble.sh diff --git a/run_prediction.sh b/script.backup/run_prediction.sh similarity index 100% rename from run_prediction.sh rename to script.backup/run_prediction.sh diff --git a/run_train_ensemble.sh b/script.backup/run_train_ensemble.sh similarity index 100% rename from run_train_ensemble.sh rename to script.backup/run_train_ensemble.sh diff --git a/run_validation.sh b/script.backup/run_validation.sh similarity index 100% rename from run_validation.sh rename to script.backup/run_validation.sh diff --git a/train_analysis.sh b/script.backup/train_analysis.sh similarity index 100% rename from train_analysis.sh rename to script.backup/train_analysis.sh From 515333de168ac57e279f84d7cd5250449b7870c9 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Sat, 17 Aug 2024 00:20:57 +0100 Subject: [PATCH 14/44] Removing unnecessary pinning --- environment.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index b7912b6..80cc4cf 100644 --- a/environment.yml +++ b/environment.yml @@ -1,10 +1,9 @@ channels: - conda-forge - - defaults dependencies: - cartopy - - cudatoolkit=11.2 - - cudnn=8.1.0 + - cudatoolkit + - cudnn - eccodes - ffmpeg - hdf5 From 0d91837c953f303eda929d4a4f72393f79831d96 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Sat, 17 Aug 2024 00:22:27 +0100 Subject: [PATCH 
15/44] Sorting out new preprocess config gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b49d126..0faae3c 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ __pycache__/ /wandb/ *test*.json dataset_config.*.json +processed.*.json loader.*.json *.csv *.err From 735cbf7efbd167c43d20fb2eb17be0f49804d0a9 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Tue, 20 Aug 2024 21:10:53 +0100 Subject: [PATCH 16/44] Dev #53: reorganising structure of scripts --- .gitignore | 1 + ENVS | 1 - condense.slurm.sh | 10 -- .../run_check.sh => dataset_check.sh | 0 .../run_forecast_plots.sh => plot_forecast.sh | 0 .../run_input_plots.sh => plot_inputs.sh | 0 .../run_validation.sh => plot_validations.sh | 16 +-- prep_prediction_data.sh | 11 ++ prep_training_data.sh | 102 ++++++++++++++++++ ...oduce_op_assets.sh => process_op_assets.sh | 0 run_data.sh | 40 ------- ...ict_ensemble.sh => run_predict_ensemble.sh | 0 ...train_ensemble.sh => run_train_ensemble.sh | 0 script.backup/create_masks_plots.txt | 19 ++++ 14 files changed, 141 insertions(+), 59 deletions(-) delete mode 120000 ENVS delete mode 100755 condense.slurm.sh rename script.backup/run_check.sh => dataset_check.sh (100%) rename script.backup/run_forecast_plots.sh => plot_forecast.sh (100%) rename script.backup/run_input_plots.sh => plot_inputs.sh (100%) rename script.backup/run_validation.sh => plot_validations.sh (83%) create mode 100755 prep_prediction_data.sh create mode 100755 prep_training_data.sh rename script.backup/produce_op_assets.sh => process_op_assets.sh (100%) delete mode 100755 run_data.sh rename script.backup/run_predict_ensemble.sh => run_predict_ensemble.sh (100%) rename script.backup/run_train_ensemble.sh => run_train_ensemble.sh (100%) create mode 100644 script.backup/create_masks_plots.txt diff --git a/.gitignore b/.gitignore index 0faae3c..f89f39b 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ tmp.* *test* *.png +!ENVS !ENVS.example 
ENVS.* diff --git a/ENVS b/ENVS deleted file mode 120000 index 73248d3..0000000 --- a/ENVS +++ /dev/null @@ -1 +0,0 @@ -ENVS.example \ No newline at end of file diff --git a/condense.slurm.sh b/condense.slurm.sh deleted file mode 100755 index 99f021e..0000000 --- a/condense.slurm.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -. ENVS - -conda activate $ICENET_CONDA - -echo "START $1 $2 $3: `date +%T`" -icenet_process_condense -v $1 $2 $3 >$ICENET_HOME/logs/condense.$1.$2.$3.log 2>&1 -echo "END $1 $2 $3 `date +%T`" - diff --git a/script.backup/run_check.sh b/dataset_check.sh similarity index 100% rename from script.backup/run_check.sh rename to dataset_check.sh diff --git a/script.backup/run_forecast_plots.sh b/plot_forecast.sh similarity index 100% rename from script.backup/run_forecast_plots.sh rename to plot_forecast.sh diff --git a/script.backup/run_input_plots.sh b/plot_inputs.sh similarity index 100% rename from script.backup/run_input_plots.sh rename to plot_inputs.sh diff --git a/script.backup/run_validation.sh b/plot_validations.sh similarity index 83% rename from script.backup/run_validation.sh rename to plot_validations.sh index a0ad829..17c6d36 100755 --- a/script.backup/run_validation.sh +++ b/plot_validations.sh @@ -16,7 +16,7 @@ if [ $# -lt 2 ] || [ "$1" == "-h" ]; then echo -e "\nThe script will generate several plots which can be used to validate the forecast (and also to compare with ECMWF)" echo "The plots to analyse the performance of the forecasts will be saved to " echo "and the plots to compare performance with ECMWF will be saved to /ECMWF_comp" - echo "Run \"run_forecast_plots.sh -h\" for more details of what the plots generated are" + echo "Run \"plot_forecast.sh -h\" for more details of what the plots generated are" exit 1 fi @@ -74,20 +74,20 @@ for element in "${METRICS[@]}" do if [ "${element}" == "binacc" ]; then for THRESH in ${THRESHOLDS[@]}; do - ./run_forecast_plots.sh -m ${element} $REGION -v -l -t $THRESH \ + ./plot_forecast.sh 
-m ${element} $REGION -v -l -t $THRESH \ -o $OUTPUT_DIR $FORECAST $HEMI - ./run_forecast_plots.sh -m ${element} $REGION -e -v -l -t $THRESH \ + ./plot_forecast.sh -m ${element} $REGION -e -v -l -t $THRESH \ -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI done elif [ "${element}" == "sie" ]; then for THRESH in ${THRESHOLDS[@]}; do - ./run_forecast_plots.sh -m ${element} $REGION -v -l -t $THRESH $GRID_AREA_SIZE \ + ./plot_forecast.sh -m ${element} $REGION -v -l -t $THRESH $GRID_AREA_SIZE \ -o $OUTPUT_DIR $FORECAST $HEMI - ./run_forecast_plots.sh -m ${element} $REGION -e -v -l -t $THRESH $GRID_AREA_SIZE \ + ./plot_forecast.sh -m ${element} $REGION -e -v -l -t $THRESH $GRID_AREA_SIZE \ -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI done elif [ "${element}" == "sic" ]; then - ./run_forecast_plots.sh -m ${element} $REGION -v \ + ./plot_forecast.sh -m ${element} $REGION -v \ -o $OUTPUT_DIR $FORECAST $HEMI else if [ "${element}" == "mae" ]; then @@ -97,9 +97,9 @@ for element in "${METRICS[@]}" elif [ "${element}" == "rmse" ]; then LOGFILE="${RMSE_LOG}" fi - ./run_forecast_plots.sh -m ${element} $REGION -v -l \ + ./plot_forecast.sh -m ${element} $REGION -v -l \ -o $OUTPUT_DIR $FORECAST $HEMI - ./run_forecast_plots.sh -m ${element} $REGION -e -v -l \ + ./plot_forecast.sh -m ${element} $REGION -e -v -l \ -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI fi done diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh new file mode 100755 index 0000000..bf0ca5f --- /dev/null +++ b/prep_prediction_data.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +source ENVS + +conda activate $ICENET_CONDA + +set -o pipefail +set -eu + + + diff --git a/prep_training_data.sh b/prep_training_data.sh new file mode 100755 index 0000000..c71a52d --- /dev/null +++ b/prep_training_data.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +source ENVS + +conda activate $ICENET_CONDA + +set -o pipefail +set -eu + +if [ $# -lt 1 ] || [ "$1" == "-h" ]; then + echo "Usage $0 " +fi + +HEMI="$1" + +export 
OSISAF_DATASET="data/osisaf/dataset_config.month.hemi.north.json" # Persistent dataset +export ERA5_DATASET="data/era5/dataset_config.month.hemi.north.json" # Persistent dataset +export GROUND_TRUTH_SIC="osi_sic" # Ephemeral dataset +export GROUND_TRUTH_SIC_DSC="data/$GROUND_TRUTH_SIC/dataset_config.month.hemi.north.json" +export ATMOS_PROC="era5_osi" # Ephemeral dataset +export ATMOS_PROC_DSC="data/$ATMOS_PROC/dataset_config.month.hemi.north.json" +export PROCESSED_DATASET="test" +export LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" +export DATASET_NAME="test_net_ds" + + +source ENVS + + + + + +( + for HEMI in north south; do echo download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS; done + for HEMI in north south; do echo download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS; done + for HEMI in north south; do echo download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS; done + + for HEMI in north south; do echo download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS; done +) + + +source ENVS + +## Process + +preprocess_loader_init -v $PROCESSED_DATASET + +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" + * TODO: masks is not compatible with dual hemisphere in this form! +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" + +preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC +# TODO: didn't seemingly detect missing months? data/osi_sic/month/hemi.north/siconca.missing_days.csv +# TODO: undoubtedly need to include the known invalid dates - added these to the osisaf downloader +preprocess_missing_spatial -m processed.masks.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC +# TODO: Interpolation failing in all cases? 
+# TODO: this undoubtedly explains the stray nans present in dataset generation + +preprocess_dataset $PROC_ARGS_SIC -v \ + -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ + -i "icenet.data.processors.osisaf:SICPreProcessor" \ + $GROUND_TRUTH_SIC_DSC ${PROCESSED_DATASET}_osisaf +# TODO: plenty of nans contained in here, we need better assesments + +# TODO: icenet_osisaf_ref -v data/osisaf/hemi.north/siconca/2012.nc ref.osisaf.north.nc +# this needs to: +# - ds = xr.open_dataset("./data/osisaf/month/hemi.north/siconca/1978.nc") +# - ds = ds.drop_vars(["raw_ice_conc_values", "smearing_standard_error", "algorithm_standard_error"]) +# - cube = ds.siconca.to_iris() +# - cube.coord('projection_x_coordinate').convert_units('meters') +# - cube.coord('projection_y_coordinate').convert_units('meters') +# - iris.save("ref.osisaf.nc") + + +preprocess_regrid -v $ERA5_DATASET ref.osisaf.nc $ATMOS_PROC +# TODO: get the batcher back in place for multiprocessing this +# TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in +preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.nc + * TODO: get the batcher back in place for multiprocessing this + +preprocess_dataset $PROC_ARGS_ERA5 -v \ + -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ + -i "icenet.data.processors.cds:ERA5PreProcessor" \ + $ATMOS_PROC_DSC ${PROCESSED_DATASET}_era5 + * TODO: naive copy of "./data/era5_osi/month/hemi.north/uas/2024.nc" results in mistaken loading - not regridded + * TODO: dask multiprocessing cluster with task batcher across multiple variables would be sensible + +preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_osisaf.json processed.${PROCESSED_DATASET}_era5.json + +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icenet.data.meta:SinProcessor" +preprocess_add_channel 
-v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.osisaf:Masks" + +icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME + * TODO: FIXME in here to override the creation of nan containing sets + +icenet_plot_input -p -v dataset_config.test_net_ds.json 2021-04-30 ./plot/input.png +icenet_plot_input --outputs -v dataset_config.test_net_ds.json 2021-04-30 ./plot/outputs.png +icenet_plot_input --weights -v dataset_config.test_net_ds.json 2021-04-30 ./plot/weights.png + +icenet_train_tensorflow -b 1 -e 5 -f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 \ No newline at end of file diff --git a/script.backup/produce_op_assets.sh b/process_op_assets.sh similarity index 100% rename from script.backup/produce_op_assets.sh rename to process_op_assets.sh diff --git a/run_data.sh b/run_data.sh deleted file mode 100755 index 1c90194..0000000 --- a/run_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -source ENVS - -conda activate $ICENET_CONDA - -set -o pipefail -set -eu - -if [ $# -lt 1 ] || [ "$1" == "-h" ]; then - echo "Usage $0 [batch_size] [workers]" -fi - -DATANAME="$TRAIN_DATA_NAME" -HEMI="$1" -BATCH_SIZE=${2:-2} -WORKERS=${3:-8} - -if [ ! -f loader.${DATANAME}_${HEMI}.json ]; then - [ ! -z "$PROC_ARGS_ERA5" ] && icenet_process_era5 -v -l $LAG \ - $PROC_ARGS_ERA5 \ - -ns $TRAIN_START -ne $TRAIN_END -vs $VAL_START -ve $VAL_END -ts $TEST_START -te $TEST_END \ - ${DATANAME}_${HEMI} $HEMI - - [ ! -z "$PROC_ARGS_ORAS5" ] && icenet_process_oras5 -v -l $LAG \ - $PROC_ARGS_ORAS5 \ - -ns $TRAIN_START -ne $TRAIN_END -vs $VAL_START -ve $VAL_END -ts $TEST_START -te $TEST_END \ - ${DATANAME}_${HEMI} $HEMI - - [ ! 
-z "$PROC_ARGS_SIC" ] && icenet_process_sic -v -l $LAG \ - $PROC_ARGS_SIC \ - -ns $TRAIN_START -ne $TRAIN_END -vs $VAL_START -ve $VAL_END -ts $TEST_START -te $TEST_END \ - ${DATANAME}_${HEMI} $HEMI - - icenet_process_metadata ${DATANAME}_${HEMI} $HEMI -else - echo "Skipping preprocessing as loader.${DATANAME}_${HEMI}.json already exists..." -fi - -icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fd $FORECAST_DAYS -l $LAG ${DATANAME}_${HEMI} $HEMI diff --git a/script.backup/run_predict_ensemble.sh b/run_predict_ensemble.sh similarity index 100% rename from script.backup/run_predict_ensemble.sh rename to run_predict_ensemble.sh diff --git a/script.backup/run_train_ensemble.sh b/run_train_ensemble.sh similarity index 100% rename from script.backup/run_train_ensemble.sh rename to run_train_ensemble.sh diff --git a/script.backup/create_masks_plots.txt b/script.backup/create_masks_plots.txt new file mode 100644 index 0000000..25d6939 --- /dev/null +++ b/script.backup/create_masks_plots.txt @@ -0,0 +1,19 @@ +import pandas as pd +import matplotlib.pyplot as plt +from icenet.data.masks.osisaf import Masks +from download_toolbox.interface import get_dataset_config_implementation +dsc = get_dataset_config_implementation("data/osi_sic/dataset_config.month.hemi.north.json") +m = Masks(dsc) +m.polarhole_filename + +for i in range(1,13): + plt.contourf(m.active_grid_cell(pd.Timestamp("2020-{}-1".format(i)))) + plt.savefig("agcm{}.png".format(i)) + + +plt.contourf(m.land()) +plt.savefig("land.png") +for i in range(1975, 2026, 10): + plt.contourf(m.polarhole(pd.Timestamp("{}-1-1".format(i)))) + plt.savefig("polarhole.{}.png".format(i)) + From 1316e6a2201069745a7ea4f2bb942dea1d77c278 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Tue, 20 Aug 2024 21:12:02 +0100 Subject: [PATCH 17/44] Messed up gitignore, getting rid of ENVS --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f89f39b..0c5445a 100644 --- a/.gitignore 
+++ b/.gitignore @@ -34,7 +34,7 @@ tmp.* *test* *.png -!ENVS +ENVS !ENVS.example ENVS.* From 8d39b50edb2d5d3a5d7ccd048c4e97f0b592c549 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 21 Aug 2024 09:02:45 +0100 Subject: [PATCH 18/44] Dev #53: Adapted for new structure of environmental-forecasting training data preparation --- prep_training_data.sh | 104 ++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 49 deletions(-) diff --git a/prep_training_data.sh b/prep_training_data.sh index c71a52d..dd63262 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -8,62 +8,67 @@ set -o pipefail set -eu if [ $# -lt 1 ] || [ "$1" == "-h" ]; then - echo "Usage $0 " + echo "Usage $0 [download=0|1]" fi HEMI="$1" - -export OSISAF_DATASET="data/osisaf/dataset_config.month.hemi.north.json" # Persistent dataset -export ERA5_DATASET="data/era5/dataset_config.month.hemi.north.json" # Persistent dataset -export GROUND_TRUTH_SIC="osi_sic" # Ephemeral dataset -export GROUND_TRUTH_SIC_DSC="data/$GROUND_TRUTH_SIC/dataset_config.month.hemi.north.json" -export ATMOS_PROC="era5_osi" # Ephemeral dataset -export ATMOS_PROC_DSC="data/$ATMOS_PROC/dataset_config.month.hemi.north.json" -export PROCESSED_DATASET="test" -export LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" -export DATASET_NAME="test_net_ds" - - -source ENVS - - - - - -( - for HEMI in north south; do echo download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS; done - for HEMI in north south; do echo download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS; done - for HEMI in north south; do echo download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS; done - - for HEMI in north south; do echo download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS; done -) - - -source ENVS - -## Process - +DOWNLOAD=$2 + +# download-toolbox integration +# This updates our source +if [ $DOWNLOAD -eq 1 ]; then + download_amsr2 $DATA_ARGS $HEMI 
$AMSR2_DATES $AMSR2_VAR_ARGS + download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS + download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS + download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS +fi 2>&1 | tee logs/download.log + +DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" + +# preprocess-toolbox integration +# Persistent datasets from the source data store, wherever that is +OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" +ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" + +# Create links to the central data store datasets for easier "mapping" +[ [ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf +[ [ ! -e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 +# TODO: AMSR +# TODO: CMIP + +GROUND_TRUTH_SIC="osi_sic" +ATMOS_PROC="era5_osi" + +# Our processed dataset configurations, we localise data when regridding and reprojecting +GROUND_TRUTH_SIC_DSC="data/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" +ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" + +PROCESSED_DATASET="training" +LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" +DATASET_NAME="tfdata_cache" + +## Workflow preprocess_loader_init -v $PROCESSED_DATASET preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" - * TODO: masks is not compatible with dual hemisphere in this form! + # TODO: masks is not compatible with dual hemisphere in this form! preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC -# TODO: didn't seemingly detect missing months? 
data/osi_sic/month/hemi.north/siconca.missing_days.csv -# TODO: undoubtedly need to include the known invalid dates - added these to the osisaf downloader + # TODO: didn't seemingly detect missing months? data/osi_sic/month/hemi.north/siconca.missing_days.csv + # TODO: undoubtedly need to include the known invalid dates - added these to the osisaf downloader preprocess_missing_spatial -m processed.masks.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC -# TODO: Interpolation failing in all cases? -# TODO: this undoubtedly explains the stray nans present in dataset generation + # TODO: Interpolation failing in all cases? + # TODO: this undoubtedly explains the stray nans present in dataset generation preprocess_dataset $PROC_ARGS_SIC -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ $GROUND_TRUTH_SIC_DSC ${PROCESSED_DATASET}_osisaf -# TODO: plenty of nans contained in here, we need better assesments + # TODO: plenty of nans contained in here due to failing spatial interpolation - needs investigation -# TODO: icenet_osisaf_ref -v data/osisaf/hemi.north/siconca/2012.nc ref.osisaf.north.nc +# TODO: icenet_osisaf_ref for geospatial metadata -v data/osisaf/hemi.north/siconca/???.nc ref.osisaf.north.nc # this needs to: # - ds = xr.open_dataset("./data/osisaf/month/hemi.north/siconca/1978.nc") # - ds = ds.drop_vars(["raw_ice_conc_values", "smearing_standard_error", "algorithm_standard_error"]) @@ -72,19 +77,18 @@ preprocess_dataset $PROC_ARGS_SIC -v \ # - cube.coord('projection_y_coordinate').convert_units('meters') # - iris.save("ref.osisaf.nc") - preprocess_regrid -v $ERA5_DATASET ref.osisaf.nc $ATMOS_PROC -# TODO: get the batcher back in place for multiprocessing this -# TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in + # TODO: get the batcher back in place for 
multiprocessing this + # TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.nc - * TODO: get the batcher back in place for multiprocessing this + # TODO: get the batcher back in place for multiprocessing this preprocess_dataset $PROC_ARGS_ERA5 -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ $ATMOS_PROC_DSC ${PROCESSED_DATASET}_era5 - * TODO: naive copy of "./data/era5_osi/month/hemi.north/uas/2024.nc" results in mistaken loading - not regridded - * TODO: dask multiprocessing cluster with task batcher across multiple variables would be sensible + # TODO: naive copy of "./data/era5_osi/month/hemi.north/uas/2024.nc" earlier is not regridded? + # TODO: dask multiprocessing cluster with task batcher across multiple variables would be sensible preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_osisaf.json processed.${PROCESSED_DATASET}_era5.json @@ -92,11 +96,13 @@ preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icene preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.osisaf:Masks" -icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME - * TODO: FIXME in here to override the creation of nan containing sets - +# TODO: select a random date from the training set, plot and log so user can double check outputs icenet_plot_input -p -v dataset_config.test_net_ds.json 2021-04-30 ./plot/input.png icenet_plot_input --outputs -v dataset_config.test_net_ds.json 2021-04-30 ./plot/outputs.png icenet_plot_input --weights -v dataset_config.test_net_ds.json 2021-04-30 ./plot/weights.png 
-icenet_train_tensorflow -b 1 -e 5 -f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 \ No newline at end of file +icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME + # TODO: FIXME in here to override the creation of nan containing sets due to earlier issues + + +# icenet_train_tensorflow -b 1 -e 5 -f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 \ No newline at end of file From 7679464ca480b88996ddd3fedd39a5cc0b53854e Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 21 Aug 2024 12:55:38 +0100 Subject: [PATCH 19/44] Dev #53: finalised scripting of prep_training_data --- prep_training_data.sh | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/prep_training_data.sh b/prep_training_data.sh index dd63262..2cafab2 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -1,7 +1,6 @@ -#!/bin/bash +#!/bin/bash -l source ENVS - conda activate $ICENET_CONDA set -o pipefail @@ -9,18 +8,19 @@ set -eu if [ $# -lt 1 ] || [ "$1" == "-h" ]; then echo "Usage $0 [download=0|1]" + exit 1 fi HEMI="$1" -DOWNLOAD=$2 +DOWNLOAD=${2:-0} # download-toolbox integration # This updates our source if [ $DOWNLOAD -eq 1 ]; then - download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS + # download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS - download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS + # download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS fi 2>&1 | tee logs/download.log DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" @@ -31,8 +31,8 @@ OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" # Create links 
to the central data store datasets for easier "mapping" -[ [ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf -[ [ ! -e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 +[ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf +[ ! -e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 # TODO: AMSR # TODO: CMIP @@ -96,13 +96,15 @@ preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icene preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.osisaf:Masks" +icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME + # TODO: select a random date from the training set, plot and log so user can double check outputs -icenet_plot_input -p -v dataset_config.test_net_ds.json 2021-04-30 ./plot/input.png -icenet_plot_input --outputs -v dataset_config.test_net_ds.json 2021-04-30 ./plot/outputs.png -icenet_plot_input --weights -v dataset_config.test_net_ds.json 2021-04-30 ./plot/weights.png +icenet_plot_input -p -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/input.png +icenet_plot_input --outputs -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/outputs.png +icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/weights.png icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME # TODO: FIXME in here to override the creation of nan containing sets due to earlier issues -# icenet_train_tensorflow -b 1 -e 5 -f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 \ No newline at end of file +# icenet_train_tensorflow -b 1 -e 5 -f 1 -n 
0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 From e6673fed108b626d045303b04b7f92befcff3312 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 21 Aug 2024 16:55:56 +0100 Subject: [PATCH 20/44] Dev #53: implementation for new structure of training runs --- ensemble/template/icenet_train.sh.j2 | 24 ++++++++++++++++++--- ensemble/train.tmpl.yaml | 32 ++++++++++++++-------------- prep_training_data.sh | 1 + run_train_ensemble.sh | 31 +++++++++++++-------------- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/ensemble/template/icenet_train.sh.j2 b/ensemble/template/icenet_train.sh.j2 index 478ce92..73962d0 100755 --- a/ensemble/template/icenet_train.sh.j2 +++ b/ensemble/template/icenet_train.sh.j2 @@ -1,4 +1,5 @@ -#!/bin/bash +#!/bin/bash -l +{% if run.cluster != "test" %} #SBATCH --output={{ run.dir }}/train.%j.%N.{{ run.seed }}.out #SBATCH --error={{ run.dir }}/train.%j.%N.{{ run.seed }}.err #SBATCH --chdir={{ run.dir }} @@ -15,6 +16,7 @@ #SBATCH --cpus-per-task={{ run.ntasks }} #SBATCH --mem={{ run.mem }} {% if run.nodelist %}#SBATCH --nodelist={{ run.nodelist }}{% endif %} +{% endif %} cd {{ run.dir }} @@ -36,8 +38,24 @@ echo "START `date +%F\ %T`" source $PREP_SCRIPT conda activate $ICENET_CONDA -# TODO: run.arg_filter_factor comes from ENVS now -COMMAND="icenet_train -v {{ run.arg_dataset }} {{ run.name }} {{ run.seed }} $TRAIN_STATIC_ARGS -b {{ run.arg_batch }} -e {{ run.arg_epochs }} -m -qs {{ run.arg_queue }} -w {{ run.ntasks }} -s {{ run.arg_strategy }} {% if run.arg_preload %} -p results/networks/{{ run.name }}/{{ run.name }}.network_{{ run.arg_preload }}.{{ run.seed }}.h5 {% endif %}{% if run.arg_filter_factor %} -n {{ run.arg_filter_factor }}{% endif %}" +PRELOAD="" +FINAL_WEIGHTS="results/networks/{{ run.name }}/{{ run.name }}.network_{{ run.preload }}.{{ run.seed }}.h5" +CHECKPOINT_WEIGHTS="`ls results/networks/{{ run.name }}/checkpoint.{{ run.name }}.network_{{ run.preload }}.{{ run.seed }}.*.keras 2>/dev/null`" + +# 
TODO: do we have keras / h5 weight multi-handling in place in library? +if [ -f $FINAL_WEIGHTS ]; then + echo "Preloading from previously trained network $FINAL_WEIGHTS" + PRELOAD="-p $FINAL_WEIGHTS" +elif [ ! -z "$CHECKPOINT_WEIGHTS" ]; then + CHECKPOINT_FILE=`echo "$CHECKPOINT_WEIGHTS" | sort | head -n 1` + echo "Preloading from checkpoint file $CHECKPOINT_FILE" + PRELOAD="-p $CHECKPOINT_FILE" +fi + +COMMAND="icenet_train_tensorflow -v \ + $TRAIN_STATIC_ARGS \ + -b {{ run.batch }} -e {{ run.epochs }} -n $FILTER_FACTOR -s {{ run.strategy }} \ + $PRELOAD {{ run.dataset }} {{ run.name }} {{ run.seed }} " echo "Running $COMMAND" eval $COMMAND diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 0a7e1c0..423cfb2 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -1,24 +1,27 @@ --- ensemble: vars: - arg_batch: 4 - arg_dataset: DATASET - arg_epochs: 100 - arg_filter_factor: 1 - arg_queue: 2 - arg_strategy: default + batch: 4 + cluster: dummy + dataset: DATASET + email: someone@example.com + epochs: 100 + filter_factor: 1 + gpus: 1 + length: 1-00:00:00 + mem: 128gb + nodes: 1 + ntasks: 2 + preload: DATASET + strategy: default symlinks: - ../../../data - ../../../ENVS* - - ../../../loader.LOADER.json - - ../../../dataset_config.DATASET.json + - ../../../LOADER + - ../../../DATASET - ../../../network_datasets - ../../../processed - ../../../results - gpus: 1 - mem: 128gb - cluster: pvc - nodes: 1 pre_process: - name: execute @@ -31,11 +34,8 @@ ensemble: templatedir: ../template templates: - icenet_train.sh.j2 - email: someone@example.com job_file: icenet_train.sh - ntasks: NTASKS - length: 4-00:00:00 - maxruns: 5 + maxruns: MAXJOBS maxjobs: MAXJOBS batches: diff --git a/prep_training_data.sh b/prep_training_data.sh index 2cafab2..a794de9 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -108,3 +108,4 @@ icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LO # icenet_train_tensorflow -b 1 -e 5 
-f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 +# run_train_ensemble \ No newline at end of file diff --git a/run_train_ensemble.sh b/run_train_ensemble.sh index 2c86b12..2ceba8e 100755 --- a/run_train_ensemble.sh +++ b/run_train_ensemble.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash -if [[ $# -lt 3 ]]; then - echo "Usage $0 LOADER DATASET NAME" +if [[ $# -lt 2 ]]; then + echo "Usage $0 DATASET NAME" exit 1 fi @@ -14,26 +14,26 @@ ENSEMBLE_TARGET="slurm" ENSEMBLE_SWITCH="" ENSEMBLE_ARGS="" ENSEMBLE_JOBS=1 -ENSEMBLE_NTASKS=4 ENSEMBLE_SEEDS_DEFAULT=42,46,45,17,24,84,83,16,5,3 -while getopts ":b:c:de:f:g:j:l:m:n:p:q:r:s:t:" opt; do +while getopts ":b:c:de:f:g:j:l:m:n:o:p:q:r:s:t:x:" opt; do case "$opt" in - b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_batch=$OPTARG ";; + b) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}batch=$OPTARG ";; c) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}cluster=$OPTARG ";; d) ENSEMBLE_TARGET="dummy";; - e) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_epochs=$OPTARG ";; - f) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_filter_factor=$OPTARG ";; + e) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}epochs=$OPTARG ";; + f) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}filter_factor=$OPTARG ";; g) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}gpus=$OPTARG ";; j) ENSEMBLE_JOBS=$OPTARG ;; - l) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_preload=$OPTARG ";; + l) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}preload=$OPTARG ";; m) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}mem=$OPTARG ";; n) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}nodelist=$OPTARG ";; - p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_prep=$OPTARG ";; - q) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_queue=$OPTARG ";; + o) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}nodes=$OPTARG ";; + p) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}prep=$OPTARG ";; r) ENSEMBLE_RUNS=$OPTARG ;; # Ensemble member run seed values - s) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}arg_strategy=$OPTARG ";; - t) ENSEMBLE_NTASKS=$OPTARG ;; + s) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}strategy=$OPTARG ";; + t) ENSEMBLE_ARGS="${ENSEMBLE_ARGS}ntasks=$OPTARG ";; + x) 
ENSEMBLE_ARGS="${ENSEMBLE_ARGS}email=$OPTARG ";; esac done @@ -44,10 +44,10 @@ shift $((OPTIND-1)) echo "ARGS = $ENSEMBLE_SWITCH $ENSEMBLE_ARGS, Leftovers: $@" -LOADER="$1" -DATASET="$2" -NAME="$3" +DATASET="$1" +NAME="$2" +LOADER=`basename $( cat dataset_config.${DATASET}.json | jq '.loader_config' | tr -d '"' )` TRAIN_CONFIG=`mktemp -p . --suffix ".train"` ## @@ -86,7 +86,6 @@ sed -r \ -e "s/LOADER/${LOADER}/g" \ -e "s/DATASET/${DATASET}/g" \ -e "s/MAXJOBS/${ENSEMBLE_JOBS}/g" \ - -e "s/NTASKS/${ENSEMBLE_NTASKS}/g" \ -e "/\bSEEDS$/s/.*/${ENSEMBLE_SEEDS}/g" \ ensemble/train.tmpl.yaml >$TRAIN_CONFIG From 13204a236cbca6e143ec683639c49aa4f4a3c549 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 21 Aug 2024 21:37:36 +0100 Subject: [PATCH 21/44] Updating refs for creation of links --- ensemble/train.tmpl.yaml | 4 ++-- prep_training_data.sh | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/ensemble/train.tmpl.yaml b/ensemble/train.tmpl.yaml index 423cfb2..3ef8d00 100644 --- a/ensemble/train.tmpl.yaml +++ b/ensemble/train.tmpl.yaml @@ -8,7 +8,7 @@ ensemble: epochs: 100 filter_factor: 1 gpus: 1 - length: 1-00:00:00 + length: "1-00:00:00" mem: 128gb nodes: 1 ntasks: 2 @@ -18,7 +18,7 @@ ensemble: - ../../../data - ../../../ENVS* - ../../../LOADER - - ../../../DATASET + - ../../../dataset_config.DATASET.json - ../../../network_datasets - ../../../processed - ../../../results diff --git a/prep_training_data.sh b/prep_training_data.sh index a794de9..68431af 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -105,7 +105,3 @@ icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./ icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME # TODO: FIXME in here to override the creation of nan containing sets due to earlier issues - - -# icenet_train_tensorflow -b 1 -e 5 -f 1 -n 0.2 -nw -v dataset_config.${DATASET_NAME}.json test_network 42 -# run_train_ensemble \ 
No newline at end of file From f2bd7d74d7ba7bfbfbd6d497ca7a061e0fb21f65 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 22 Aug 2024 15:08:09 +0100 Subject: [PATCH 22/44] Training data working for both hemispheres --- prep_training_data.sh | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/prep_training_data.sh b/prep_training_data.sh index 68431af..dcf64f6 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -43,22 +43,21 @@ ATMOS_PROC="era5_osi" GROUND_TRUTH_SIC_DSC="data/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" -PROCESSED_DATASET="training" +PROCESSED_DATASET="training.${HEMI}" LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" -DATASET_NAME="tfdata_cache" +DATASET_NAME="tfdata_${HEMI}" ## Workflow preprocess_loader_init -v $PROCESSED_DATASET preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" - # TODO: masks is not compatible with dual hemisphere in this form! preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC # TODO: didn't seemingly detect missing months? data/osi_sic/month/hemi.north/siconca.missing_days.csv # TODO: undoubtedly need to include the known invalid dates - added these to the osisaf downloader -preprocess_missing_spatial -m processed.masks.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC +preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC # TODO: Interpolation failing in all cases? 
# TODO: this undoubtedly explains the stray nans present in dataset generation @@ -68,16 +67,13 @@ preprocess_dataset $PROC_ARGS_SIC -v \ $GROUND_TRUTH_SIC_DSC ${PROCESSED_DATASET}_osisaf # TODO: plenty of nans contained in here due to failing spatial interpolation - needs investigation -# TODO: icenet_osisaf_ref for geospatial metadata -v data/osisaf/hemi.north/siconca/???.nc ref.osisaf.north.nc -# this needs to: -# - ds = xr.open_dataset("./data/osisaf/month/hemi.north/siconca/1978.nc") -# - ds = ds.drop_vars(["raw_ice_conc_values", "smearing_standard_error", "algorithm_standard_error"]) -# - cube = ds.siconca.to_iris() -# - cube.coord('projection_x_coordinate').convert_units('meters') -# - cube.coord('projection_y_coordinate').convert_units('meters') -# - iris.save("ref.osisaf.nc") +HEMI_SHORT="nh" +[ $HEMI == "south" ] && HEMI_SHORT="sh" +# TODO: we should be able to preseve data during download-toolbox processing for this, but +# alas this needs some investigation to achieve, so this will work for the moment +icenet_generate_ref_osisaf -v data/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc -preprocess_regrid -v $ERA5_DATASET ref.osisaf.nc $ATMOS_PROC +preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC # TODO: get the batcher back in place for multiprocessing this # TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.nc @@ -98,10 +94,10 @@ preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map " icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME -# TODO: select a random date from the training set, plot and log so user can double check outputs -icenet_plot_input -p -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/input.png -icenet_plot_input --outputs -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/outputs.png 
-icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json 1985-04-30 ./plot/weights.png +FIRST_DATE=`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.train[0]' | tr -d '"'` +icenet_plot_input -p -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --outputs -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/outputs.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/weights.${HEMI}.${FIRST_DATE}.png icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME # TODO: FIXME in here to override the creation of nan containing sets due to earlier issues From 3016b5d5b647482dd7fd25ad775b3b085205d9ca Mon Sep 17 00:00:00 2001 From: James Byrne Date: Tue, 27 Aug 2024 23:33:50 +0100 Subject: [PATCH 23/44] Dev #53: implementing prediction and more comprehensive lifecycle, BUT with significant issues around download-toobox --- ensemble/predict.tmpl.yaml | 9 ++ ensemble/template/icenet_predict.sh.j2 | 2 +- prep_prediction_data.sh | 111 ++++++++++++++++++++++++- prep_training_data.sh | 20 ++--- run_prediction.sh | 50 +++++++++++ script.backup/loader_test_dates.sh | 14 ---- script.backup/run_prediction.sh | 82 ------------------ 7 files changed, 174 insertions(+), 114 deletions(-) create mode 100755 run_prediction.sh delete mode 100755 script.backup/loader_test_dates.sh delete mode 100755 script.backup/run_prediction.sh diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index 55ad34d..9b84fef 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -38,6 +38,15 @@ ensemble: - name: execute args: cmd: /usr/bin/ln -s ../../data + - name: execute + args: + cmd: /usr/bin/ln -s ../../processed + - name: execute + args: + cmd: /usr/bin/ln -s ../../ref.osisaf.north.nc + - name: execute + args: + cmd: /usr/bin/ln -s ../../ref.osisaf.south.nc 
pre_run: [] runs: - seed: SEEDS diff --git a/ensemble/template/icenet_predict.sh.j2 b/ensemble/template/icenet_predict.sh.j2 index ba69559..a190cd5 100755 --- a/ensemble/template/icenet_predict.sh.j2 +++ b/ensemble/template/icenet_predict.sh.j2 @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -l #SBATCH --output={{ run.dir }}/predict.%j.%N.{{ run.seed }}.out #SBATCH --error={{ run.dir }}/predict.%j.%N.{{ run.seed }}.err #SBATCH --chdir={{ run.dir }} diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh index bf0ca5f..884a6e3 100755 --- a/prep_prediction_data.sh +++ b/prep_prediction_data.sh @@ -1,11 +1,114 @@ -#!/bin/bash +#!/usr/bin/bash -l -source ENVS +set -e -o pipefail + +. ENVS conda activate $ICENET_CONDA -set -o pipefail -set -eu +if [ $# -lt 2 ] || [ "$1" == "-h" ]; then + echo "Usage $0 [date_vars] [train_data_name]" + echo " name of prediction dataset" + echo " hemisphere to use" + echo "[date_vars] variables for defining start and end dates to forecast" + echo "[train_data_name] name of data used to train the model" + echo "Options: none" + exit 1 +fi + +# obtaining any arguments that should be passed onto run_forecast_plots.sh +OPTIND=1 +while getopts "" opt; do + case "$opt" in + esac +done + +shift $((OPTIND-1)) + +echo "Leftovers from getopt: $@" + +PREDICTION_NAME="$1" +HEMI="$2" +DATE_VARS="${3:-$PREDICTION_NAME}" +DATA_PROC="${4:-${TRAIN_DATA_NAME}}.${HEMI}" + +NAME_START="${DATE_VARS^^}_START" +NAME_END="${DATE_VARS^^}_END" +echo "Dates from ENVS: $NAME_START and $NAME_END" +PREDICTION_START=${!NAME_START} +PREDICTION_END=${!NAME_END} + +if [ -z $PREDICTION_START ] || [ -z $PREDICTION_END ]; then + echo "Prediction date args not set correctly: \"$PREDICTION_START\" to \"$PREDICTION_END\"" + exit 1 +else + echo "Prediction start arg: $PREDICTION_START" + echo "Prediction end arg: $PREDICTION_END" +fi + +PREDICTION_DATASET="prediction.${PREDICTION_NAME}.${HEMI}" +LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" + +PRED_DATA_START=`date 
--date "$PREDICTION_START - $LAG ${DATA_FREQUENCY}s" +%Y-%m-%d` +# download-toolbox integration +( + # download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS + download_osisaf $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $OSISAF_VAR_ARGS + download_era5 $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $ERA5_VAR_ARGS + # download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS + + # TODO: this overwrites the ./data/osisaf/dataset_config.month.hemi.north.json files, which is unacceptable - localise + # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly +) 2>&1 | tee logs/download.${PREDICTION_DATASET}.log + +DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" + +# preprocess-toolbox integration +# Persistent datasets from the source data store, wherever that is +OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" +ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" +ATMOS_PROC="era5_osi.$PREDICTION_DATASET" +ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" + +# Create links to the central data store datasets for easier "mapping" +[ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf +[ ! 
-e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 +# TODO: AMSR +# TODO: CMIP + +LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" + +preprocess_loader_init -v $PREDICTION_DATASET +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" +preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" + +preprocess_dataset $PROC_ARGS_SIC -v \ + -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ + -r processed/${DATA_PROC}_osisaf/ \ + -i "icenet.data.processors.osisaf:SICPreProcessor" \ + $OSISAF_DATASET ${PREDICTION_DATASET}_osisaf + +if [ ! -f ref.osisaf.${HEMI}.nc ]; then + echo "Reference OSISAF for regrid should still be available, bailing for the mo" + exit 1 +fi + +preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC +preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc + +preprocess_dataset $PROC_ARGS_ERA5 -v \ + -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ + -r processed/${DATA_PROC}_era5/ \ + -i "icenet.data.processors.cds:ERA5PreProcessor" \ + $ATMOS_PROC_DSC ${PREDICTION_DATASET}_era5 +preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PREDICTION_DATASET}_osisaf.json processed.${PREDICTION_DATASET}_era5.json +preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET sin "icenet.data.meta:SinProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET cos "icenet.data.meta:CosProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET land_map "icenet.data.masks.osisaf:Masks" +icenet_dataset_create -v -c -p -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $PREDICTION_DATASET +FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.prediction[0]' | tr -d '"'`} 
+icenet_plot_input -p -v dataset_config.${PREDICTION_DATASET}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png diff --git a/prep_training_data.sh b/prep_training_data.sh index dcf64f6..6106ed7 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -21,7 +21,7 @@ if [ $DOWNLOAD -eq 1 ]; then download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS # download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS -fi 2>&1 | tee logs/download.log +fi 2>&1 | tee logs/download.training.log DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" @@ -36,14 +36,14 @@ ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" # TODO: AMSR # TODO: CMIP -GROUND_TRUTH_SIC="osi_sic" -ATMOS_PROC="era5_osi" +GROUND_TRUTH_SIC="osi_sic.$TRAIN_DATA_NAME" +ATMOS_PROC="era5_osi.$TRAIN_DATA_NAME" # Our processed dataset configurations, we localise data when regridding and reprojecting GROUND_TRUTH_SIC_DSC="data/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" -PROCESSED_DATASET="training.${HEMI}" +PROCESSED_DATASET="${TRAIN_DATA_NAME}.${HEMI}" LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" DATASET_NAME="tfdata_${HEMI}" @@ -55,28 +55,23 @@ preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.d preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC - # TODO: didn't seemingly detect missing months? data/osi_sic/month/hemi.north/siconca.missing_days.csv - # TODO: undoubtedly need to include the known invalid dates - added these to the osisaf downloader preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC - # TODO: Interpolation failing in all cases? 
- # TODO: this undoubtedly explains the stray nans present in dataset generation preprocess_dataset $PROC_ARGS_SIC -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ $GROUND_TRUTH_SIC_DSC ${PROCESSED_DATASET}_osisaf - # TODO: plenty of nans contained in here due to failing spatial interpolation - needs investigation HEMI_SHORT="nh" [ $HEMI == "south" ] && HEMI_SHORT="sh" -# TODO: we should be able to preseve data during download-toolbox processing for this, but +# TODO: we should be able to preserve data during download-toolbox processing for this, but # alas this needs some investigation to achieve, so this will work for the moment icenet_generate_ref_osisaf -v data/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC # TODO: get the batcher back in place for multiprocessing this # TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in -preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.nc +preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc # TODO: get the batcher back in place for multiprocessing this preprocess_dataset $PROC_ARGS_ERA5 -v \ @@ -94,10 +89,9 @@ preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map " icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME -FIRST_DATE=`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.train[0]' | tr -d '"'` +FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.train[0]' | tr -d '"'`} icenet_plot_input -p -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png icenet_plot_input --outputs -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/outputs.${HEMI}.${FIRST_DATE}.png 
icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/weights.${HEMI}.${FIRST_DATE}.png icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME - # TODO: FIXME in here to override the creation of nan containing sets due to earlier issues diff --git a/run_prediction.sh b/run_prediction.sh new file mode 100755 index 0000000..321a1ba --- /dev/null +++ b/run_prediction.sh @@ -0,0 +1,50 @@ +#!/usr/bin/bash -l + +set -e -o pipefail + +. ENVS + +conda activate $ICENET_CONDA + +if [ $# -lt 3 ] || [ "$1" == "-h" ]; then + echo "Usage $0 [date_vars] [train_data_name]" + echo " name of prediction]" + echo " model name" + echo " hemisphere to use" + echo "Options: none" + exit 1 +fi + +# obtaining any arguments that should be passed onto run_forecast_plots.sh +OPTIND=1 +while getopts "" opt; do + case "$opt" in + esac +done + +shift $((OPTIND-1)) + +echo "Leftovers from getopt: $@" + +PREDICTION_NAME="prediction.$1" +MODEL="$2" +HEMI="$3" + +# This assumes you're not retraining using the same model name, eek +if [ -d results/networks/${MODEL}_${HEMI} ]; then + SAVEFILE=`ls results/networks/${MODEL}_${HEMI}/${MODEL}_${HEMI}.*.h5 | head -n 1` + DATASET=`echo $SAVEFILE | perl -lpe's/.+\.network_(.+)\.[0-9]+\.h5/$1/'` + echo "First model file: $SAVEFILE" + echo "Dataset model was trained on: $DATASET" +else + echo "Model $MODEL doesn't exist" + exit 1 +fi + +LOADER_NAME="loader.${PREDICTION_NAME}.${HEMI}.json" +jq -c '.sources[].splits["prediction"][]' $LOADER_NAME | sort | uniq | sed -r \ + -e 's/"//g' \ + -e 's/([0-9]{4})_([0-9]{2})_([0-9]{2})/\1-\2-\3/' >${PREDICTION_NAME}.${HEMI}.csv + +./run_predict_ensemble.sh -d -i $DATASET -f $FILTER_FACTOR -p $PREP_SCRIPT \ + ${MODEL}_${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI}.csv diff --git a/script.backup/loader_test_dates.sh b/script.backup/loader_test_dates.sh deleted file mode 100755 index 
3b78579..0000000 --- a/script.backup/loader_test_dates.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -if [ $# -ne 1 ]; then - echo "Usage $0 " - exit 1 -fi - -LOADER_NAME="loader.${1}.json" - -jq -c '.sources[].dates["test"][]' $LOADER_NAME | sort | uniq | sed -r \ - -e 's/"//g' \ - -e 's/([0-9]{4})_([0-9]{2})_([0-9]{2})/\1-\2-\3/' - -exit 0 diff --git a/script.backup/run_prediction.sh b/script.backup/run_prediction.sh deleted file mode 100755 index 5d31bca..0000000 --- a/script.backup/run_prediction.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/bash - -set -e -o pipefail - -. ENVS - -conda activate $ICENET_CONDA - -if [ $# -lt 3 ] || [ "$1" == "-h" ]; then - echo "Usage $0 [date_vars] [train_data_name]" - echo " name of forecast" - echo " model name" - echo " hemisphere to use" - echo "[date_vars] variables for defining start and end dates to forecast" - echo "[train_data_name] name of data used to train the model" - echo "Options: none" - exit 1 -fi - -# obtaining any arguments that should be passed onto run_forecast_plots.sh -OPTIND=1 -while getopts "" opt; do - case "$opt" in - esac -done - -shift $((OPTIND-1)) - -echo "Leftovers from getopt: $@" - -FORECAST="$1" -MODEL="$2" -HEMI="$3" -DATE_VARS="${4:-$FORECAST}" -DATA_PROC="${5:-${TRAIN_DATA_NAME}}_${HEMI}" - -# This assumes you're not retraining using the same model name, eek -if [ -d results/networks/$MODEL ]; then - SAVEFILE=`ls results/networks/${MODEL}/${MODEL}.*.h5 | head -n 1` - DATASET=`echo $SAVEFILE | perl -lpe's/.+\.network_(.+)\.[0-9]+\.h5/$1/'` - echo "First model file: $SAVEFILE" - echo "Dataset model was trained on: $DATASET" -else - echo "Model $MODEL doesn't exist" - exit 1 -fi - -NAME_START="${DATE_VARS^^}_START" -NAME_END="${DATE_VARS^^}_END" -echo "Dates from ENVS: $NAME_START and $NAME_END" -PREDICTION_START=${!NAME_START} -PREDICTION_END=${!NAME_END} - -if [ -z $PREDICTION_START ] || [ -z $PREDICTION_END ]; then - echo "Prediction date args not set correctly: 
\"$PREDICTION_START\" to \"$PREDICTION_END\"" - exit 1 -else - echo "Prediction start arg: $PREDICTION_START" - echo "Prediction end arg: $PREDICTION_END" -fi - -[ ! -z "$PROC_ARGS_ERA5" ] && \ - icenet_process_era5 -r processed/$DATA_PROC/era5/$HEMI \ - $PROC_ARGS_ERA5 \ - -v -l $LAG -ts $PREDICTION_START -te $PREDICTION_END ${FORECAST}_${HEMI} $HEMI - -[ ! -z "$PROC_ARGS_ORAS5" ] && \ - icenet_process_oras5 -r processed/$DATA_PROC/oras5/$HEMI \ - $PROC_ARGS_ORAS5 \ - -v -l $LAG -ts $PREDICTION_START -te $PREDICTION_END ${FORECAST}_${HEMI} $HEMI - -[ ! -z "$PROC_ARGS_SIC" ] && \ - icenet_process_sic -r processed/$DATA_PROC/osisaf/$HEMI \ - $PROC_ARGS_SIC \ - -v -l $LAG -ts $PREDICTION_START -te $PREDICTION_END ${FORECAST}_${HEMI} $HEMI - -icenet_process_metadata ${FORECAST}_${HEMI} $HEMI -icenet_dataset_create -l $LAG -c ${FORECAST}_${HEMI} $HEMI -./loader_test_dates.sh ${FORECAST}_${HEMI} >${FORECAST}_${HEMI}.csv - -./run_predict_ensemble.sh -i $DATASET -f $FILTER_FACTOR -p $PREP_SCRIPT \ - $MODEL ${FORECAST}_${HEMI} ${FORECAST}_${HEMI} ${FORECAST}_${HEMI}.csv From ddc593c2c65e88f39324833b9b3ddac2e40270f0 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 29 Aug 2024 11:46:21 +0100 Subject: [PATCH 24/44] Remapping lag and lead to the forecasting processing --- prep_training_data.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prep_training_data.sh b/prep_training_data.sh index 6106ed7..07a1d89 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -60,6 +60,7 @@ preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,active_grid_ preprocess_dataset $PROC_ARGS_SIC -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ $GROUND_TRUTH_SIC_DSC ${PROCESSED_DATASET}_osisaf HEMI_SHORT="nh" @@ -77,6 +78,7 @@ preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc 
preprocess_dataset $PROC_ARGS_ERA5 -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ $ATMOS_PROC_DSC ${PROCESSED_DATASET}_era5 # TODO: naive copy of "./data/era5_osi/month/hemi.north/uas/2024.nc" earlier is not regridded? # TODO: dask multiprocessing cluster with task batcher across multiple variables would be sensible From c9bfb382ca592445565bf7a3db2230b058c767d6 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 29 Aug 2024 12:25:25 +0100 Subject: [PATCH 25/44] Correcting for localised processed data store --- prep_prediction_data.sh | 3 ++- prep_training_data.sh | 21 ++++++++------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh index 884a6e3..af30826 100755 --- a/prep_prediction_data.sh +++ b/prep_prediction_data.sh @@ -58,7 +58,6 @@ PRED_DATA_START=`date --date "$PREDICTION_START - $LAG ${DATA_FREQUENCY}s" +%Y-% # download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS # TODO: this overwrites the ./data/osisaf/dataset_config.month.hemi.north.json files, which is unacceptable - localise - # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly ) 2>&1 | tee logs/download.${PREDICTION_DATASET}.log DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" @@ -88,6 +87,7 @@ preprocess_dataset $PROC_ARGS_SIC -v \ -r processed/${DATA_PROC}_osisaf/ \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ $OSISAF_DATASET ${PREDICTION_DATASET}_osisaf + # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly if [ ! 
-f ref.osisaf.${HEMI}.nc ]; then echo "Reference OSISAF for regrid should still be available, bailing for the mo" @@ -102,6 +102,7 @@ preprocess_dataset $PROC_ARGS_ERA5 -v \ -r processed/${DATA_PROC}_era5/ \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ $ATMOS_PROC_DSC ${PREDICTION_DATASET}_era5 + # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PREDICTION_DATASET}_osisaf.json processed.${PREDICTION_DATASET}_era5.json preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET sin "icenet.data.meta:SinProcessor" diff --git a/prep_training_data.sh b/prep_training_data.sh index 07a1d89..ed818f3 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -40,8 +40,8 @@ GROUND_TRUTH_SIC="osi_sic.$TRAIN_DATA_NAME" ATMOS_PROC="era5_osi.$TRAIN_DATA_NAME" # Our processed dataset configurations, we localise data when regridding and reprojecting -GROUND_TRUTH_SIC_DSC="data/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" -ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" +GROUND_TRUTH_SIC_DSC="${PROCESSED_DATA_STORE}/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" +ATMOS_PROC_DSC="${PROCESSED_DATA_STORE}/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" PROCESSED_DATASET="${TRAIN_DATA_NAME}.${HEMI}" LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" @@ -50,11 +50,12 @@ DATASET_NAME="tfdata_${HEMI}" ## Workflow preprocess_loader_init -v $PROCESSED_DATASET -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" - preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC + +preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land 
"icenet.data.masks.osisaf:Masks" +preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC polarhole "icenet.data.masks.osisaf:Masks" +preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC active_grid_cell "icenet.data.masks.osisaf:Masks" + preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC preprocess_dataset $PROC_ARGS_SIC -v \ @@ -65,23 +66,17 @@ preprocess_dataset $PROC_ARGS_SIC -v \ HEMI_SHORT="nh" [ $HEMI == "south" ] && HEMI_SHORT="sh" -# TODO: we should be able to preserve data during download-toolbox processing for this, but -# alas this needs some investigation to achieve, so this will work for the moment + icenet_generate_ref_osisaf -v data/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC - # TODO: get the batcher back in place for multiprocessing this - # TODO: this should regrid ALL files in the dataset, for some reason 2024.nc did not get wrapped in preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc - # TODO: get the batcher back in place for multiprocessing this preprocess_dataset $PROC_ARGS_ERA5 -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ -sh $LAG -st $FORECAST_LENGTH \ $ATMOS_PROC_DSC ${PROCESSED_DATASET}_era5 - # TODO: naive copy of "./data/era5_osi/month/hemi.north/uas/2024.nc" earlier is not regridded? 
- # TODO: dask multiprocessing cluster with task batcher across multiple variables would be sensible preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_osisaf.json processed.${PROCESSED_DATASET}_era5.json From a9a34c7db71146a98f41826c03133298be7648a0 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 29 Aug 2024 12:47:16 +0100 Subject: [PATCH 26/44] Mask data ref was missing --- prep_training_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prep_training_data.sh b/prep_training_data.sh index ed818f3..9de3f77 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -67,7 +67,7 @@ preprocess_dataset $PROC_ARGS_SIC -v \ HEMI_SHORT="nh" [ $HEMI == "south" ] && HEMI_SHORT="sh" -icenet_generate_ref_osisaf -v data/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc +icenet_generate_ref_osisaf -v ${PROCESSED_DATA_STORE}/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc From 2c2ee7ae0fb24036226270b55f26650ea49ba365 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 30 Aug 2024 17:07:23 +0100 Subject: [PATCH 27/44] Updated for much more efficient copying and processing of prediction datasets --- prep_prediction_data.sh | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh index af30826..4dae82a 100755 --- a/prep_prediction_data.sh +++ b/prep_prediction_data.sh @@ -56,8 +56,6 @@ PRED_DATA_START=`date --date "$PREDICTION_START - $LAG ${DATA_FREQUENCY}s" +%Y-% download_osisaf $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $OSISAF_VAR_ARGS download_era5 $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $ERA5_VAR_ARGS # download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS - - # TODO: this overwrites the 
./data/osisaf/dataset_config.month.hemi.north.json files, which is unacceptable - localise ) 2>&1 | tee logs/download.${PREDICTION_DATASET}.log DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" @@ -66,8 +64,9 @@ DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" # Persistent datasets from the source data store, wherever that is OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" + ATMOS_PROC="era5_osi.$PREDICTION_DATASET" -ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" +ATMOS_PROC_DSC="${PROCESSED_DATA_STORE}/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" # Create links to the central data store datasets for easier "mapping" [ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf @@ -76,16 +75,16 @@ ATMOS_PROC_DSC="data/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" # TODO: CMIP LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" +TRAIN_LOADER_CONFIGURATION="loader.${TRAIN_DATA_NAME}.${HEMI}.json" preprocess_loader_init -v $PREDICTION_DATASET -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET land "icenet.data.masks.osisaf:Masks" -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET polarhole "icenet.data.masks.osisaf:Masks" -preprocess_add_mask -v $LOADER_CONFIGURATION $OSISAF_DATASET active_grid_cell "icenet.data.masks.osisaf:Masks" +preprocess_loader_copy $TRAIN_LOADER_CONFIGURATION $PREDICTION_DATASET masks channels preprocess_dataset $PROC_ARGS_SIC -v \ -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ -r processed/${DATA_PROC}_osisaf/ \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ $OSISAF_DATASET ${PREDICTION_DATASET}_osisaf # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly @@ -94,20 +93,22 @@ if [ ! 
-f ref.osisaf.${HEMI}.nc ]; then exit 1 fi -preprocess_regrid -v $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC -preprocess_rotate -n uas,vas -v $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc +preprocess_regrid -v \ + -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ + -sh $LAG -st $FORECAST_LENGTH \ + $ERA5_DATASET ref.osisaf.${HEMI}.nc $ATMOS_PROC +preprocess_rotate -v \ + -n uas,vas $ATMOS_PROC_DSC ref.osisaf.${HEMI}.nc preprocess_dataset $PROC_ARGS_ERA5 -v \ -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ -r processed/${DATA_PROC}_era5/ \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ $ATMOS_PROC_DSC ${PREDICTION_DATASET}_era5 # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PREDICTION_DATASET}_osisaf.json processed.${PREDICTION_DATASET}_era5.json -preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET sin "icenet.data.meta:SinProcessor" -preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET cos "icenet.data.meta:CosProcessor" -preprocess_add_channel -v $LOADER_CONFIGURATION $OSISAF_DATASET land_map "icenet.data.masks.osisaf:Masks" icenet_dataset_create -v -c -p -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $PREDICTION_DATASET From 0dcdf7b1c91f932a48512e0ea2a95e79a0fe74e1 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 30 Aug 2024 17:15:18 +0100 Subject: [PATCH 28/44] Fixes #53: last amendment for patching model name correctly? 
--- run_prediction.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_prediction.sh b/run_prediction.sh index 321a1ba..87ebf6e 100755 --- a/run_prediction.sh +++ b/run_prediction.sh @@ -47,4 +47,4 @@ jq -c '.sources[].splits["prediction"][]' $LOADER_NAME | sort | uniq | sed -r \ -e 's/([0-9]{4})_([0-9]{2})_([0-9]{2})/\1-\2-\3/' >${PREDICTION_NAME}.${HEMI}.csv ./run_predict_ensemble.sh -d -i $DATASET -f $FILTER_FACTOR -p $PREP_SCRIPT \ - ${MODEL}_${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI}.csv + ${MODEL}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI}.csv From 461e18e8942950ef725d759172d515d950a7e42b Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 30 Aug 2024 17:24:21 +0100 Subject: [PATCH 29/44] Forgot to complete the full configuration naming in prediction copies --- prep_prediction_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh index 4dae82a..03564e9 100755 --- a/prep_prediction_data.sh +++ b/prep_prediction_data.sh @@ -78,7 +78,7 @@ LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" TRAIN_LOADER_CONFIGURATION="loader.${TRAIN_DATA_NAME}.${HEMI}.json" preprocess_loader_init -v $PREDICTION_DATASET -preprocess_loader_copy $TRAIN_LOADER_CONFIGURATION $PREDICTION_DATASET masks channels +preprocess_loader_copy $TRAIN_LOADER_CONFIGURATION loader.${PREDICTION_DATASET}.json masks channels preprocess_dataset $PROC_ARGS_SIC -v \ -sn "prediction" -ss "$PREDICTION_START" -se "$PREDICTION_END" \ From 15fa9491781ab0deb98b0572448b218c332a50cb Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 30 Aug 2024 17:41:57 +0100 Subject: [PATCH 30/44] Further fixing of changes to model delimiters --- run_prediction.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run_prediction.sh b/run_prediction.sh index 87ebf6e..8097f9c 100755 --- a/run_prediction.sh +++ 
b/run_prediction.sh @@ -31,8 +31,8 @@ MODEL="$2" HEMI="$3" # This assumes you're not retraining using the same model name, eek -if [ -d results/networks/${MODEL}_${HEMI} ]; then - SAVEFILE=`ls results/networks/${MODEL}_${HEMI}/${MODEL}_${HEMI}.*.h5 | head -n 1` +if [ -d results/networks/${MODEL}.${HEMI} ]; then + SAVEFILE=`ls results/networks/${MODEL}.${HEMI}/${MODEL}.${HEMI}.*.h5 | head -n 1` DATASET=`echo $SAVEFILE | perl -lpe's/.+\.network_(.+)\.[0-9]+\.h5/$1/'` echo "First model file: $SAVEFILE" echo "Dataset model was trained on: $DATASET" From a183f4165232ffe44ad2b024655f03da576367ee Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 30 Aug 2024 17:44:31 +0100 Subject: [PATCH 31/44] Clearing up some cruft and giving the option to supply extra args --- run_prediction.sh | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/run_prediction.sh b/run_prediction.sh index 8097f9c..9d3e7e9 100755 --- a/run_prediction.sh +++ b/run_prediction.sh @@ -15,20 +15,10 @@ if [ $# -lt 3 ] || [ "$1" == "-h" ]; then exit 1 fi -# obtaining any arguments that should be passed onto run_forecast_plots.sh -OPTIND=1 -while getopts "" opt; do - case "$opt" in - esac -done - -shift $((OPTIND-1)) - -echo "Leftovers from getopt: $@" - PREDICTION_NAME="prediction.$1" MODEL="$2" HEMI="$3" +EXTRA_ARGS="${4:-""}" # This assumes you're not retraining using the same model name, eek if [ -d results/networks/${MODEL}.${HEMI} ]; then @@ -46,5 +36,5 @@ jq -c '.sources[].splits["prediction"][]' $LOADER_NAME | sort | uniq | sed -r \ -e 's/"//g' \ -e 's/([0-9]{4})_([0-9]{2})_([0-9]{2})/\1-\2-\3/' >${PREDICTION_NAME}.${HEMI}.csv -./run_predict_ensemble.sh -d -i $DATASET -f $FILTER_FACTOR -p $PREP_SCRIPT \ +./run_predict_ensemble.sh $EXTRA_ARGS -i $DATASET -f $FILTER_FACTOR -p $PREP_SCRIPT \ ${MODEL}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI} ${PREDICTION_NAME}.${HEMI}.csv From 7a6d2b1a67d8d54cb1569b17fcc9d66f8c1ce7df Mon Sep 17 00:00:00 2001 From: James 
Byrne Date: Mon, 2 Sep 2024 10:17:35 +0100 Subject: [PATCH 32/44] Updating for variable temporal lengths and resolutions --- ensemble/predict.tmpl.yaml | 3 +++ process_op_assets.sh | 11 ++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/ensemble/predict.tmpl.yaml b/ensemble/predict.tmpl.yaml index 9b84fef..cfe61cf 100644 --- a/ensemble/predict.tmpl.yaml +++ b/ensemble/predict.tmpl.yaml @@ -41,6 +41,9 @@ ensemble: - name: execute args: cmd: /usr/bin/ln -s ../../processed + - name: execute + args: + cmd: /usr/bin/ln -s ../../processed_data - name: execute args: cmd: /usr/bin/ln -s ../../ref.osisaf.north.nc diff --git a/process_op_assets.sh b/process_op_assets.sh index 46b8a44..72f8446 100755 --- a/process_op_assets.sh +++ b/process_op_assets.sh @@ -9,6 +9,7 @@ OUTPUT_DIR="results/forecasts/$FORECAST_NAME" LOG_DIR="log/forecasts/$FORECAST_NAME" FORECAST_FILE="results/predict/${FORECAST_NAME}.nc" +FORECAST_LENGTH=`python -c 'import xarray as xr; print(int(xr.open_dataset("'$FORECAST_FILE'").leadtime.max()))'` HEMI=`echo $FORECAST_NAME | sed -r 's/^.+_(north|south)$/\1/'` if [ $# -lt 1 ] || [ "$1" == "-h" ]; then @@ -62,23 +63,23 @@ for DATE_FORECAST in $( cat ${FORECAST_NAME}.csv ); do python -c 'import xarray; xarray.open_dataset("'$FORECAST_FILE'").sel(time=slice("'$DATE_FORECAST'", "'$DATE_FORECAST'")).to_netcdf("'$DATE_DIR'/'$DATE_FORECAST'.nc")' echo "Producing geotiffs from that file" - icenet_output_geotiff -o $DATE_DIR $FORECAST_FILE $DATE_FORECAST 1..93 + icenet_output_geotiff -o $DATE_DIR $FORECAST_FILE $DATE_FORECAST 1..$FORECAST_LENGTH rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.tiff' echo "Producing movie file of raw video" - icenet_plot_forecast $REGION -o $DATE_DIR -l 1..93 -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." 
'*.mp4' echo "Producing stills for manual composition (with coastlines)" - icenet_plot_forecast $REGION -o $DATE_DIR -l 1..93 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH $HEMI $FORECAST_FILE $DATE_FORECAST ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.png' -c:v libx264 ${DATE_DIR}/${FORECAST_NAME}.mp4 rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.png' echo "Producing movie and stills of ensemble standard deviation in predictions" - icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..93 -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.stddev.mp4' - icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..93 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH $HEMI $FORECAST_FILE $DATE_FORECAST ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.stddev.png' -c:v libx264 ${DATE_DIR}/${FORECAST_NAME}.stddev.mp4 rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." 
'*.stddev.png' From 97f137bc6ecd34899d7414ab6181013e7b5c8cb3 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Mon, 2 Sep 2024 10:19:13 +0100 Subject: [PATCH 33/44] Updating hemi regex --- process_op_assets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/process_op_assets.sh b/process_op_assets.sh index 72f8446..f21acf3 100755 --- a/process_op_assets.sh +++ b/process_op_assets.sh @@ -10,7 +10,7 @@ LOG_DIR="log/forecasts/$FORECAST_NAME" FORECAST_FILE="results/predict/${FORECAST_NAME}.nc" FORECAST_LENGTH=`python -c 'import xarray as xr; print(int(xr.open_dataset("'$FORECAST_FILE'").leadtime.max()))'` -HEMI=`echo $FORECAST_NAME | sed -r 's/^.+_(north|south)$/\1/'` +HEMI=`echo $FORECAST_NAME | sed -r 's/^.+\.(north|south)$/\1/'` if [ $# -lt 1 ] || [ "$1" == "-h" ]; then echo "$0 [region]" From 8344fdd1e75a578d4389f6a293f4dac2c849adb6 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Tue, 3 Sep 2024 17:18:39 +0100 Subject: [PATCH 34/44] Updated for producing op assets with environmental forecasting --- process_op_assets.sh | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/process_op_assets.sh b/process_op_assets.sh index f21acf3..5ea098c 100755 --- a/process_op_assets.sh +++ b/process_op_assets.sh @@ -12,6 +12,9 @@ FORECAST_FILE="results/predict/${FORECAST_NAME}.nc" FORECAST_LENGTH=`python -c 'import xarray as xr; print(int(xr.open_dataset("'$FORECAST_FILE'").leadtime.max()))'` HEMI=`echo $FORECAST_NAME | sed -r 's/^.+\.(north|south)$/\1/'` +GROUND_TRUTH_DS=`jq -r 'first(.sources[]).dataset_config' loader.${FORECAST_NAME}.json` +GROUND_TRUTH_DIR=`dirname $( jq -r '.data._var_files.siconca[0]' $GROUND_TRUTH_DS )` + if [ $# -lt 1 ] || [ "$1" == "-h" ]; then echo "$0 [region]" exit 1 @@ -24,7 +27,6 @@ if ! 
[[ $HEMI =~ ^(north|south)$ ]]; then exit 1 fi - function produce_docs { local DIR=$1 @@ -43,7 +45,6 @@ function rename_gfx { done } - for WORKING_DIR in "$OUTPUT_DIR" "$LOG_DIR"; do if [ -d $WORKING_DIR ]; then echo "Output directory $WORKING_DIR already exists, removing" @@ -67,20 +68,21 @@ for DATE_FORECAST in $( cat ${FORECAST_NAME}.csv ); do rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.tiff' echo "Producing movie file of raw video" - icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.mp4' echo "Producing stills for manual composition (with coastlines)" - icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH $HEMI $FORECAST_FILE $DATE_FORECAST - ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.png' -c:v libx264 ${DATE_DIR}/${FORECAST_NAME}.mp4 + icenet_plot_forecast $REGION -o $DATE_DIR -l 1..$FORECAST_LENGTH $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST + # Removed -c:v libx264 + ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.png' ${DATE_DIR}/${FORECAST_NAME}.mp4 rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.png' echo "Producing movie and stills of ensemble standard deviation in predictions" - icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $HEMI $FORECAST_FILE $DATE_FORECAST + icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH -f mp4 $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." 
'*.stddev.mp4' - icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH $HEMI $FORECAST_FILE $DATE_FORECAST - ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.stddev.png' -c:v libx264 ${DATE_DIR}/${FORECAST_NAME}.stddev.mp4 + icenet_plot_forecast $REGION -s -o $DATE_DIR -l 1..$FORECAST_LENGTH $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST + ffmpeg -framerate 5 -pattern_type glob -i ${DATE_DIR}'/'${FORECAST_NAME}'.*.stddev.png' ${DATE_DIR}/${FORECAST_NAME}.stddev.mp4 rename_gfx $DATE_DIR "${FORECAST_NAME}.${DATE_FORECAST}." '*.stddev.png' produce_docs $DATE_DIR @@ -90,7 +92,7 @@ for DATE_FORECAST in $( cat ${FORECAST_NAME}.csv ); do echo "Producing binary accuracy plots (these are meaningless forecasting into the future w.r.t the OSISAF data)" - SIC_FILENAME="./data/osisaf/${HEMI}/siconca/`date +%Y`.nc" + SIC_FILENAME="${GROUND_TRUTH_DIR}/`date +%Y`.nc" # Get the most recent day, sorry for ignoring all timezone information SIC_LATEST=`python -c 'import xarray; print(str(xarray.open_dataset("'$SIC_FILENAME'").time.values[-1])[0:10])'` @@ -100,20 +102,20 @@ for DATE_FORECAST in $( cat ${FORECAST_NAME}.csv ); do for THRESHOLD in 0.15 0.5 0.8 0.9; do icenet_plot_bin_accuracy $REGION -e -b -t $THRESHOLD \ -o ${DATE_DIR}/bin_accuracy.${THRESHOLD}.png \ - $HEMI $FORECAST_FILE $DATE_FORECAST + $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST done icenet_plot_metrics $REGION -e -b -s \ -o ${DATE_DIR}/ \ - $HEMI $FORECAST_FILE $DATE_FORECAST + $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST icenet_plot_sic_error $REGION \ -o ${DATE_DIR}/${DATE_FORECAST}.sic_error.mp4 \ - $HEMI $FORECAST_FILE $DATE_FORECAST + $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST icenet_plot_sie_error $REGION -e -b \ -o ${DATE_DIR}/${DATE_FORECAST}.sie_error.25.png \ - $HEMI $FORECAST_FILE $DATE_FORECAST + $GROUND_TRUTH_DS $FORECAST_FILE $DATE_FORECAST else echo "We do not have observational SIC data ($SIC_LATEST) for plotting \ forecast date $DATE_FORECAST" From 
0a5960b693f22d30c139a43624542fc831cf7fe2 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 4 Sep 2024 08:18:22 +0100 Subject: [PATCH 35/44] Updating plotting commands --- plot_forecast.sh | 54 ++++++++++++++++++++++----------------------- plot_validations.sh | 22 +++++++++--------- 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/plot_forecast.sh b/plot_forecast.sh index 3b40954..c877587 100755 --- a/plot_forecast.sh +++ b/plot_forecast.sh @@ -2,11 +2,10 @@ source ENVS -if [ $# -lt 2 ] || [ "$1" == "-h" ]; then - echo -e "\nUsage $0 " +if [ $# -lt 1 ] || [ "$1" == "-h" ]; then + echo -e "\nUsage $0 " echo -e "\nArguments" echo " name of forecast" - echo " hemisphere to use" echo -e "\nOptions" echo "-m string of metrics separated by commas, by default \"binacc,sie,mae,rmse,sic\". Options: \"binacc\", \"sie\", \"mae\", \"mse\", \"rmse\", \"sic\"" echo "-r region arguments, by default uses full hemisphere" @@ -17,7 +16,7 @@ if [ $# -lt 2 ] || [ "$1" == "-h" ]; then echo "-g grid area resolution to use - i.e. the length of the sides in km, by default 25 (i.e. 
25km^2)" echo "-o output directory path to store plots, by default \"plot/\"" echo -e "\nList of outputs generated" - echo "* If \"binacc\" is included in the requested metrics, will generate all binary accuracy plots for dates in _.csv" + echo "* If \"binacc\" is included in the requested metrics, will generate all binary accuracy plots for dates in .csv" echo "- these will be saved in the format \"/binacc.t_..png\"" echo "If \"-l\" is passed, leadtime averaged plots for binary accuracy will be generated too:" echo " - averaging over all: \"/binacc.t__leadtime_avg_all.png\"" @@ -26,7 +25,7 @@ if [ $# -lt 2 ] || [ "$1" == "-h" ]; then echo " - averaging by month and for target date: \"/binacc.t__leadtime_avg_target_month.png\"" echo " - averaging by day and for target date: \"/binacc.t__leadtime_avg_target_day.png\"" echo "If \"-v\" is passed, a video will be produced to stitch all these plots together and saved in \"/binacc.t_.mp4\"" - echo "* If \"sie\" is included in the requested metrics, will generate all SIE error plots for dates in _.csv" + echo "* If \"sie\" is included in the requested metrics, will generate all SIE error plots for dates in .csv" echo "(these will be saved in the format \"/sie.t_.g_..png\")" echo "If \"-l\" is passed, leadtime averaged plots for SIE error will be generated too:" echo " - averaging over all: \"/sie.t_.g__leadtime_avg_all.png\"" @@ -35,7 +34,7 @@ if [ $# -lt 2 ] || [ "$1" == "-h" ]; then echo " - averaging by month and for target date: \"/sie.t_.g__leadtime_avg_target_month.png\"" echo " - averaging by day and for target date: \"/sie.t_.g__leadtime_avg_target_day.png\"" echo "If \"-v\" is passed, a video will be produced to stitch all these plots together and saved in \"/sie.t_.g_.mp4\"" - echo "* If \"mae\", \"mse\", or \"rmse\" is included in the requested metrics, will generate all MAE, MSE, or RMSE plots for dates in _.csv" + echo "* If \"mae\", \"mse\", or \"rmse\" is included in the requested metrics, will generate all 
MAE, MSE, or RMSE plots for dates in .csv" echo "the names for the plots follow a similar convention as above but without the threshold or grid-area-size being saved in the name..." echo "for instance, for a given , these will be saved in the format \"/..png\"" echo "If \"-l\" is passed, leadtime averaged plots for will be generated too:" @@ -47,7 +46,7 @@ if [ $# -lt 2 ] || [ "$1" == "-h" ]; then echo "Note that if $\"-e\" is passed, all of these will have \"_comp\" after \"avg\"" echo "The plot of the standard deviation of the metric for each forecast will also be generated" echo "If \"-v\" is passed, a video will be produced to stitch all these plots together and saved in \"/.mp4\"" - echo "* If \"sic\" is included in the requested metrics, will generate all SIC error videos for dates in _.csv" + echo "* If \"sic\" is included in the requested metrics, will generate all SIC error videos for dates in .csv" echo "(these will be saved in the format \"/sic..mp4\")" exit 1 fi @@ -106,18 +105,17 @@ shift $((OPTIND-1)) # echo "Leftovers from getopt: $@" -FORECAST="$1" -HEMI="$2" - -FORECAST_NAME=${FORECAST}_${HEMI} +FORECAST_NAME="$1" FORECAST_FILE="results/predict/${FORECAST_NAME}.nc" LOG_PREFIX="logs/${FORECAST_NAME}" -BINACC_LOG="${LOG_PREFIX}_binacc.log" -SIE_LOG="${LOG_PREFIX}_sie.log" -MAE_LOG="${LOG_PREFIX}_mae.log" -MSE_LOG="${LOG_PREFIX}_mse.log" -RMSE_LOG="${LOG_PREFIX}_rmse.log" -SICERR_LOG="${LOG_PREFIX}_sic.log" +BINACC_LOG="${LOG_PREFIX}.binacc.log" +SIE_LOG="${LOG_PREFIX}.sie.log" +MAE_LOG="${LOG_PREFIX}.mae.log" +MSE_LOG="${LOG_PREFIX}.mse.log" +RMSE_LOG="${LOG_PREFIX}.rmse.log" +SICERR_LOG="${LOG_PREFIX}.sic.log" + +GROUND_TRUTH_DS=`jq -r 'first(.sources[]).dataset_config' loader.${FORECAST_NAME}.json` if [ "${REQUESTED_OUTPUT_DIR}" == "" ]; then OUTPUT_DIR="plot/${FORECAST_NAME}" @@ -143,29 +141,29 @@ cat ${FORECAST_NAME}.csv | while read -r FORECAST_DATE; do OUTPUT="${OUTPUT_DIR}/${element}.t_${THRESHOLD:3}.${FORECAST_DATE}.png" echo "Producing 
binary accuracy plot for $FORECAST_DATE (${OUTPUT})" icenet_plot_bin_accuracy -b $E_FLAG -v $REGION -o $OUTPUT $THRESHOLD \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $BINACC_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $BINACC_LOG 2>&1 elif [ "${element}" == "sie" ]; then OUTPUT="${OUTPUT_DIR}/${element}.t_${THRESHOLD:3}.ga_${GRID_AREA_SIZE:4}.${FORECAST_DATE}.png" echo "Producing sea ice extent error plot for $FORECAST_DATE (${OUTPUT})" icenet_plot_sie_error -b $E_FLAG -v $REGION -o $OUTPUT $THRESHOLD $GRID_AREA_SIZE \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $SIE_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $SIE_LOG 2>&1 elif [ "${element}" == "mae" ]; then echo "Producing MAE plot for $FORECAST_DATE (${OUTPUT})" icenet_plot_metrics -b $E_FLAG -v $REGION -m $element -o $OUTPUT \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $MAE_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $MAE_LOG 2>&1 elif [ "${element}" == "mse" ]; then echo "Producing MSE plot for $FORECAST_DATE (${OUTPUT})" icenet_plot_metrics -b $E_FLAG -v $REGION -m $element -o $OUTPUT \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $MSE_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $MSE_LOG 2>&1 elif [ "${element}" == "rmse" ]; then echo "Producing RMSE plot for $FORECAST_DATE (${OUTPUT})" icenet_plot_metrics -b $E_FLAG -v $REGION -m $element -o $OUTPUT \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $RMSE_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $RMSE_LOG 2>&1 elif [ "${element}" == "sic" ]; then OUTPUT="${OUTPUT_DIR}/${element}.${FORECAST_DATE}.mp4" echo "Producing SIC error video for $FORECAST_DATE (${OUTPUT})" icenet_plot_sic_error -v $REGION -o $OUTPUT \ - $HEMI $FORECAST_FILE $FORECAST_DATE >> $SICERR_LOG 2>&1 + $GROUND_TRUTH_DS $FORECAST_FILE $FORECAST_DATE >> $SICERR_LOG 2>&1 fi done done @@ -208,33 +206,33 @@ if [[ "${LEADTIME_AVG}" == true ]]; then echo "Plots produced:" # averaging over all OUTPUT="${OUTPUT_PATH_START}_all.png" 
- icenet_plot_leadtime_avg $HEMI $FORECAST_FILE $REGION \ + icenet_plot_leadtime_avg $GROUND_TRUTH_DS $FORECAST_FILE $REGION \ -m $element -ao "all" -s -sm 1 $E_FLAG $THRESHOLD $GRID_AREA_SIZE \ -dp $DATA_PATH -o $OUTPUT >> $LOGFILE 2>&1 echo "* ${OUTPUT}" ##### initialisation day # averaging over monthly OUTPUT="${OUTPUT_PATH_START}_init_month.png" - icenet_plot_leadtime_avg $HEMI $FORECAST_FILE $REGION \ + icenet_plot_leadtime_avg $GROUND_TRUTH_DS $FORECAST_FILE $REGION \ -m $element -ao "month" -s $E_FLAG $THRESHOLD $GRID_AREA_SIZE \ -dp $DATA_PATH -o $OUTPUT >> $LOGFILE 2>&1 echo "* ${OUTPUT}" # averaging over daily OUTPUT="${OUTPUT_PATH_START}_init_day.png" - icenet_plot_leadtime_avg $HEMI $FORECAST_FILE $REGION \ + icenet_plot_leadtime_avg $GROUND_TRUTH_DS $FORECAST_FILE $REGION \ -m $element -ao "day" -s $E_FLAG $THRESHOLD $GRID_AREA_SIZE \ -dp $DATA_PATH -o $OUTPUT >> $LOGFILE 2>&1 echo "* ${OUTPUT}" ##### target day # averaging over monthly OUTPUT="${OUTPUT_PATH_START}_target_month.png" - icenet_plot_leadtime_avg $HEMI $FORECAST_FILE $REGION \ + icenet_plot_leadtime_avg $GROUND_TRUTH_DS $FORECAST_FILE $REGION \ -m $element -ao "month" -s -td $E_FLAG $THRESHOLD $GRID_AREA_SIZE \ -dp $DATA_PATH -o $OUTPUT >> $LOGFILE 2>&1 echo "* ${OUTPUT}" # averaging over daily OUTPUT="${OUTPUT_PATH_START}_target_day.png" - icenet_plot_leadtime_avg $HEMI $FORECAST_FILE $REGION \ + icenet_plot_leadtime_avg $GROUND_TRUTH_DS $FORECAST_FILE $REGION \ -m $element -ao "day" -s -td $E_FLAG $THRESHOLD $GRID_AREA_SIZE \ -dp $DATA_PATH -o $OUTPUT >> $LOGFILE 2>&1 echo "* ${OUTPUT}" diff --git a/plot_validations.sh b/plot_validations.sh index 17c6d36..fcf56eb 100755 --- a/plot_validations.sh +++ b/plot_validations.sh @@ -2,11 +2,10 @@ source ENVS -if [ $# -lt 2 ] || [ "$1" == "-h" ]; then +if [ $# -lt 1 ] || [ "$1" == "-h" ]; then echo -e "\nUsage $0 " echo -e "\nArguments" echo " name of forecast" - echo " hemisphere to use" echo -e "\nOptions" echo "-m string of metrics separated 
by commas, by default \"binacc,sie,mae,rmse,sic\"" echo "-r region arguments, by default uses full hemisphere" @@ -27,6 +26,7 @@ THRESHOLDS=(0.15, 0.8) GRID_AREA_SIZE="-g 25" REQUESTED_OUTPUT_DIR="" OPTIND=1 + while getopts "m:r:t:g:o:" opt; do case "$opt" in m) METRICS=${OPTARG} ;; @@ -56,9 +56,7 @@ shift $((OPTIND-1)) # echo "Leftovers from getopt: $@" -FORECAST="$1" -HEMI="$2" -FORECAST_NAME=${FORECAST}_${HEMI} +FORECAST_NAME=${1} if [ "${REQUESTED_OUTPUT_DIR}" == "" ]; then OUTPUT_DIR="plot/validation/${FORECAST_NAME}" @@ -75,20 +73,20 @@ for element in "${METRICS[@]}" if [ "${element}" == "binacc" ]; then for THRESH in ${THRESHOLDS[@]}; do ./plot_forecast.sh -m ${element} $REGION -v -l -t $THRESH \ - -o $OUTPUT_DIR $FORECAST $HEMI + -o $OUTPUT_DIR $FORECAST_NAME ./plot_forecast.sh -m ${element} $REGION -e -v -l -t $THRESH \ - -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI + -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST_NAME done elif [ "${element}" == "sie" ]; then for THRESH in ${THRESHOLDS[@]}; do ./plot_forecast.sh -m ${element} $REGION -v -l -t $THRESH $GRID_AREA_SIZE \ - -o $OUTPUT_DIR $FORECAST $HEMI + -o $OUTPUT_DIR $FORECAST_NAME ./plot_forecast.sh -m ${element} $REGION -e -v -l -t $THRESH $GRID_AREA_SIZE \ - -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI + -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST_NAME done elif [ "${element}" == "sic" ]; then ./plot_forecast.sh -m ${element} $REGION -v \ - -o $OUTPUT_DIR $FORECAST $HEMI + -o $OUTPUT_DIR $FORECAST_NAME else if [ "${element}" == "mae" ]; then LOGFILE="${MAE_LOG}" @@ -98,8 +96,8 @@ for element in "${METRICS[@]}" LOGFILE="${RMSE_LOG}" fi ./plot_forecast.sh -m ${element} $REGION -v -l \ - -o $OUTPUT_DIR $FORECAST $HEMI + -o $OUTPUT_DIR $FORECAST_NAME ./plot_forecast.sh -m ${element} $REGION -e -v -l \ - -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST $HEMI + -o "${OUTPUT_DIR}/ECMWF_comp" $FORECAST_NAME fi done From 4923cf92f70fce8fc546d5c9f744270c3d10d1c1 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 4 Sep 2024 
11:23:27 +0100 Subject: [PATCH 36/44] Updating template dates --- template_LICENSE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template_LICENSE.md b/template_LICENSE.md index 6154e7c..84828f7 100644 --- a/template_LICENSE.md +++ b/template_LICENSE.md @@ -1 +1 @@ -Unless otherwise stated, all content is © British Antarctic Survey and The Alan Turing Institute 2023 and made available via the [Open Government License](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) which is compatible with the [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/) +Unless otherwise stated, all content is © British Antarctic Survey and The Alan Turing Institute 2024 and made available via the [Open Government License](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) which is compatible with the [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/) From 1e63b0198b4b8f8c06c04f1e41562948e768d376 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Thu, 5 Sep 2024 17:27:01 +0100 Subject: [PATCH 37/44] Validating and sorting out spatial interpolation --- prep_training_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prep_training_data.sh b/prep_training_data.sh index 9de3f77..bb0d3c1 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -56,7 +56,7 @@ preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land "icenet. 
preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC polarhole "icenet.data.masks.osisaf:Masks" preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC active_grid_cell "icenet.data.masks.osisaf:Masks" -preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,active_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC +preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,inactive_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC preprocess_dataset $PROC_ARGS_SIC -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ From 795e9c7dc8bbb3a039845cf1c46148ee1b31dc5c Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 6 Sep 2024 09:34:24 +0100 Subject: [PATCH 38/44] Clearing some comments and TODOs --- prep_prediction_data.sh | 5 +---- prep_training_data.sh | 13 +++++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/prep_prediction_data.sh b/prep_prediction_data.sh index 03564e9..91de790 100755 --- a/prep_prediction_data.sh +++ b/prep_prediction_data.sh @@ -52,6 +52,7 @@ LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" PRED_DATA_START=`date --date "$PREDICTION_START - $LAG ${DATA_FREQUENCY}s" +%Y-%m-%d` # download-toolbox integration ( + # We don't do AMSR2 and CMIP as part of this, but everything is similar if you want to ;) # download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS download_osisaf $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $OSISAF_VAR_ARGS download_era5 $DATA_ARGS $HEMI $PRED_DATA_START $PREDICTION_END $ERA5_VAR_ARGS @@ -71,8 +72,6 @@ ATMOS_PROC_DSC="${PROCESSED_DATA_STORE}/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" # Create links to the central data store datasets for easier "mapping" [ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf [ ! 
-e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 -# TODO: AMSR -# TODO: CMIP LOADER_CONFIGURATION="loader.${PREDICTION_DATASET}.json" TRAIN_LOADER_CONFIGURATION="loader.${TRAIN_DATA_NAME}.${HEMI}.json" @@ -86,7 +85,6 @@ preprocess_dataset $PROC_ARGS_SIC -v \ -i "icenet.data.processors.osisaf:SICPreProcessor" \ -sh $LAG -st $FORECAST_LENGTH \ $OSISAF_DATASET ${PREDICTION_DATASET}_osisaf - # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly if [ ! -f ref.osisaf.${HEMI}.nc ]; then echo "Reference OSISAF for regrid should still be available, bailing for the mo" @@ -106,7 +104,6 @@ preprocess_dataset $PROC_ARGS_ERA5 -v \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ -sh $LAG -st $FORECAST_LENGTH \ $ATMOS_PROC_DSC ${PREDICTION_DATASET}_era5 - # TODO: we inadvertently clone existing datasets which is also unacceptable for predictions - filter data accordingly preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PREDICTION_DATASET}_osisaf.json processed.${PREDICTION_DATASET}_era5.json diff --git a/prep_training_data.sh b/prep_training_data.sh index bb0d3c1..188c39d 100755 --- a/prep_training_data.sh +++ b/prep_training_data.sh @@ -33,8 +33,6 @@ ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" # Create links to the central data store datasets for easier "mapping" [ ! -e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf [ ! 
-e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 -# TODO: AMSR -# TODO: CMIP GROUND_TRUTH_SIC="osi_sic.$TRAIN_DATA_NAME" ATMOS_PROC="era5_osi.$TRAIN_DATA_NAME" @@ -50,13 +48,20 @@ DATASET_NAME="tfdata_${HEMI}" ## Workflow preprocess_loader_init -v $PROCESSED_DATASET -preprocess_missing_time -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC +# We CAN supply splits and lead / lag to prevent unnecessarily large copies of datasets +# or interpolation of time across huge spans +# TODO: temporal interpolation limiting +preprocess_missing_time \ +# -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ +# -sh $LAG -st $FORECAST_LENGTH \ + -n siconca -v $OSISAF_DATASET $GROUND_TRUTH_SIC preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land "icenet.data.masks.osisaf:Masks" preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC polarhole "icenet.data.masks.osisaf:Masks" preprocess_add_mask -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC active_grid_cell "icenet.data.masks.osisaf:Masks" -preprocess_missing_spatial -m processed.masks.${HEMI}.json -mp land,inactive_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC +preprocess_missing_spatial \ + -m processed.masks.${HEMI}.json -mp land,inactive_grid_cell,polarhole -n siconca -v $GROUND_TRUTH_SIC_DSC preprocess_dataset $PROC_ARGS_SIC -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ From 2d3ad5e75bb7b084249bb35f71146d2d4c2eebb3 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 10 Jan 2025 12:35:13 +0000 Subject: [PATCH 39/44] AMSR2 dataset generation now working --- prep_amsr_training_data.sh | 90 +++++++++++++++++++ ...ng_data.sh => prep_osisaf_training_data.sh | 0 2 files changed, 90 insertions(+) create mode 100755 prep_amsr_training_data.sh rename prep_training_data.sh => prep_osisaf_training_data.sh (100%) diff 
--git a/prep_amsr_training_data.sh b/prep_amsr_training_data.sh new file mode 100755 index 0000000..901fe29 --- /dev/null +++ b/prep_amsr_training_data.sh @@ -0,0 +1,90 @@ +HEMI="$1" +DOWNLOAD=${2:-0} + +# download-toolbox integration +# This updates our source +if [ $DOWNLOAD -eq 1 ]; then + download_amsr2 $DATA_ARGS $HEMI $AMSR2_DATES $AMSR2_VAR_ARGS + download_osisaf $DATA_ARGS $HEMI $OSISAF_DATES $OSISAF_VAR_ARGS + download_era5 $DATA_ARGS $HEMI $ERA5_DATES $ERA5_VAR_ARGS + download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS +fi 2>&1 | tee logs/download.training.log + +### TODO: + +DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" + +# preprocess-toolbox integration +# Persistent datasets from the source data store, wherever that is +AMSR2_DATASET="${SOURCE_DATA_STORE}/amsr2_6250/${DATASET_CONFIG_NAME}" +CMIP6_DATASET="${SOURCE_DATA_STORE}/cmip6.MRI-ESM2-0.r1i1p1f1/${DATASET_CONFIG_NAME}" +ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${DATASET_CONFIG_NAME}" +OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" + +# Create links to the central data store datasets for easier "mapping" +[ ! -e data/amsr2_6250 ] && [ -d ${SOURCE_DATA_STORE}/amsr2_6250 ] && ln -s ${SOURCE_DATA_STORE}/amsr2_6250 ./data/amsr2_6250 +[ ! -e data/era5 ] && [ -d ${SOURCE_DATA_STORE}/era5 ] && ln -s ${SOURCE_DATA_STORE}/era5 ./data/era5 +[ ! -e data/cmip6.MRI-ESM2-0.r1i1p1f1 ] && [ -d ${SOURCE_DATA_STORE}/cmip6.MRI-ESM2-0.r1i1p1f1 ] && ln -s ${SOURCE_DATA_STORE}/cmip6.MRI-ESM2-0.r1i1p1f1 ./data/cmip6.MRI-ESM2-0.r1i1p1f1 +[ ! 
-e data/osisaf ] && [ -d ${SOURCE_DATA_STORE}/osisaf ] && ln -s ${SOURCE_DATA_STORE}/osisaf ./data/osisaf + +PROCESSED_DATASET="${TRAIN_DATA_NAME}.${HEMI}" +LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" +DATASET_NAME="tfdata_${HEMI}" + +ATMOS_PROC="era5_amsr.$TRAIN_DATA_NAME" +ATMOS_PROC_DSC="${PROCESSED_DATA_STORE}/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" +GROUND_TRUTH_SIC="amsr2_sic.$TRAIN_DATA_NAME" +GROUND_TRUTH_SIC_DSC="${PROCESSED_DATA_STORE}/${GROUND_TRUTH_SIC}/${DATASET_CONFIG_NAME}" + +### +# Three stage training +# + +## +# Stage #1: CMIP6 ground truth with ERA5 +# + +## +# Stage #2: OSISAF ground truth with ERA5 +# + +## +# Stage #3: AMSR2 ground truth with ERA5 +# + +preprocess_loader_init -v $PROCESSED_DATASET +preprocess_add_mask -v $LOADER_CONFIGURATION $AMSR2_DATASET land "icenet.data.masks.nsidc:Masks" + +preprocess_missing_time -n siconca -v $AMSR2_DATASET $GROUND_TRUTH_SIC + +preprocess_dataset $PROC_ARGS_SIC -v \ + -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ + -i "icenet.data.processors.amsr:AMSR2PreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ + $AMSR2_DATASET ${PROCESSED_DATASET}_amsr + +# IS THIS NEEDED? 
icenet_generate_ref_amsr -v ${PROCESSED_DATA_STORE}/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc +# ln -s data/amsr2_6250/siconca/2014/asi-AMSR2-s6250-20140630-v5.4.nc ref.amsr.${HEMI}.nc + +preprocess_regrid -v $ERA5_DATASET ref.amsr.${HEMI}.nc $ATMOS_PROC + +preprocess_dataset $PROC_ARGS_ERA5 -v \ + -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ + -i "icenet.data.processors.cds:ERA5PreProcessor" \ + -sh $LAG -st $FORECAST_LENGTH \ + $ATMOS_PROC_DSC ${PROCESSED_DATASET}_era5 + +preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_amsr.json processed.${PROCESSED_DATASET}_era5.json + +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icenet.data.meta:SinProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.nsidc:Masks" + +icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME + +FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.train[0]' | tr -d '"'`} +icenet_plot_input -p -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --outputs -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/outputs.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --weights -v dataset_config.${DATASET_NAME}.json $FIRST_DATE ./plot/weights.${HEMI}.${FIRST_DATE}.png + +icenet_dataset_create -v -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $DATASET_NAME diff --git a/prep_training_data.sh b/prep_osisaf_training_data.sh similarity index 100% rename from prep_training_data.sh rename to prep_osisaf_training_data.sh From 11c4b20d11d0c99d3fbf545bc1d36c3819763a98 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 
10 Jan 2025 23:36:55 +0000 Subject: [PATCH 40/44] Restrict the amount of copying on regrid for AMSR --- prep_amsr_training_data.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/prep_amsr_training_data.sh b/prep_amsr_training_data.sh index 901fe29..97fa89d 100755 --- a/prep_amsr_training_data.sh +++ b/prep_amsr_training_data.sh @@ -64,9 +64,11 @@ preprocess_dataset $PROC_ARGS_SIC -v \ $AMSR2_DATASET ${PROCESSED_DATASET}_amsr # IS THIS NEEDED? icenet_generate_ref_amsr -v ${PROCESSED_DATA_STORE}/masks/ice_conc_${HEMI_SHORT}_ease2-250_cdr-v2p0_200001021200.nc -# ln -s data/amsr2_6250/siconca/2014/asi-AMSR2-s6250-20140630-v5.4.nc ref.amsr.${HEMI}.nc +[ ! -f ref.amsr.${HEMI}.nc ] && ln -s data/amsr2_6250/siconca/2014/asi-AMSR2-s6250-20140630-v5.4.nc ref.amsr.${HEMI}.nc -preprocess_regrid -v $ERA5_DATASET ref.amsr.${HEMI}.nc $ATMOS_PROC +preprocess_regrid -v \ + -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ + $ERA5_DATASET ref.amsr.${HEMI}.nc $ATMOS_PROC preprocess_dataset $PROC_ARGS_ERA5 -v \ -ps "train" -sn "train,val,test" -ss "$TRAIN_START,$VAL_START,$TEST_START" -se "$TRAIN_END,$VAL_END,$TEST_END" \ From 8ae686a1e2246bbbb5afe29d224f67d19a54003b Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 10 Jan 2025 23:45:50 +0000 Subject: [PATCH 41/44] Changing name --- prep_amsr_training_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prep_amsr_training_data.sh b/prep_amsr_training_data.sh index 97fa89d..20613d3 100755 --- a/prep_amsr_training_data.sh +++ b/prep_amsr_training_data.sh @@ -29,7 +29,7 @@ OSISAF_DATASET="${SOURCE_DATA_STORE}/osisaf/${DATASET_CONFIG_NAME}" PROCESSED_DATASET="${TRAIN_DATA_NAME}.${HEMI}" LOADER_CONFIGURATION="loader.${PROCESSED_DATASET}.json" -DATASET_NAME="tfdata_${HEMI}" +DATASET_NAME="tfamsr_${HEMI}" ATMOS_PROC="era5_amsr.$TRAIN_DATA_NAME" ATMOS_PROC_DSC="${PROCESSED_DATA_STORE}/${ATMOS_PROC}/${DATASET_CONFIG_NAME}" From 
bf97ad6ed08812e379adae2db33f8cda3e44f1ef Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 10 Jan 2025 23:48:12 +0000 Subject: [PATCH 42/44] ENVS --- prep_amsr_training_data.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prep_amsr_training_data.sh b/prep_amsr_training_data.sh index 20613d3..71989d4 100755 --- a/prep_amsr_training_data.sh +++ b/prep_amsr_training_data.sh @@ -1,6 +1,8 @@ HEMI="$1" DOWNLOAD=${2:-0} +source ENVS + # download-toolbox integration # This updates our source if [ $DOWNLOAD -eq 1 ]; then From 735d96e2b4f61b771513f4a58a2a6ea28d28d672 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Tue, 14 Jan 2025 08:48:33 +0000 Subject: [PATCH 43/44] Adding comments for transfer --- prep_amsr_prediction_data.sh | 75 ++++++++++++++++++++++++++++++++++++ prep_amsr_training_data.sh | 6 ++- 2 files changed, 79 insertions(+), 2 deletions(-) create mode 100755 prep_amsr_prediction_data.sh diff --git a/prep_amsr_prediction_data.sh b/prep_amsr_prediction_data.sh new file mode 100755 index 0000000..4d66381 --- /dev/null +++ b/prep_amsr_prediction_data.sh @@ -0,0 +1,75 @@ +HEMI="$1" +DOWNLOAD=${2:-0} + +source ENVS + +## +# TODO: Usable as is for training, but for prediction we need to restrict this to relevant activities and dates +# ./run_prediction.sh fc.09_12.2024 amsr_6k_6m_120125.south south + +# TODO: assuming monthly? 
+# TODO: shift the FORECAST_START into the past for LAG +export FORECAST_START="2024-09-01" +export FORECAST_END="2024-12-31" +export HEMI=south +export FORECAST_NAME="fc.09_12.2024" + +# download-toolbox integration +# This updates our source +if [ $DOWNLOAD -eq 1 ]; then + download_amsr2 $DATA_ARGS $HEMI $FORECAST_START $FORECAST_END $AMSR2_VAR_ARGS + download_era5 $DATA_ARGS $HEMI $FORECAST_START $FORECAST_END $ERA5_VAR_ARGS +fi 2>&1 | tee logs/download.prediction.log + +SOURCE_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" + +AMSR2_DATASET="${SOURCE_DATA_STORE}/amsr2_6250/${SOURCE_CONFIG_NAME}" +ERA5_DATASET="${SOURCE_DATA_STORE}/era5/${SOURCE_CONFIG_NAME}" +AMSR2_PROCESSED="processed.${TRAIN_DATA_NAME}.${HEMI}_amsr.json" +ERA5_PROCESSED="processed.${TRAIN_DATA_NAME}.${HEMI}_era5.json" + +# preprocess-toolbox integration +# Persistent datasets from the source data store, wherever that is +FORECAST_DATASET="prediction.${FORECAST_NAME}.${HEMI}" +LOADER_CONFIGURATION="loader.${FORECAST_DATASET}.json" + +ATMOS_PROC="${TRAIN_DATA_NAME}.${HEMI}_era5" +ATMOS_PROC_DIR="processed/${ATMOS_PROC}" +GROUND_TRUTH_SIC="${TRAIN_DATA_NAME}.${HEMI}_amsr" +GROUND_TRUTH_SIC_DIR="processed/${GROUND_TRUTH_SIC}" + + +preprocess_loader_init -v $FORECAST_DATASET +preprocess_add_mask -v $LOADER_CONFIGURATION $AMSR2_DATASET land "icenet.data.masks.nsidc:Masks" + +preprocess_dataset $PROC_ARGS_SIC -v \ + -r $GROUND_TRUTH_SIC_DIR \ + -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + -i "icenet.data.processors.amsr:AMSR2PreProcessor" \ + -sh $LAG \ + $AMSR2_DATASET ${FORECAST_NAME}_amsr + +preprocess_regrid -v \ + -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + $ERA5_DATASET ref.amsr.${HEMI}.nc ${FORECAST_NAME}_era5 + +preprocess_dataset $PROC_ARGS_ERA5 -v \ + -r $ATMOS_PROC_DIR \ + -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + -i "icenet.data.processors.cds:ERA5PreProcessor" \ + -sh $LAG \ + 
${PROCESSED_DATA_STORE}/${FORECAST_NAME}_era5/${SOURCE_CONFIG_NAME} ${FORECAST_NAME}_era5 + +preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_amsr.json processed.${PROCESSED_DATASET}_era5.json + +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icenet.data.meta:SinProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.nsidc:Masks" + +icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $FORECAST_DATASET + +FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.test[0]' | tr -d '"'`} +icenet_plot_input -p -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --outputs -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/outputs.${HEMI}.${FIRST_DATE}.png +icenet_plot_input --weights -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/weights.${HEMI}.${FIRST_DATE}.png + diff --git a/prep_amsr_training_data.sh b/prep_amsr_training_data.sh index 71989d4..7dc03c2 100755 --- a/prep_amsr_training_data.sh +++ b/prep_amsr_training_data.sh @@ -3,6 +3,10 @@ DOWNLOAD=${2:-0} source ENVS +## +# TODO: Usable as is for training, but for prediction we need to restrict this to relevant activities and dates +# ./run_prediction.sh amsr_fc.09_12.2024 amsr_6k_6m_120125.south south + # download-toolbox integration # This updates our source if [ $DOWNLOAD -eq 1 ]; then @@ -12,8 +16,6 @@ if [ $DOWNLOAD -eq 1 ]; then download_cmip --source MRI-ESM2-0 --member r1i1p1f1 $DATA_ARGS $HEMI $CMIP6_DATES $CMIP6_VAR_ARGS fi 2>&1 | tee logs/download.training.log -### TODO: - DATASET_CONFIG_NAME="dataset_config.${DATA_FREQUENCY}.hemi.${HEMI}.json" # preprocess-toolbox integration From 
acf37e64c94d5d7504aa9b78b6f44eb30905ff28 Mon Sep 17 00:00:00 2001 From: James Byrne Date: Wed, 15 Jan 2025 16:41:08 +0000 Subject: [PATCH 44/44] Updating for revised split names --- prep_amsr_prediction_data.sh | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/prep_amsr_prediction_data.sh b/prep_amsr_prediction_data.sh index 4d66381..f2d764a 100755 --- a/prep_amsr_prediction_data.sh +++ b/prep_amsr_prediction_data.sh @@ -44,32 +44,30 @@ preprocess_add_mask -v $LOADER_CONFIGURATION $AMSR2_DATASET land "icenet.data.ma preprocess_dataset $PROC_ARGS_SIC -v \ -r $GROUND_TRUTH_SIC_DIR \ - -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + -sn "prediction" -ss "$FORECAST_START" -se "$FORECAST_END" \ -i "icenet.data.processors.amsr:AMSR2PreProcessor" \ -sh $LAG \ $AMSR2_DATASET ${FORECAST_NAME}_amsr preprocess_regrid -v \ - -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + -sn "prediction" -ss "$FORECAST_START" -se "$FORECAST_END" \ $ERA5_DATASET ref.amsr.${HEMI}.nc ${FORECAST_NAME}_era5 preprocess_dataset $PROC_ARGS_ERA5 -v \ -r $ATMOS_PROC_DIR \ - -sn "test" -ss "$FORECAST_START" -se "$FORECAST_END" \ + -sn "prediction" -ss "$FORECAST_START" -se "$FORECAST_END" \ -i "icenet.data.processors.cds:ERA5PreProcessor" \ -sh $LAG \ ${PROCESSED_DATA_STORE}/${FORECAST_NAME}_era5/${SOURCE_CONFIG_NAME} ${FORECAST_NAME}_era5 -preprocess_add_processed -v $LOADER_CONFIGURATION processed.${PROCESSED_DATASET}_amsr.json processed.${PROCESSED_DATASET}_era5.json +preprocess_add_processed -v $LOADER_CONFIGURATION processed.${FORECAST_NAME}_amsr.json processed.${FORECAST_NAME}_era5.json -preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC sin "icenet.data.meta:SinProcessor" -preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC cos "icenet.data.meta:CosProcessor" -preprocess_add_channel -v $LOADER_CONFIGURATION $GROUND_TRUTH_SIC_DSC land_map "icenet.data.masks.nsidc:Masks" +preprocess_add_channel -v 
$LOADER_CONFIGURATION $AMSR2_DATASET sin "icenet.data.meta:SinProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $AMSR2_DATASET cos "icenet.data.meta:CosProcessor" +preprocess_add_channel -v $LOADER_CONFIGURATION $AMSR2_DATASET land_map "icenet.data.masks.nsidc:Masks" icenet_dataset_create -v -c -p -ob $BATCH_SIZE -w $WORKERS -fl $FORECAST_LENGTH $LOADER_CONFIGURATION $FORECAST_DATASET -FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.test[0]' | tr -d '"'`} +FIRST_DATE=${PLOT_DATE:-`cat ${LOADER_CONFIGURATION} | jq '.sources[.sources|keys[0]].splits.prediction[0]' | tr -d '"'`} icenet_plot_input -p -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/input.${HEMI}.${FIRST_DATE}.png -icenet_plot_input --outputs -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/outputs.${HEMI}.${FIRST_DATE}.png -icenet_plot_input --weights -v dataset_config.${FORECAST_DATASET}.json $FIRST_DATE ./plot/weights.${HEMI}.${FIRST_DATE}.png