TMPDIR cleanup and Docs #2

Draft · wants to merge 5 commits into base: main
Changes from 1 commit
52 changes: 51 additions & 1 deletion README.md
@@ -1,5 +1,6 @@
# FLAMESlurmBackend

The FLAME Slurm backend allows you to use FLAME on Slurm HPC clusters.


## Installation
@@ -26,10 +27,46 @@ Configure the flame backend in your configuration or application setup:
```elixir
  max: 10,
  max_concurrency: 5,
  idle_shutdown_after: 30_000,
  log: :debug,
  slurm_job: """
  #!/bin/bash
  #SBATCH -o flame.%j.out
  #SBATCH --nodes=1
  #SBATCH --ntasks-per-node=1
  #SBATCH --time=01:00:00

  export SLURM_FLAME_HOST=$(ip -f inet addr show ib0 | awk '/inet/ {print $2}' | cut -d/ -f1)
  """
  }
]
```

This part of the job definition defines the host part of the FLAME child BEAM.
In this case the IP of the InfiniBand interface is used, so that the BEAM distribution protocol
communicates over the InfiniBand IP interface, allowing low-latency, high-bandwidth communication.

```bash
export SLURM_FLAME_HOST=$(ip -f inet addr show ib0 | awk '/inet/ {print $2}' | cut -d/ -f1)
```
You need to start the parent BEAM with the same configuration.

Example Livebook start on a CUDA 12.5 node with cuDNN in `$HOME` for the Nx EXLA backend:
```bash
#!/bin/bash
export CUDA=/usr/local/cuda-12.5/
export CUDNN=$HOME/cudnn-linux-x86_64-9.5.0.50_cuda12-archive/
export PATH=$PATH:$CUDA/bin
export CPATH=$CPATH:$CUDNN/include:$CUDA/include
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN/lib
export MIX_INSTALL_DIR=$WORK/mix-cache
export LB_HF_TOKEN=hf_*****

export SLURM_FLAME_HOST=$(ip -f inet addr show ib0 | awk '/inet/ {print $2}' | cut -d/ -f1)

export BUMBLEBEE_CACHE_DIR=$WORK/bumblebee/
epmd -daemon
LIVEBOOK_IP=0.0.0.0 LIVEBOOK_PASSWORD=***** MIX_ENV=prod livebook server --name livebook@$SLURM_FLAME_HOST
```

## Prerequisites

The FLAME parent and the Slurm cluster need to be able to connect via Erlang RPC.
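
A quick way to sanity-check basic reachability from a compute node is sketched below. The address 10.0.0.1 is a placeholder for the parent's `SLURM_FLAME_HOST`; 4369 is the default epmd port. Note that the actual distribution connections also use dynamically chosen ports unless restricted via the `inet_dist_listen_min`/`inet_dist_listen_max` kernel parameters.

```bash
# Sketch: run on a compute node to check that the parent host is reachable.
# 10.0.0.1 is a placeholder for the parent's SLURM_FLAME_HOST address.
nc -z 10.0.0.1 4369 && echo "epmd on the parent is reachable"

# List the node names registered with the local epmd
epmd -names
```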
@@ -48,5 +85,18 @@ This job will then be scheduled once resources are available.
If it is not scheduled within the timeout, the job is canceled so that it does not block resources that are no longer needed.
To run the runner with the correct environment, a cluster-specific bash file needs to be created, for example:
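
A minimal sketch of such a file, modeled on the Livebook example above; the file name `flame_env.sh`, the CUDA/cuDNN paths, and the versions are placeholders for your cluster:

```bash
#!/bin/bash
# flame_env.sh - cluster-specific environment for the FLAME runner
# (placeholders; adapt to your cluster's module system and paths)
export CUDA=/usr/local/cuda-12.5/
export CUDNN=$HOME/cudnn-linux-x86_64-9.5.0.50_cuda12-archive/
export PATH=$PATH:$CUDA/bin
export CPATH=$CPATH:$CUDNN/include:$CUDA/include
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN/lib

# Use the InfiniBand IP for BEAM distribution, as described above
export SLURM_FLAME_HOST=$(ip -f inet addr show ib0 | awk '/inet/ {print $2}' | cut -d/ -f1)
```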

## Cleanup

On some clusters `TMPDIR` is cleaned per job, on others it is not.
This is why TMPDIR is changed to one based on the Job ID and deleted after SIGUSR1 is send by SLURM this is Configured by default in Flame SLurm to 30 Seconds before kill.
josevalim (Contributor) commented on Nov 26, 2024:
Suggested change
This is why TMPDIR is changed to one based on the Job ID and deleted after SIGUSR1 is send by SLURM this is Configured by default in Flame SLurm to 30 Seconds before kill.
This is why `TMPDIR` is changed to one based on the Job ID, which is automatically cleared when the Job terminates. More precisely, Slurm is automatically configured to send a SIGUSR1 signal to FLAME 30 seconds before it terminates the Job, which then deletes the directory.
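
For illustration only, the sketch below shows how Slurm's `--signal=B:SIGUSR1@30` option behaves in a plain batch script with a `trap` handler. FLAME itself does not use a trap; the signal reaches the BEAM, which shuts down so that the cleanup line at the end of the generated script can run.

```bash
#!/bin/bash
#SBATCH --time=00:05:00
#SBATCH --signal=B:SIGUSR1@30   # signal the batch shell (B:) ~30 seconds before the time limit

cleanup() {
  echo "Received SIGUSR1, removing per-job scratch directory"
  rm -rf "$TMPDIR/$SLURM_JOB_ID"
  exit 0
}
trap cleanup SIGUSR1

sleep infinity &   # stand-in for the long-running work
wait               # wait returns when the signal arrives, so the trap can fire
```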


## Long running Jobs

If your job's time limit is reached, Slurm will kill the job together with the FLAME runner.
There is no mechanism in place to avoid using a runner whose time is about to run out.

This would be a well-appreciated contribution!

Be aware that you might lose data because of this.
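
As a possible starting point for such a contribution, the sketch below queries how much walltime the current job has left (`%L` is squeue's "time left" field); the threshold and what to do when it is exceeded are up to the implementation:

```bash
# Sketch: print the remaining walltime of the current job as [days-]HH:MM:SS
remaining=$(squeue -h -j "$SLURM_JOB_ID" -o %L)
echo "Job $SLURM_JOB_ID has $remaining of walltime left"
```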

## Troubleshooting
18 changes: 15 additions & 3 deletions lib/flame_slurm_backend/slurm_client.ex
@@ -60,10 +60,22 @@ defmodule FLAMESlurmBackend.SlurmClient do
```elixir
file = File.open!("flame_auto.sh", [:write])

IO.puts(file, slurm_job)
# Ask SLURM to send SIGUSR1 30 seconds before the kill to shut down the BEAM.
# Also add a per-job folder inside TMPDIR and point $TMPDIR at it,
# so that it can be deleted after the job.
IO.puts(file, """
#SBATCH --signal=B:SIGUSR1@30
mkdir $TMPDIR/$SLURMJOBID
export TMPDIR=$TMPDIR/$SLURMJOBID
""")
IO.puts(file, """
elixir -e "$ELIXIR_SLURM_SCRIPT"
""")
# Remove the TMP folder.
# This should run because SLURM sends SIGUSR1 first, letting the BEAM shut down
# and the script continue past the elixir invocation.
IO.puts(file, """
rm -rf $TMPDIR/$SLURMJOBID
""")
File.close(file)
System.cmd("chmod", ["+x", "flame_auto.sh"])
```
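
For reference, assuming the `slurm_job` string from the README example above, the generated `flame_auto.sh` would look roughly like this (`$ELIXIR_SLURM_SCRIPT` is expected to be set in the job's environment):

```bash
#!/bin/bash
#SBATCH -o flame.%j.out
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --time=01:00:00

export SLURM_FLAME_HOST=$(ip -f inet addr show ib0 | awk '/inet/ {print $2}' | cut -d/ -f1)
#SBATCH --signal=B:SIGUSR1@30
mkdir $TMPDIR/$SLURMJOBID
export TMPDIR=$TMPDIR/$SLURMJOBID
elixir -e "$ELIXIR_SLURM_SCRIPT"
rm -rf $TMPDIR/$SLURMJOBID
```

The final `rm -rf` line runs once the BEAM has shut down in response to SIGUSR1, removing the per-job scratch directory.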
