From de70079608d4d1a7410a67f10eac0dd5d1cebd6e Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Wed, 26 Jun 2024 11:55:33 +0200
Subject: [PATCH 01/14] WIP

---
 README.md                             |  3 +
 ai4papi/conf.py                       |  6 ++
 ai4papi/routers/v1/__init__.py        |  3 +-
 ai4papi/routers/v1/try_me/__init__.py | 10 +++
 ai4papi/routers/v1/try_me/nomad.py    | 73 ++++++++++++++++++++++
 etc/try_me/nomad.hcl                  | 90 +++++++++++++++++++++++++++
 6 files changed, 184 insertions(+), 1 deletion(-)
 create mode 100644 ai4papi/routers/v1/try_me/__init__.py
 create mode 100644 ai4papi/routers/v1/try_me/nomad.py
 create mode 100644 etc/try_me/nomad.hcl

diff --git a/README.md b/README.md
index 1687f03..cc8d1c1 100644
--- a/README.md
+++ b/README.md
@@ -199,6 +199,9 @@ More details can be found in the [API docs](https://api.cloud.ai4eosc.eu/docs).
   **Notes**: The catalog caches results for up to 6 hours to improve UX (see
   [doctring](./ai4papi/routers/v1/modules.py)).
 
+* `/v1/try_me/`:
+   endpoint where anyone can deploy a short-lived container to try a module
+
 * `/v1/deployments/`: (🔒)
    deploy modules/tools in the platform to perform trainings
 
diff --git a/ai4papi/conf.py b/ai4papi/conf.py
index b4d784c..512d30b 100644
--- a/ai4papi/conf.py
+++ b/ai4papi/conf.py
@@ -79,3 +79,9 @@ def load_yaml_conf(fpath):
             'values': yml[1],
         }
     }
+
+# Try-me endpoints
+nmd = load_nomad_job(paths['conf'] / 'try_me' / 'nomad.hcl')
+TRY_ME = {
+    'nomad': nmd,
+}
diff --git a/ai4papi/routers/v1/__init__.py b/ai4papi/routers/v1/__init__.py
index 6bfcfb7..0071451 100644
--- a/ai4papi/routers/v1/__init__.py
+++ b/ai4papi/routers/v1/__init__.py
@@ -1,11 +1,12 @@
 import fastapi
 
-from . import catalog, deployments, secrets
+from . import catalog, deployments, secrets, try_me
 
 app = fastapi.APIRouter()
 app.include_router(catalog.app)
 app.include_router(deployments.app)
 app.include_router(secrets.router)
+app.include_router(try_me.app)
 
 
 @app.get(
diff --git a/ai4papi/routers/v1/try_me/__init__.py b/ai4papi/routers/v1/try_me/__init__.py
new file mode 100644
index 0000000..a86c86b
--- /dev/null
+++ b/ai4papi/routers/v1/try_me/__init__.py
@@ -0,0 +1,10 @@
+import fastapi
+
+from . import nomad
+
+
+app = fastapi.APIRouter()
+app.include_router(
+    router=nomad.router,
+    prefix='/try_me',
+    )
diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py
new file mode 100644
index 0000000..cf5b028
--- /dev/null
+++ b/ai4papi/routers/v1/try_me/nomad.py
@@ -0,0 +1,73 @@
+from copy import deepcopy
+import uuid
+
+from fastapi import APIRouter
+from fastapi.security import HTTPBearer
+
+from ai4papi import utils
+import ai4papi.conf as papiconf
+from ai4papi.routers.v1.catalog.modules import Modules
+import ai4papi.nomad.common as nomad
+
+
+router = APIRouter(
+    prefix="/nomad",
+    tags=["Nomad trials"],
+    responses={404: {"description": "Not found"}},
+)
+security = HTTPBearer()
+
+
+@router.post("/")
+def create_deployment(
+    module_name: str,
+    ):
+    """
+    Submit a try-me deployment to Nomad.
+    The deployment will automatically kill itself after a short amount of time.
+
+    This endpoint is meant to be public for everyone to try (no authorization required).
+    We deploy jobs by default in the AI4EOSC namespace.
+
+    Returns a string with the endpoint to access the API.
+    """
+    # Retrieve docker_image from module_name
+    meta = Modules.get_metadata(module_name)
+    docker_image = meta['sources']['docker_registry_repo']
+    # docker_image = "deephdc/image-classification-tf"  # todo: remove
+
+    # Load module configuration
+    nomad_conf = deepcopy(papiconf.TRY_ME['nomad'])
+
+    # Generate UUID from (MAC address+timestamp) so it's unique
+    job_uuid = uuid.uuid1()
+
+    # Generate a domain for user-app and check nothing is running there
+    domain = utils.generate_domain(
+        hostname='',
+        base_domain=papiconf.MAIN_CONF['lb']['domain']['vo.ai4eosc.eu'],
+        job_uuid=job_uuid,
+    )
+    utils.check_domain(domain)
+
+    # Replace the Nomad job template
+    nomad_conf = nomad_conf.safe_substitute(
+        {
+            'JOB_UUID': job_uuid,
+            'DOMAIN': domain,
+            'DOCKER_IMAGE': docker_image,
+        }
+    )
+
+    # Convert template to Nomad conf
+    nomad_conf = nomad.load_job_conf(nomad_conf)
+
+    # Submit job
+    r = nomad.create_deployment(nomad_conf)
+
+    return r
+
+
+# TODO: implement a get method to retrieve endpoint
+# This is implemented in a separate method because we cannot know what is the final
+# endpoint before knowing in which datacenter it has landed
diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl
new file mode 100644
index 0000000..1e0971a
--- /dev/null
+++ b/etc/try_me/nomad.hcl
@@ -0,0 +1,90 @@
+/*
+Convention:
+-----------
+* ${UPPERCASE} are replaced by the user
+* ${lowercase} are replaced by Nomad at launchtime
+* remaining is default, same for everybody
+
+When replacing user values we use safe_substitute() so that we don't get an error for not
+replacing Nomad values
+*/
+
+job "usertest-${JOB_UUID}" {
+  namespace = "default"
+  type      = "service"
+  region    = "global"
+  id        = "${JOB_UUID}"
+  priority  = "0"  # "Try-me" jobs have low priority
+
+  # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid
+  # overloading GPU clients with CPU-only jobs.
+  affinity {
+    attribute = "${node.unique.name}"
+    operator  = "regexp"
+    value     = "gpu"
+    weight    = -50  # anti-affinity for GPU clients
+  }
+  #TODO: *force* CPU for try-me deployments
+
+  # Avoid rescheduling the job on **other** nodes during a network cut
+  # Command not working due to https://github.com/hashicorp/nomad/issues/16515
+  reschedule {
+    attempts  = 0
+    unlimited = false
+  }
+
+  group "usergroup" {
+
+    # Recover the job in the **original** node when the network comes back
+    # (after a network cut).
+    # If network cut lasts more than 10 days (240 hrs), job is restarted anyways.
+    # Do not increase too much this limit because we want to still be able to notice
+    # when nodes are truly removed from the cluster (not just temporarily lost).
+    max_client_disconnect = "240h"
+
+    network {
+
+      port "ide" {
+        to = 8888  # -1 will assign random port
+      }
+
+    }
+
+    service {
+      name = "${JOB_UUID}-api"
+      port = "api"
+      tags = [
+        "traefik.enable=true",
+        "traefik.http.routers.${JOB_UUID}-api.tls=true",
+        "traefik.http.routers.${JOB_UUID}-api.rule=Host(`api-${DOMAIN}`, `www.api-${DOMAIN}`)",
+      ]
+    }
+
+    ephemeral_disk {
+      size = 300  # MB
+    }
+
+    task "usertask" {
+      // Task configured by the user
+
+      driver = "docker"
+
+      config {
+        force_pull = true
+        image      = "${DOCKER_IMAGE}:latest"
+        command    = "curl"
+        args       = ["-s", "https://raw.githubusercontent.com/ai4os/deepaas_ui/nomad/nomad.sh", "|", "bash"]
+        ports      = ["ide"]
+        shm_size   = 500000000  # 500MB
+        memory_hard_limit = 1000  # 1GB
+      }
+
+      resources {
+        cores  = 1
+        memory = 1000  # 1GB
+        memory_max = 1000  # 1GB
+      }
+    }
+
+  }
+}

From 718c051fe08fd9fd0045d2dcd44138028a762240 Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Fri, 28 Jun 2024 18:41:44 +0200
Subject: [PATCH 02/14] feat: add `get_deployment`

---
 ai4papi/routers/v1/try_me/nomad.py | 25 ++++++++++++++++++---
 etc/try_me/nomad.hcl               | 36 ++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py
index cf5b028..80cdf14 100644
--- a/ai4papi/routers/v1/try_me/nomad.py
+++ b/ai4papi/routers/v1/try_me/nomad.py
@@ -68,6 +68,25 @@ def create_deployment(
     return r
 
 
-# TODO: implement a get method to retrieve endpoint
-# This is implemented in a separate method because we cannot know what is the final
-# endpoint before knowing in which datacenter it has landed
+@router.get("/{deployment_uuid}")
+def get_deployment(
+    deployment_uuid: str,
+    ):
+    """
+    This function is used mainly to be able to retrieve the endpoint of the try_me job.
+    We cannot return the endpoint when creating the job, because the final endpoint will
+    depend on which datacenter the job ends up landing.
+
+    Parameters:
+    * **deployment_uuid**: uuid of deployment to gather info about
+
+    Returns a dict with info
+    """
+    job = nomad.get_deployment(
+        deployment_uuid=deployment_uuid,
+        namespace="ai4eosc",
+        owner="",  # try-me endpoints have no owner
+        full_info=True,
+    )
+
+    return job
diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl
index 1e0971a..e4f9700 100644
--- a/etc/try_me/nomad.hcl
+++ b/etc/try_me/nomad.hcl
@@ -9,12 +9,21 @@ When replacing user values we use safe_substitute() so that ge don't get an erro
 replacing Nomad values
 */
 
-job "usertest-${JOB_UUID}" {
-  namespace = "default"
+job "userjob-${JOB_UUID}" {
+  namespace = "ai4eosc"     # try-me jobs are always deployed in ai4eosc
   type      = "service"
   region    = "global"
   id        = "${JOB_UUID}"
-  priority  = "0"  # "Try-me" jobs have low priority
+  priority  = "0"           # try-me jobs have low priority
+
+  # Try-me jobs have no owner
+  meta {
+    owner       = ""
+    owner_name  = ""
+    owner_email = ""
+    title       = ""
+    description = ""
+  }
 
   # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid
   # overloading GPU clients with CPU-only jobs.
@@ -24,7 +33,8 @@ job "usertest-${JOB_UUID}" {
     value     = "gpu"
     weight    = -50  # anti-affinity for GPU clients
   }
-  #TODO: *force* CPU for try-me deployments
+  #TODO: *force* CPU for try-me deployments.
+  # Wait until we move to federated cluster because this will be easier to implement.
 
   # Avoid rescheduling the job on **other** nodes during a network cut
   # Command not working due to https://github.com/hashicorp/nomad/issues/16515
@@ -44,19 +54,19 @@ job "usertest-${JOB_UUID}" {
 
     network {
 
-      port "ide" {
+      port "ui" {
         to = 8888  # -1 will assign random port
       }
 
     }
 
     service {
-      name = "${JOB_UUID}-api"
-      port = "api"
+      name = "${JOB_UUID}-ui"
+      port = "ui"
       tags = [
         "traefik.enable=true",
-        "traefik.http.routers.${JOB_UUID}-api.tls=true",
-        "traefik.http.routers.${JOB_UUID}-api.rule=Host(`api-${DOMAIN}`, `www.api-${DOMAIN}`)",
+        "traefik.http.routers.${JOB_UUID}-ui.tls=true",
+        "traefik.http.routers.${JOB_UUID}-ui.rule=Host(`ui-${DOMAIN}`, `www.ui-${DOMAIN}`)",
       ]
     }
 
@@ -67,14 +77,16 @@ job "usertest-${JOB_UUID}" {
     task "usertask" {
       // Task configured by the user
 
+      # TODO: kill after 10 mins and do *not* restart
+
       driver = "docker"
 
       config {
         force_pull = true
         image      = "${DOCKER_IMAGE}:latest"
-        command    = "curl"
-        args       = ["-s", "https://raw.githubusercontent.com/ai4os/deepaas_ui/nomad/nomad.sh", "|", "bash"]
-        ports      = ["ide"]
+        command    = "sh"
+        args       = ["-c", "curl https://raw.githubusercontent.com/ai4os/deepaas_ui/nomad/nomad.sh | bash"]
+        ports      = ["ui"]
         shm_size   = 500000000  # 500MB
         memory_hard_limit = 1000  # 1GB
       }

From 8d6af651c0d34c690f24b4ac4ac0291cacc7ca51 Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Mon, 1 Jul 2024 15:34:25 +0200
Subject: [PATCH 03/14] feat: limit job duration

---
 etc/try_me/nomad.hcl | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl
index e4f9700..39f80e1 100644
--- a/etc/try_me/nomad.hcl
+++ b/etc/try_me/nomad.hcl
@@ -11,7 +11,7 @@ replacing Nomad values
 
 job "userjob-${JOB_UUID}" {
   namespace = "ai4eosc"     # try-me jobs are always deployed in ai4eosc
-  type      = "service"
+  type      = "batch"       # try-me jobs should not be redeployed when exit_code=0
   region    = "global"
   id        = "${JOB_UUID}"
   priority  = "0"           # try-me jobs have low priority
@@ -36,8 +36,7 @@ job "userjob-${JOB_UUID}" {
   #TODO: *force* CPU for try-me deployments.
   # Wait until we move to federated cluster because this will be easier to implement.
 
-  # Avoid rescheduling the job on **other** nodes during a network cut
-  # Command not working due to https://github.com/hashicorp/nomad/issues/16515
+  # Do not try to restart a try-me job if it raised an error (eg. module incompatible with Gradio UI)
   reschedule {
     attempts  = 0
     unlimited = false
@@ -45,13 +44,6 @@ job "userjob-${JOB_UUID}" {
 
   group "usergroup" {
 
-    # Recover the job in the **original** node when the network comes back
-    # (after a network cut).
-    # If network cut lasts more than 10 days (240 hrs), job is restarted anyways.
-    # Do not increase too much this limit because we want to still be able to notice
-    # when nodes are truly removed from the cluster (not just temporarily lost).
-    max_client_disconnect = "240h"
-
     network {
 
       port "ui" {
@@ -75,9 +67,7 @@ job "userjob-${JOB_UUID}" {
     }
 
     task "usertask" {
-      // Task configured by the user
-
-      # TODO: kill after 10 mins and do *not* restart
+      # Task configured by the user
 
       driver = "docker"
 
@@ -91,12 +81,23 @@ job "userjob-${JOB_UUID}" {
         memory_hard_limit = 1000  # 1GB
       }
 
+      env {
+        DURATION = "10m"  # try-me job killed after 10 mins (with exit_code=0)
+        UI_PORT  = 8888
+      }
+
       resources {
-        cores  = 1
-        memory = 1000  # 1GB
+        cores      = 1
+        memory     = 1000  # 1GB
         memory_max = 1000  # 1GB
       }
-    }
 
+      # Do not try to restart a try-me job if it raised an error (eg. module incompatible with Gradio UI)
+      restart {
+        attempts = 0
+        mode     = "fail"
+      }
+
+    }
   }
 }

From 8b4438febc8589ef7c21be4946e9a0211457721e Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Fri, 19 Jul 2024 16:13:12 +0200
Subject: [PATCH 04/14] feat: launch UI in a separate container

---
 etc/try_me/nomad.hcl | 66 ++++++++++++++++++++++++++++++++------------
 1 file changed, 48 insertions(+), 18 deletions(-)

diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl
index 39f80e1..9e5652f 100644
--- a/etc/try_me/nomad.hcl
+++ b/etc/try_me/nomad.hcl
@@ -36,20 +36,21 @@ job "userjob-${JOB_UUID}" {
   #TODO: *force* CPU for try-me deployments.
   # Wait until we move to federated cluster because this will be easier to implement.
 
-  # Do not try to restart a try-me job if it raised an error (eg. module incompatible with Gradio UI)
-  reschedule {
-    attempts  = 0
-    unlimited = false
-  }
-
   group "usergroup" {
 
-    network {
+    # Do not try to restart a try-me job if it raised an error (eg. module incompatible with Gradio UI)
+    reschedule {
+      attempts  = 0
+      unlimited = false
+    }
 
+    network {
       port "ui" {
-        to = 8888  # -1 will assign random port
+        to = 80  # -1 will assign random port
+      }
+      port "api" {
+        to = 5000  # -1 will assign random port
       }
-
     }
 
     service {
@@ -61,6 +62,7 @@ job "userjob-${JOB_UUID}" {
         "traefik.http.routers.${JOB_UUID}-ui.rule=Host(`ui-${DOMAIN}`, `www.ui-${DOMAIN}`)",
       ]
     }
+    #TODO: adapt for federated cluster
 
     ephemeral_disk {
       size = 300  # MB
@@ -69,35 +71,63 @@ job "userjob-${JOB_UUID}" {
     task "usertask" {
       # Task configured by the user
 
+      # Run as a prestart task to make sure deepaas has already launched when launching the deepaas UI
+      lifecycle {
+        hook    = "prestart"
+        sidecar = true
+      }
+
       driver = "docker"
 
       config {
         force_pull = true
         image      = "${DOCKER_IMAGE}:latest"
-        command    = "sh"
-        args       = ["-c", "curl https://raw.githubusercontent.com/ai4os/deepaas_ui/nomad/nomad.sh | bash"]
-        ports      = ["ui"]
+        command    = "deep-start"
+        args       = ["--deepaas"]
+        ports      = ["api"]
         shm_size   = 500000000  # 500MB
         memory_hard_limit = 1000  # 1GB
       }
 
-      env {
-        DURATION = "10m"  # try-me job killed after 10 mins (with exit_code=0)
-        UI_PORT  = 8888
-      }
-
       resources {
         cores      = 1
         memory     = 1000  # 1GB
         memory_max = 1000  # 1GB
       }
 
-      # Do not try to restart a try-me job if it raised an error (eg. module incompatible with Gradio UI)
+    }
+
+    task "ui" {
+      # DEEPaaS UI
+
+      driver = "docker"
+
+      config {
+        force_pull = true
+        image      = "registry.services.ai4os.eu/ai4os/deepaas_ui"
+        ports      = ["ui"]
+        shm_size   = 250000000   # 250MB
+        memory_hard_limit = 500  # MB
+      }
+
+      env {
+        DURATION = "10m"  # kill job after 10 mins
+        UI_PORT  = 80
+      }
+
+      resources {
+        cpu        = 500  # MHz
+        memory     = 500  # MB
+        memory_max = 500  # MB
+      }
+
+      # Do not try to restart a try-me job if it raises error (module incompatible with Gradio UI)
       restart {
         attempts = 0
         mode     = "fail"
       }
 
     }
+
   }
 }

From b86c196c76df217750d03b83893811aee09d25be Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Mon, 12 Aug 2024 12:42:12 +0200
Subject: [PATCH 05/14] feat: make try-me deployments authenticated

---
 ai4papi/routers/v1/try_me/nomad.py | 18 ++++++++++++++----
 etc/try_me/nomad.hcl               |  7 +++----
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py
index 80cdf14..d123382 100644
--- a/ai4papi/routers/v1/try_me/nomad.py
+++ b/ai4papi/routers/v1/try_me/nomad.py
@@ -1,10 +1,10 @@
 from copy import deepcopy
 import uuid
 
-from fastapi import APIRouter
+from fastapi import APIRouter, Depends
 from fastapi.security import HTTPBearer
 
-from ai4papi import utils
+from ai4papi import auth
 import ai4papi.conf as papiconf
 from ai4papi.routers.v1.catalog.modules import Modules
 import ai4papi.nomad.common as nomad
@@ -21,6 +21,7 @@
 @router.post("/")
 def create_deployment(
     module_name: str,
+    authorization=Depends(security),
     ):
     """
     Submit a try-me deployment to Nomad.
@@ -31,6 +32,9 @@ def create_deployment(
 
     Returns a string with the endpoint to access the API.
     """
+    # Retrieve authenticated user info
+    auth_info = auth.get_user_info(token=authorization.credentials)
+
     # Retrieve docker_image from module_name
     meta = Modules.get_metadata(module_name)
     docker_image = meta['sources']['docker_registry_repo']
@@ -54,7 +58,9 @@ def create_deployment(
     nomad_conf = nomad_conf.safe_substitute(
         {
             'JOB_UUID': job_uuid,
-            'DOMAIN': domain,
+            'OWNER': auth_info['id'],
+            'OWNER_NAME': auth_info['name'],
+            'OWNER_EMAIL': auth_info['email'],
             'DOCKER_IMAGE': docker_image,
         }
     )
@@ -71,6 +77,7 @@ def create_deployment(
 @router.get("/{deployment_uuid}")
 def get_deployment(
     deployment_uuid: str,
+    authorization=Depends(security),
     ):
     """
     This function is used mainly to be able to retrieve the endpoint of the try_me job.
@@ -82,10 +89,13 @@ def get_deployment(
 
     Returns a dict with info
     """
+    # Retrieve authenticated user info
+    auth_info = auth.get_user_info(token=authorization.credentials)
+
     job = nomad.get_deployment(
         deployment_uuid=deployment_uuid,
         namespace="ai4eosc",
-        owner="",  # try-me endpoints have no owner
+        owner=auth_info['id'],
         full_info=True,
     )
 
diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl
index 9e5652f..f152007 100644
--- a/etc/try_me/nomad.hcl
+++ b/etc/try_me/nomad.hcl
@@ -16,11 +16,10 @@ job "userjob-${JOB_UUID}" {
   id        = "${JOB_UUID}"
   priority  = "0"           # try-me jobs have low priority
 
-  # Try-me jobs have no owner
   meta {
-    owner       = ""
-    owner_name  = ""
-    owner_email = ""
+    owner       = "${OWNER}"  # user-id from OIDC
+    owner_name  = "${OWNER_NAME}"
+    owner_email = "${OWNER_EMAIL}"
     title       = ""
     description = ""
   }

From ad3fd390fd7716c07e8b637af8ec2f1d5ac05b72 Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Mon, 12 Aug 2024 12:43:34 +0200
Subject: [PATCH 06/14] feat: adapt try-me job to new federated cluster

---
 ai4papi/routers/v1/try_me/nomad.py | 14 +++-----
 etc/try_me/nomad.hcl               | 52 ++++++++++++++++++++----------
 2 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py
index d123382..0e4361d 100644
--- a/ai4papi/routers/v1/try_me/nomad.py
+++ b/ai4papi/routers/v1/try_me/nomad.py
@@ -38,7 +38,6 @@ def create_deployment(
     # Retrieve docker_image from module_name
     meta = Modules.get_metadata(module_name)
     docker_image = meta['sources']['docker_registry_repo']
-    # docker_image = "deephdc/image-classification-tf"  # todo: remove
 
     # Load module configuration
     nomad_conf = deepcopy(papiconf.TRY_ME['nomad'])
@@ -46,21 +45,16 @@ def create_deployment(
     # Generate UUID from (MAC address+timestamp) so it's unique
     job_uuid = uuid.uuid1()
 
-    # Generate a domain for user-app and check nothing is running there
-    domain = utils.generate_domain(
-        hostname='',
-        base_domain=papiconf.MAIN_CONF['lb']['domain']['vo.ai4eosc.eu'],
-        job_uuid=job_uuid,
-    )
-    utils.check_domain(domain)
-
     # Replace the Nomad job template
     nomad_conf = nomad_conf.safe_substitute(
         {
             'JOB_UUID': job_uuid,
+            'NAMESPACE': 'ai4eosc',  # (!) try-me jobs are always deployed in "ai4eosc"
             'OWNER': auth_info['id'],
             'OWNER_NAME': auth_info['name'],
             'OWNER_EMAIL': auth_info['email'],
+            'BASE_DOMAIN': papiconf.MAIN_CONF['lb']['domain']['vo.ai4eosc.eu'],  # idem
+            'HOSTNAME': job_uuid,
             'DOCKER_IMAGE': docker_image,
         }
     )
@@ -94,7 +88,7 @@ def get_deployment(
 
     job = nomad.get_deployment(
         deployment_uuid=deployment_uuid,
-        namespace="ai4eosc",
+        namespace="ai4eosc",  # (!) try-me jobs are always deployed in "ai4eosc"
         owner=auth_info['id'],
         full_info=True,
     )
diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl
index f152007..924e5b4 100644
--- a/etc/try_me/nomad.hcl
+++ b/etc/try_me/nomad.hcl
@@ -9,8 +9,8 @@ When replacing user values we use safe_substitute() so that ge don't get an erro
 replacing Nomad values
 */
 
-job "userjob-${JOB_UUID}" {
-  namespace = "ai4eosc"     # try-me jobs are always deployed in ai4eosc
+job "try-${JOB_UUID}" {
+  namespace = "${NAMESPACE}"
   type      = "batch"       # try-me jobs should not be redeployed when exit_code=0
   region    = "global"
   id        = "${JOB_UUID}"
@@ -24,26 +24,47 @@ job "userjob-${JOB_UUID}" {
     description = ""
   }
 
-  # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid
-  # overloading GPU clients with CPU-only jobs.
-  affinity {
-    attribute = "${node.unique.name}"
+  # Only use nodes that have successfully passed the ai4-nomad_tests (ie. meta.status=ready)
+  constraint {
+    attribute = "${meta.status}"
     operator  = "regexp"
-    value     = "gpu"
-    weight    = -50  # anti-affinity for GPU clients
+    value     = "ready"
+  }
+
+  # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik)
+  constraint {
+    attribute = "${meta.compute}"
+    operator  = "="
+    value     = "true"
+  }
+
+  # Only deploy in nodes serving that namespace (we use metadata instead of node-pools
+  # because Nomad does not allow a node to belong to several node pools)
+  constraint {
+    attribute = "${meta.namespace}"
+    operator  = "regexp"
+    value     = "${NAMESPACE}"
+  }
+
+  # Force that try-me jobs land in CPU-only nodes to avoid impacting the GPU trainings
+  # of our real users
+  constraint {
+    attribute = "${meta.tags}"
+    operator  = "regexp"
+    value     = "cpu"
   }
-  #TODO: *force* CPU for try-me deployments.
-  # Wait until we move to federated cluster because this will be easier to implement.
 
   group "usergroup" {
 
-    # Do not try to restart a try-me job if it raised an error (eg. module incompatible with Gradio UI)
+    # Do not try to restart a try-me job if it raised an error (eg. module incompatible
+    # with Gradio UI)
     reschedule {
       attempts  = 0
       unlimited = false
     }
 
     network {
+
       port "ui" {
         to = 80  # -1 will assign random port
       }
@@ -58,17 +79,15 @@ job "userjob-${JOB_UUID}" {
       tags = [
         "traefik.enable=true",
         "traefik.http.routers.${JOB_UUID}-ui.tls=true",
-        "traefik.http.routers.${JOB_UUID}-ui.rule=Host(`ui-${DOMAIN}`, `www.ui-${DOMAIN}`)",
+        "traefik.http.routers.${JOB_UUID}-ui.rule=Host(`ui-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.ui-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)",
       ]
     }
-    #TODO: adapt for federated cluster
 
     ephemeral_disk {
       size = 300  # MB
     }
 
-    task "usertask" {
-      # Task configured by the user
+    task "main" { # DEEPaaS API
 
       # Run as a prestart task to make sure deepaas has already launched when launching the deepaas UI
       lifecycle {
@@ -96,8 +115,7 @@ job "userjob-${JOB_UUID}" {
 
     }
 
-    task "ui" {
-      # DEEPaaS UI
+    task "ui" { # DEEPaaS UI (Gradio)
 
       driver = "docker"
 

From cc9a4a928362d20707f80a14d7b3eb676ddc2fd0 Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Mon, 12 Aug 2024 13:39:31 +0200
Subject: [PATCH 07/14] feat: add additional safeguards for resource usage

---
 ai4papi/routers/v1/try_me/nomad.py | 37 +++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py
index 0e4361d..1a67abb 100644
--- a/ai4papi/routers/v1/try_me/nomad.py
+++ b/ai4papi/routers/v1/try_me/nomad.py
@@ -1,12 +1,13 @@
 from copy import deepcopy
 import uuid
 
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, HTTPException
 from fastapi.security import HTTPBearer
 
 from ai4papi import auth
 import ai4papi.conf as papiconf
 from ai4papi.routers.v1.catalog.modules import Modules
+from ai4papi.routers.v1.stats.deployments import get_cluster_stats
 import ai4papi.nomad.common as nomad
 
 
@@ -62,6 +63,40 @@ def create_deployment(
     # Convert template to Nomad conf
     nomad_conf = nomad.load_job_conf(nomad_conf)
 
+    # Check that at least 20% of the candidate node resources (CPU nodes belonging to
+    # ai4eosc) are free, to avoid impacting too much on our real users.
+    # We check for every resource metric (cpu, disk, ram)
+    stats = get_cluster_stats(vo='vo.ai4eosc.eu')
+    resources = ['cpu', 'ram', 'disk']
+    keys = [f"{i}_used" for i in resources] + [f"{i}_total" for i in resources]
+    status = {k: 0 for k in keys}
+
+    for _, datacenter  in stats['datacenters'].items():
+        for _, node in datacenter['nodes'].items():
+            for k in keys:
+                status[k] += node[k]
+    for r in resources:
+        if status[f"{r}_used"] / status[f"{r}_total"] > 0.8:
+            raise HTTPException(
+                status_code=503,
+                detail="Sorry, but there seem to be no resources available right " \
+                    "now to test the module. Please try later.",
+                )
+
+    # Check that the user hasn't too many "try-me" jobs currently running
+    jobs = nomad.get_deployments(
+        namespace="ai4eosc",  # (!) try-me jobs are always deployed in "ai4eosc"
+        owner=auth_info['id'],
+        prefix="try",
+    )
+    if len(jobs) > 2:
+        raise HTTPException(
+            status_code=503,
+            detail="Sorry, but you seem to be currently running two `Try-me` environments already." \
+                "Before launching a new one, you will need to wait till one of your " \
+                "existing environments gets automatically deleted (ca. 10 min)."
+            )
+
     # Submit job
     r = nomad.create_deployment(nomad_conf)
 

From 4716bd938c0f98662c20d5e182276bec8e0d11c1 Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Tue, 13 Aug 2024 15:05:23 +0200
Subject: [PATCH 08/14] fix: overwrite main endpoint

---
 ai4papi/routers/v1/try_me/nomad.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py
index 1a67abb..5425f1b 100644
--- a/ai4papi/routers/v1/try_me/nomad.py
+++ b/ai4papi/routers/v1/try_me/nomad.py
@@ -128,4 +128,7 @@ def get_deployment(
         full_info=True,
     )
 
+    # Rewrite main endpoint, otherwise it automatically selects DEEPaaS API
+    job['main_endpoint'] = 'ui'
+
     return job

From bc136589d4d44122f52a77813f951a24d8c4b5ac Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Fri, 23 Aug 2024 12:44:14 +0200
Subject: [PATCH 09/14] fix: use `latest` for `deepaas_ui` Docker image

---
 etc/try_me/nomad.hcl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl
index 924e5b4..c977fe8 100644
--- a/etc/try_me/nomad.hcl
+++ b/etc/try_me/nomad.hcl
@@ -121,7 +121,7 @@ job "try-${JOB_UUID}" {
 
       config {
         force_pull = true
-        image      = "registry.services.ai4os.eu/ai4os/deepaas_ui"
+        image      = "registry.services.ai4os.eu/ai4os/deepaas_ui:latest"
         ports      = ["ui"]
         shm_size   = 250000000   # 250MB
         memory_hard_limit = 500  # MB

From 0659d54ff4f5dd88c5787ff4a795b08a877703d8 Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Mon, 26 Aug 2024 12:48:28 +0200
Subject: [PATCH 10/14] fix: fix limit to 2 try-me deployments

---
 ai4papi/routers/v1/try_me/nomad.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py
index 5425f1b..d76dc7f 100644
--- a/ai4papi/routers/v1/try_me/nomad.py
+++ b/ai4papi/routers/v1/try_me/nomad.py
@@ -89,10 +89,10 @@ def create_deployment(
         owner=auth_info['id'],
         prefix="try",
     )
-    if len(jobs) > 2:
+    if len(jobs) >= 2:
         raise HTTPException(
             status_code=503,
-            detail="Sorry, but you seem to be currently running two `Try-me` environments already." \
+            detail="Sorry, but you seem to be currently running two `Try-me` environments already. " \
                 "Before launching a new one, you will need to wait till one of your " \
                 "existing environments gets automatically deleted (ca. 10 min)."
             )

From 903615ae6c803b629d186c3e9e36e5206bbc1a68 Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Mon, 26 Aug 2024 14:34:54 +0200
Subject: [PATCH 11/14] fix: increase module resources

---
 etc/try_me/nomad.hcl | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl
index c977fe8..7a36732 100644
--- a/etc/try_me/nomad.hcl
+++ b/etc/try_me/nomad.hcl
@@ -103,14 +103,15 @@ job "try-${JOB_UUID}" {
         command    = "deep-start"
         args       = ["--deepaas"]
         ports      = ["api"]
-        shm_size   = 500000000  # 500MB
-        memory_hard_limit = 1000  # 1GB
+        shm_size   = 1000000000  # 1GB
+        memory_hard_limit = 2000  # 2GB
       }
 
       resources {
         cores      = 1
-        memory     = 1000  # 1GB
-        memory_max = 1000  # 1GB
+        memory     = 2000  # 2GB
+        memory_max = 2000  # 2GB
+      }
       }
 
     }

From c7e093236291e785f03b1409507dabc9e8509f6e Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Mon, 26 Aug 2024 14:35:25 +0200
Subject: [PATCH 12/14] fix: avoid restarting module if download failure

---
 etc/try_me/nomad.hcl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl
index 7a36732..f938aae 100644
--- a/etc/try_me/nomad.hcl
+++ b/etc/try_me/nomad.hcl
@@ -112,6 +112,14 @@ job "try-${JOB_UUID}" {
         memory     = 2000  # 2GB
         memory_max = 2000  # 2GB
       }
+
+      # Do not try to restart a try-me job if it fails to launch deepaas
+      # This is usually due to the fact that the Docker image took too long to download
+      # and failed with error: `Failed to pull ai4oshub/...: context deadline exceeded`
+      # Restarting in the same node won't fix the connectivity issues
+      restart {
+        attempts = 0
+        mode     = "fail"
       }
 
     }

From 1c50aea1d6e9faf530a138944810355a72e1502f Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Tue, 27 Aug 2024 12:35:26 +0200
Subject: [PATCH 13/14] docs: add message to my future self on why try-me might
 break

---
 etc/try_me/nomad.hcl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl
index f938aae..34ef287 100644
--- a/etc/try_me/nomad.hcl
+++ b/etc/try_me/nomad.hcl
@@ -107,6 +107,8 @@ job "try-${JOB_UUID}" {
         memory_hard_limit = 2000  # 2GB
       }
 
+      # (!) Keep in mind that if a module works locally but isn't working in Nomad,
+      # the reason is likely that these resources are too low and the module freezes
       resources {
         cores      = 1
         memory     = 2000  # 2GB

From 5325940b1b317071e66755b27481f0cfda05526f Mon Sep 17 00:00:00 2001
From: Ignacio Heredia <iheredia@ifca.unican.es>
Date: Thu, 29 Aug 2024 15:00:13 +0200
Subject: [PATCH 14/14] feat: deploy nomad jobs only in `tryme` nodes

This is done because we want the Nomad jobs to launch very fast (smooth try experience), so we have created specific `tryme` nodes where the Docker images are continuously pulled in the background.
---
 ai4papi/routers/v1/stats/deployments.py |  3 ++-
 ai4papi/routers/v1/try_me/nomad.py      | 11 ++++++-----
 etc/try_me/nomad.hcl                    | 13 +++----------
 3 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py
index 6c579f3..4383b04 100644
--- a/ai4papi/routers/v1/stats/deployments.py
+++ b/ai4papi/routers/v1/stats/deployments.py
@@ -223,7 +223,7 @@ def get_cluster_stats(
             for k, v in n_stats.items():
 
                 # Ignore keys
-                if k in ['name', 'namespaces', 'eligibility', 'status']:
+                if k in ['name', 'namespaces', 'eligibility', 'status', 'tags']:
                     continue
 
                 # Aggregate nested gpu_models dict
@@ -286,6 +286,7 @@ def get_cluster_stats_bg():
         n_stats['gpu_models'] = {}
         n_stats['namespaces'] = node['Meta'].get('namespace', '')
         n_stats['status'] = node['Meta'].get('status', '')
+        n_stats['tags'] = node['Meta'].get('tags', '')
 
         if n['NodeResources']['Devices']:
             for devices in n['NodeResources']['Devices']:
diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py
index d76dc7f..ef560fb 100644
--- a/ai4papi/routers/v1/try_me/nomad.py
+++ b/ai4papi/routers/v1/try_me/nomad.py
@@ -63,8 +63,8 @@ def create_deployment(
     # Convert template to Nomad conf
     nomad_conf = nomad.load_job_conf(nomad_conf)
 
-    # Check that at least 20% of the candidate node resources (CPU nodes belonging to
-    # ai4eosc) are free, to avoid impacting too much on our real users.
+    # Check that the target node (ie. tag='tryme') resources are available because
+    # these jobs cannot be left queueing
     # We check for every resource metric (cpu, disk, ram)
     stats = get_cluster_stats(vo='vo.ai4eosc.eu')
     resources = ['cpu', 'ram', 'disk']
@@ -73,10 +73,11 @@ def create_deployment(
 
     for _, datacenter  in stats['datacenters'].items():
         for _, node in datacenter['nodes'].items():
-            for k in keys:
-                status[k] += node[k]
+            if 'tryme' in node['tags']:
+                for k in keys:
+                    status[k] += node[k]
     for r in resources:
-        if status[f"{r}_used"] / status[f"{r}_total"] > 0.8:
+        if status[f"{r}_used"] / status[f"{r}_total"] > 0.95:
             raise HTTPException(
                 status_code=503,
                 detail="Sorry, but there seem to be no resources available right " \
diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl
index 34ef287..d11b580 100644
--- a/etc/try_me/nomad.hcl
+++ b/etc/try_me/nomad.hcl
@@ -31,13 +31,6 @@ job "try-${JOB_UUID}" {
     value     = "ready"
   }
 
-  # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik)
-  constraint {
-    attribute = "${meta.compute}"
-    operator  = "="
-    value     = "true"
-  }
-
   # Only deploy in nodes serving that namespace (we use metadata instead of node-pools
   # because Nomad does not allow a node to belong to several node pools)
   constraint {
@@ -46,12 +39,12 @@ job "try-${JOB_UUID}" {
     value     = "${NAMESPACE}"
   }
 
-  # Force that try-me jobs land in CPU-only nodes to avoid impacting the GPU trainings
-  # of our real users
+  # Force try-me jobs to land in "tryme" nodes (the ones that have the Docker
+  # images pre-fetched for fast deployment)
   constraint {
     attribute = "${meta.tags}"
     operator  = "regexp"
-    value     = "cpu"
+    value     = "tryme"
   }
 
   group "usergroup" {