From de70079608d4d1a7410a67f10eac0dd5d1cebd6e Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Wed, 26 Jun 2024 11:55:33 +0200 Subject: [PATCH 01/14] WIP --- README.md | 3 + ai4papi/conf.py | 6 ++ ai4papi/routers/v1/__init__.py | 3 +- ai4papi/routers/v1/try_me/__init__.py | 10 +++ ai4papi/routers/v1/try_me/nomad.py | 73 ++++++++++++++++++++++ etc/try_me/nomad.hcl | 90 +++++++++++++++++++++++++++ 6 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 ai4papi/routers/v1/try_me/__init__.py create mode 100644 ai4papi/routers/v1/try_me/nomad.py create mode 100644 etc/try_me/nomad.hcl diff --git a/README.md b/README.md index 1687f03..cc8d1c1 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,9 @@ More details can be found in the [API docs](https://api.cloud.ai4eosc.eu/docs). **Notes**: The catalog caches results for up to 6 hours to improve UX (see [doctring](./ai4papi/routers/v1/modules.py)). +* `/v1/try_me/`: + endpoint where anyone can deploy a short-lived container to try a module + * `/v1/deployments/`: (🔒) deploy modules/tools in the platform to perform trainings diff --git a/ai4papi/conf.py b/ai4papi/conf.py index b4d784c..512d30b 100644 --- a/ai4papi/conf.py +++ b/ai4papi/conf.py @@ -79,3 +79,9 @@ def load_yaml_conf(fpath): 'values': yml[1], } } + +# Try-me endpoints +nmd = load_nomad_job(paths['conf'] / 'try_me' / 'nomad.hcl') +TRY_ME = { + 'nomad': nmd, +} diff --git a/ai4papi/routers/v1/__init__.py b/ai4papi/routers/v1/__init__.py index 6bfcfb7..0071451 100644 --- a/ai4papi/routers/v1/__init__.py +++ b/ai4papi/routers/v1/__init__.py @@ -1,11 +1,12 @@ import fastapi -from . import catalog, deployments, secrets +from . 
import catalog, deployments, secrets, try_me app = fastapi.APIRouter() app.include_router(catalog.app) app.include_router(deployments.app) app.include_router(secrets.router) +app.include_router(try_me.app) @app.get( diff --git a/ai4papi/routers/v1/try_me/__init__.py b/ai4papi/routers/v1/try_me/__init__.py new file mode 100644 index 0000000..a86c86b --- /dev/null +++ b/ai4papi/routers/v1/try_me/__init__.py @@ -0,0 +1,10 @@ +import fastapi + +from . import nomad + + +app = fastapi.APIRouter() +app.include_router( + router=nomad.router, + prefix='/try_me', + ) diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py new file mode 100644 index 0000000..cf5b028 --- /dev/null +++ b/ai4papi/routers/v1/try_me/nomad.py @@ -0,0 +1,73 @@ +from copy import deepcopy +import uuid + +from fastapi import APIRouter +from fastapi.security import HTTPBearer + +from ai4papi import utils +import ai4papi.conf as papiconf +from ai4papi.routers.v1.catalog.modules import Modules +import ai4papi.nomad.common as nomad + + +router = APIRouter( + prefix="/nomad", + tags=["Nomad trials"], + responses={404: {"description": "Not found"}}, +) +security = HTTPBearer() + + +@router.post("/") +def create_deployment( + module_name: str, + ): + """ + Submit a try-me deployment to Nomad. + The deployment will automatically kill himself after a short amount of time. + + This endpoint is meant to be public for everyone to try (no authorization required). + We deploy jobs by default in the AI4EOSC namespace. + + Returns a string with the endpoint to access the API. 
+ """ + # Retrieve docker_image from module_name + meta = Modules.get_metadata(module_name) + docker_image = meta['sources']['docker_registry_repo'] + # docker_image = "deephdc/image-classification-tf" # todo: remove + + # Load module configuration + nomad_conf = deepcopy(papiconf.TRY_ME['nomad']) + + # Generate UUID from (MAC address+timestamp) so it's unique + job_uuid = uuid.uuid1() + + # Generate a domain for user-app and check nothing is running there + domain = utils.generate_domain( + hostname='', + base_domain=papiconf.MAIN_CONF['lb']['domain']['vo.ai4eosc.eu'], + job_uuid=job_uuid, + ) + utils.check_domain(domain) + + # Replace the Nomad job template + nomad_conf = nomad_conf.safe_substitute( + { + 'JOB_UUID': job_uuid, + 'DOMAIN': domain, + 'DOCKER_IMAGE': docker_image, + } + ) + + # Convert template to Nomad conf + nomad_conf = nomad.load_job_conf(nomad_conf) + + # Submit job + r = nomad.create_deployment(nomad_conf) + + return r + + +# TODO: implement a get method to retrieve endpoint +# This is implemented in a separate method because we cannot know what is the final +# endpoint before knowing in which datacenter it has landed diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl new file mode 100644 index 0000000..1e0971a --- /dev/null +++ b/etc/try_me/nomad.hcl @@ -0,0 +1,90 @@ +/* +Convention: +----------- +* ${UPPERCASE} are replaced by the user +* ${lowercase} are replace by Nomad at launchtime +* remaining is default, same for everybody + +When replacing user values we use safe_substitute() so that ge don't get an error for not +replacing Nomad values +*/ + +job "usertest-${JOB_UUID}" { + namespace = "default" + type = "service" + region = "global" + id = "${JOB_UUID}" + priority = "0" # "Try-me" jobs have low priority + + # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid + # overloading GPU clients with CPU-only jobs. 
+ affinity { + attribute = "${node.unique.name}" + operator = "regexp" + value = "gpu" + weight = -50 # anti-affinity for GPU clients + } + #TODO: *force* CPU for try-me deployments + + # Avoid rescheduling the job on **other** nodes during a network cut + # Command not working due to https://github.com/hashicorp/nomad/issues/16515 + reschedule { + attempts = 0 + unlimited = false + } + + group "usergroup" { + + # Recover the job in the **original** node when the network comes back + # (after a network cut). + # If network cut lasts more than 10 days (240 hrs), job is restarted anyways. + # Do not increase too much this limit because we want to still be able to notice + # when nodes are truly removed from the cluster (not just temporarily lost). + max_client_disconnect = "240h" + + network { + + port "ide" { + to = 8888 # -1 will assign random port + } + + } + + service { + name = "${JOB_UUID}-api" + port = "api" + tags = [ + "traefik.enable=true", + "traefik.http.routers.${JOB_UUID}-api.tls=true", + "traefik.http.routers.${JOB_UUID}-api.rule=Host(`api-${DOMAIN}`, `www.api-${DOMAIN}`)", + ] + } + + ephemeral_disk { + size = 300 # MB + } + + task "usertask" { + // Task configured by the user + + driver = "docker" + + config { + force_pull = true + image = "${DOCKER_IMAGE}:latest" + command = "curl" + args = ["-s", "https://raw.githubusercontent.com/ai4os/deepaas_ui/nomad/nomad.sh", "|", "bash"] + ports = ["ide"] + shm_size = 500000000 # 500MB + memory_hard_limit = 1000 # 1GB + } + + resources { + cores = 1 + memory = 1000 # 1GB + memory_max = 1000 # 1GB + } + } + + } +} From 718c051fe08fd9fd0045d2dcd44138028a762240 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Fri, 28 Jun 2024 18:41:44 +0200 Subject: [PATCH 02/14] feat: add `get_deployment` --- ai4papi/routers/v1/try_me/nomad.py | 25 ++++++++++++++++++--- etc/try_me/nomad.hcl | 36 ++++++++++++++++++++---------- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/ai4papi/routers/v1/try_me/nomad.py 
b/ai4papi/routers/v1/try_me/nomad.py index cf5b028..80cdf14 100644 --- a/ai4papi/routers/v1/try_me/nomad.py +++ b/ai4papi/routers/v1/try_me/nomad.py @@ -68,6 +68,25 @@ def create_deployment( return r -# TODO: implement a get method to retrieve endpoint -# This is implemented in a separate method because we cannot know what is the final -# endpoint before knowing in which datacenter it has landed +@router.get("/{deployment_uuid}") +def get_deployment( + deployment_uuid: str, + ): + """ + This function is used mainly to be able to retrieve the endpoint of the try_me job. + We cannot return the endpoint when creating the job, because the final endpoint will + depend on which datacenter the job ends up landing. + + Parameters: + * **deployment_uuid**: uuid of deployment to gather info about + + Returns a dict with info + """ + job = nomad.get_deployment( + deployment_uuid=deployment_uuid, + namespace="ai4eosc", + owner="", # try-me endpoints have no owner + full_info=True, + ) + + return job diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl index 1e0971a..e4f9700 100644 --- a/etc/try_me/nomad.hcl +++ b/etc/try_me/nomad.hcl @@ -9,12 +9,21 @@ When replacing user values we use safe_substitute() so that ge don't get an erro replacing Nomad values */ -job "usertest-${JOB_UUID}" { - namespace = "default" +job "userjob-${JOB_UUID}" { + namespace = "ai4eosc" # try-me jobs are always deployed in ai4eosc type = "service" region = "global" id = "${JOB_UUID}" - priority = "0" # "Try-me" jobs have low priority + priority = "0" # try-me jobs have low priority + + # Try-me jobs have no owner + meta { + owner = "" + owner_name = "" + owner_email = "" + title = "" + description = "" + } # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid # overloading GPU clients with CPU-only jobs.
@@ -24,7 +33,8 @@ job "usertest-${JOB_UUID}" { value = "gpu" weight = -50 # anti-affinity for GPU clients } - #TODO: *force* CPU for try-me deployments + #TODO: *force* CPU for try-me deployments. + # Wait until we move to federated cluster because this will be easier to implement. # Avoid rescheduling the job on **other** nodes during a network cut # Command not working due to https://github.com/hashicorp/nomad/issues/16515 @@ -44,19 +54,19 @@ job "usertest-${JOB_UUID}" { network { - port "ide" { + port "ui" { to = 8888 # -1 will assign random port } } service { - name = "${JOB_UUID}-api" - port = "api" + name = "${JOB_UUID}-ui" + port = "ui" tags = [ "traefik.enable=true", - "traefik.http.routers.${JOB_UUID}-api.tls=true", - "traefik.http.routers.${JOB_UUID}-api.rule=Host(`api-${DOMAIN}`, `www.api-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-ui.tls=true", + "traefik.http.routers.${JOB_UUID}-ui.rule=Host(`ui-${DOMAIN}`, `www.ui-${DOMAIN}`)", ] } @@ -67,14 +77,16 @@ job "usertest-${JOB_UUID}" { task "usertask" { // Task configured by the user + # TODO: kill after 10 mins and do *not* restart + driver = "docker" config { force_pull = true image = "${DOCKER_IMAGE}:latest" - command = "curl" - args = ["-s", "https://raw.githubusercontent.com/ai4os/deepaas_ui/nomad/nomad.sh", "|", "bash"] - ports = ["ide"] + command = "sh" + args = ["-c", "curl https://raw.githubusercontent.com/ai4os/deepaas_ui/nomad/nomad.sh | bash"] + ports = ["ui"] shm_size = 500000000 # 500MB memory_hard_limit = 1000 # 1GB } From 8d6af651c0d34c690f24b4ac4ac0291cacc7ca51 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Mon, 1 Jul 2024 15:34:25 +0200 Subject: [PATCH 03/14] feat: limit job duration --- etc/try_me/nomad.hcl | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl index e4f9700..39f80e1 100644 --- a/etc/try_me/nomad.hcl +++ b/etc/try_me/nomad.hcl @@ -11,7 +11,7 @@ replacing Nomad values job 
"userjob-${JOB_UUID}" { namespace = "ai4eosc" # try-me jobs are always deployed in ai4eosc - type = "service" + type = "batch" # try-me jobs should not be redeployed when exit_code=0 region = "global" id = "${JOB_UUID}" priority = "0" # try-me jobs have low priority @@ -36,8 +36,7 @@ job "userjob-${JOB_UUID}" { #TODO: *force* CPU for try-me deployments. # Wait until we move to federated cluster because this will be easier to implement. - # Avoid rescheduling the job on **other** nodes during a network cut - # Command not working due to https://github.com/hashicorp/nomad/issues/16515 + # Do not try to restart a try-me job if it raised an error (eg. module incompatible with Gradio UI) reschedule { attempts = 0 unlimited = false @@ -45,13 +44,6 @@ job "userjob-${JOB_UUID}" { group "usergroup" { - # Recover the job in the **original** node when the network comes back - # (after a network cut). - # If network cut lasts more than 10 days (240 hrs), job is restarted anyways. - # Do not increase too much this limit because we want to still be able to notice - # when nodes are truly removed from the cluster (not just temporarily lost). - max_client_disconnect = "240h" - network { port "ui" { @@ -75,9 +67,7 @@ job "userjob-${JOB_UUID}" { } task "usertask" { - // Task configured by the user - - # TODO: kill after 10 mins and do *not* restart + # Task configured by the user driver = "docker" @@ -91,12 +81,23 @@ job "userjob-${JOB_UUID}" { memory_hard_limit = 1000 # 1GB } + env { + DURATION = "10m" # try-me job killed after 10 mins (with exit_code=0) + UI_PORT = 8888 + } + resources { - cores = 1 - memory = 1000 # 1GB + cores = 1 + memory = 1000 # 1GB memory_max = 1000 # 1GB } - } + # Do not try to restart a try-me job if it raised an error (eg. 
module incompatible with Gradio UI) + restart { + attempts = 0 + mode = "fail" + } + + } } } From 8b4438febc8589ef7c21be4946e9a0211457721e Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Fri, 19 Jul 2024 16:13:12 +0200 Subject: [PATCH 04/14] feat: launch UI in a separate container --- etc/try_me/nomad.hcl | 66 ++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl index 39f80e1..9e5652f 100644 --- a/etc/try_me/nomad.hcl +++ b/etc/try_me/nomad.hcl @@ -36,20 +36,21 @@ job "userjob-${JOB_UUID}" { #TODO: *force* CPU for try-me deployments. # Wait until we move to federated cluster because this will be easier to implement. - # Do not try to restart a try-me job if it raised an error (eg. module incompatible with Gradio UI) - reschedule { - attempts = 0 - unlimited = false - } - group "usergroup" { - network { + # Do not try to restart a try-me job if it raised an error (eg. module incompatible with Gradio UI) + reschedule { + attempts = 0 + unlimited = false + } + network { port "ui" { - to = 8888 # -1 will assign random port + to = 80 # -1 will assign random port + } + port "api" { + to = 5000 # -1 will assign random port } - } service { @@ -61,6 +62,7 @@ job "userjob-${JOB_UUID}" { "traefik.http.routers.${JOB_UUID}-ui.rule=Host(`ui-${DOMAIN}`, `www.ui-${DOMAIN}`)", ] } + #TODO: adapt for federated cluster ephemeral_disk { size = 300 # MB @@ -69,35 +71,63 @@ job "userjob-${JOB_UUID}" { task "usertask" { # Task configured by the user + # Run as a prestart task to make sure deepaas has already launched when launching the deepaas UI + lifecycle { + hook = "prestart" + sidecar = true + } + driver = "docker" config { force_pull = true image = "${DOCKER_IMAGE}:latest" - command = "sh" - args = ["-c", "curl https://raw.githubusercontent.com/ai4os/deepaas_ui/nomad/nomad.sh | bash"] - ports = ["ui"] + command = "deep-start" + args = ["--deepaas"] + ports = ["api"] shm_size = 
500000000 # 500MB memory_hard_limit = 1000 # 1GB } - env { - DURATION = "10m" # try-me job killed after 10 mins (with exit_code=0) - UI_PORT = 8888 - } - resources { cores = 1 memory = 1000 # 1GB memory_max = 1000 # 1GB } - # Do not try to restart a try-me job if it raised an error (eg. module incompatible with Gradio UI) + } + + task "ui" { + # DEEPaaS UI + + driver = "docker" + + config { + force_pull = true + image = "registry.services.ai4os.eu/ai4os/deepaas_ui" + ports = ["ui"] + shm_size = 250000000 # 250MB + memory_hard_limit = 500 # MB + } + + env { + DURATION = "10m" # kill job after 10 mins + UI_PORT = 80 + } + + resources { + cpu = 500 # MHz + memory = 500 # MB + memory_max = 500 # MB + } + + # Do not try to restart a try-me job if it raises error (module incompatible with Gradio UI) restart { attempts = 0 mode = "fail" } } + } } From b86c196c76df217750d03b83893811aee09d25be Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Mon, 12 Aug 2024 12:42:12 +0200 Subject: [PATCH 05/14] feat: make try-me deployments authenticated --- ai4papi/routers/v1/try_me/nomad.py | 18 ++++++++++++++---- etc/try_me/nomad.hcl | 7 +++---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py index 80cdf14..d123382 100644 --- a/ai4papi/routers/v1/try_me/nomad.py +++ b/ai4papi/routers/v1/try_me/nomad.py @@ -1,10 +1,10 @@ from copy import deepcopy import uuid -from fastapi import APIRouter +from fastapi import APIRouter, Depends from fastapi.security import HTTPBearer -from ai4papi import utils +from ai4papi import auth import ai4papi.conf as papiconf from ai4papi.routers.v1.catalog.modules import Modules import ai4papi.nomad.common as nomad @@ -21,6 +21,7 @@ @router.post("/") def create_deployment( module_name: str, + authorization=Depends(security), ): """ Submit a try-me deployment to Nomad. @@ -31,6 +32,9 @@ def create_deployment( Returns a string with the endpoint to access the API. 
""" + # Retrieve authenticated user info + auth_info = auth.get_user_info(token=authorization.credentials) + # Retrieve docker_image from module_name meta = Modules.get_metadata(module_name) docker_image = meta['sources']['docker_registry_repo'] @@ -54,7 +58,9 @@ def create_deployment( nomad_conf = nomad_conf.safe_substitute( { 'JOB_UUID': job_uuid, - 'DOMAIN': domain, + 'OWNER': auth_info['id'], + 'OWNER_NAME': auth_info['name'], + 'OWNER_EMAIL': auth_info['email'], 'DOCKER_IMAGE': docker_image, } ) @@ -71,6 +77,7 @@ def create_deployment( @router.get("/{deployment_uuid}") def get_deployment( deployment_uuid: str, + authorization=Depends(security), ): """ This function is used mainly to be able to retrieve the endpoint of the try_me job. @@ -82,10 +89,13 @@ def get_deployment( Returns a dict with info """ + # Retrieve authenticated user info + auth_info = auth.get_user_info(token=authorization.credentials) + job = nomad.get_deployment( deployment_uuid=deployment_uuid, namespace="ai4eosc", - owner="", # try-me endpoints have no owner + owner=auth_info['id'], full_info=True, ) diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl index 9e5652f..f152007 100644 --- a/etc/try_me/nomad.hcl +++ b/etc/try_me/nomad.hcl @@ -16,11 +16,10 @@ job "userjob-${JOB_UUID}" { id = "${JOB_UUID}" priority = "0" # try-me jobs have low priority - # Try-me jobs have no owner meta { - owner = "" - owner_name = "" - owner_email = "" + owner = "${OWNER}" # user-id from OIDC + owner_name = "${OWNER_NAME}" + owner_email = "${OWNER_EMAIL}" title = "" description = "" } From ad3fd390fd7716c07e8b637af8ec2f1d5ac05b72 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Mon, 12 Aug 2024 12:43:34 +0200 Subject: [PATCH 06/14] feat: adapt try-me job to new federated cluster --- ai4papi/routers/v1/try_me/nomad.py | 14 +++----- etc/try_me/nomad.hcl | 52 ++++++++++++++++++++---------- 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/ai4papi/routers/v1/try_me/nomad.py 
b/ai4papi/routers/v1/try_me/nomad.py index d123382..0e4361d 100644 --- a/ai4papi/routers/v1/try_me/nomad.py +++ b/ai4papi/routers/v1/try_me/nomad.py @@ -38,7 +38,6 @@ def create_deployment( # Retrieve docker_image from module_name meta = Modules.get_metadata(module_name) docker_image = meta['sources']['docker_registry_repo'] - # docker_image = "deephdc/image-classification-tf" # todo: remove # Load module configuration nomad_conf = deepcopy(papiconf.TRY_ME['nomad']) @@ -46,21 +45,16 @@ def create_deployment( # Generate UUID from (MAC address+timestamp) so it's unique job_uuid = uuid.uuid1() - # Generate a domain for user-app and check nothing is running there - domain = utils.generate_domain( - hostname='', - base_domain=papiconf.MAIN_CONF['lb']['domain']['vo.ai4eosc.eu'], - job_uuid=job_uuid, - ) - utils.check_domain(domain) - # Replace the Nomad job template nomad_conf = nomad_conf.safe_substitute( { 'JOB_UUID': job_uuid, + 'NAMESPACE': 'ai4eosc', # (!) try-me jobs are always deployed in "ai4eosc" 'OWNER': auth_info['id'], 'OWNER_NAME': auth_info['name'], 'OWNER_EMAIL': auth_info['email'], + 'BASE_DOMAIN': papiconf.MAIN_CONF['lb']['domain']['vo.ai4eosc.eu'], # idem + 'HOSTNAME': job_uuid, 'DOCKER_IMAGE': docker_image, } ) @@ -94,7 +88,7 @@ def get_deployment( job = nomad.get_deployment( deployment_uuid=deployment_uuid, - namespace="ai4eosc", + namespace="ai4eosc", # (!) 
try-me jobs are always deployed in "ai4eosc" owner=auth_info['id'], full_info=True, ) diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl index f152007..924e5b4 100644 --- a/etc/try_me/nomad.hcl +++ b/etc/try_me/nomad.hcl @@ -9,8 +9,8 @@ When replacing user values we use safe_substitute() so that ge don't get an erro replacing Nomad values */ -job "userjob-${JOB_UUID}" { - namespace = "ai4eosc" # try-me jobs are always deployed in ai4eosc +job "try-${JOB_UUID}" { + namespace = "${NAMESPACE}" type = "batch" # try-me jobs should not be redeployed when exit_code=0 region = "global" id = "${JOB_UUID}" @@ -24,26 +24,47 @@ job "userjob-${JOB_UUID}" { description = "" } - # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid - # overloading GPU clients with CPU-only jobs. - affinity { - attribute = "${node.unique.name}" + # Only use nodes that have succesfully passed the ai4-nomad_tests (ie. meta.status=ready) + constraint { + attribute = "${meta.status}" operator = "regexp" - value = "gpu" - weight = -50 # anti-affinity for GPU clients + value = "ready" + } + + # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik) + constraint { + attribute = "${meta.compute}" + operator = "=" + value = "true" + } + + # Only deploy in nodes serving that namespace (we use metadata instead of node-pools + # because Nomad does not allow a node to belong to several node pools) + constraint { + attribute = "${meta.namespace}" + operator = "regexp" + value = "${NAMESPACE}" + } + + # Force that try-me jobs land in CPU-only nodes to avoid impacting the GPU trainings + # of our real users + constraint { + attribute = "${meta.tags}" + operator = "regexp" + value = "cpu" } - #TODO: *force* CPU for try-me deployments. - # Wait until we move to federated cluster because this will be easier to implement. group "usergroup" { - # Do not try to restart a try-me job if it raised an error (eg. 
module incompatible with Gradio UI) + # Do not try to restart a try-me job if it raised an error (eg. module incompatible + # with Gradio UI) reschedule { attempts = 0 unlimited = false } network { + port "ui" { to = 80 # -1 will assign random port } @@ -58,17 +79,15 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-ui.tls=true", - "traefik.http.routers.${JOB_UUID}-ui.rule=Host(`ui-${DOMAIN}`, `www.ui-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-ui.rule=Host(`ui-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.ui-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", ] } - #TODO: adapt for federated cluster ephemeral_disk { size = 300 # MB } - task "usertask" { - # Task configured by the user + task "main" { # DEEPaaS API # Run as a prestart task to make sure deepaas has already launched when launching the deepaas UI lifecycle { @@ -96,8 +115,7 @@ job "userjob-${JOB_UUID}" { } - task "ui" { - # DEEPaaS UI + task "ui" { # DEEPaaS UI (Gradio) driver = "docker" From cc9a4a928362d20707f80a14d7b3eb676ddc2fd0 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Mon, 12 Aug 2024 13:39:31 +0200 Subject: [PATCH 07/14] feat: add additional safeguards for resource usage --- ai4papi/routers/v1/try_me/nomad.py | 37 +++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py index 0e4361d..1a67abb 100644 --- a/ai4papi/routers/v1/try_me/nomad.py +++ b/ai4papi/routers/v1/try_me/nomad.py @@ -1,12 +1,13 @@ from copy import deepcopy import uuid -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, HTTPException from fastapi.security import HTTPBearer from ai4papi import auth import ai4papi.conf as papiconf from ai4papi.routers.v1.catalog.modules import Modules +from ai4papi.routers.v1.stats.deployments import get_cluster_stats import ai4papi.nomad.common as nomad @@ -62,6 +63,40 @@ def create_deployment( # 
Convert template to Nomad conf nomad_conf = nomad.load_job_conf(nomad_conf) + # Check that at least 20% of the candidate node resources (CPU nodes belonging to + # ai4eosc) are free, to avoid impacting too much on our real users. + # We check for every resource metric (cpu, disk, ram) + stats = get_cluster_stats(vo='vo.ai4eosc.eu') + resources = ['cpu', 'ram', 'disk'] + keys = [f"{i}_used" for i in resources] + [f"{i}_total" for i in resources] + status = {k: 0 for k in keys} + + for _, datacenter in stats['datacenters'].items(): + for _, node in datacenter['nodes'].items(): + for k in keys: + status[k] += node[k] + for r in resources: + if status[f"{r}_used"] / status[f"{r}_total"] > 0.8: + raise HTTPException( + status_code=503, + detail="Sorry, but there seem to be no resources available right " \ + "now to test the module. Please try later.", + ) + + # Check that the user hasn't too many "try-me" jobs currently running + jobs = nomad.get_deployments( + namespace="ai4eosc", # (!) try-me jobs are always deployed in "ai4eosc" + owner=auth_info['id'], + prefix="try", + ) + if len(jobs) > 2: + raise HTTPException( + status_code=503, + detail="Sorry, but you seem to be currently running two `Try-me` environments already." \ + "Before launching a new one, you will need to wait till one of your " \ + "existing environments gets automatically deleted (ca. 10 min)." 
+ ) + # Submit job r = nomad.create_deployment(nomad_conf) From 4716bd938c0f98662c20d5e182276bec8e0d11c1 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Tue, 13 Aug 2024 15:05:23 +0200 Subject: [PATCH 08/14] fix: overwrite main endpoint --- ai4papi/routers/v1/try_me/nomad.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py index 1a67abb..5425f1b 100644 --- a/ai4papi/routers/v1/try_me/nomad.py +++ b/ai4papi/routers/v1/try_me/nomad.py @@ -128,4 +128,7 @@ def get_deployment( full_info=True, ) + # Rewrite main endpoint, otherwise it automatically selects DEEPaaS API + job['main_endpoint'] = 'ui' + return job From bc136589d4d44122f52a77813f951a24d8c4b5ac Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Fri, 23 Aug 2024 12:44:14 +0200 Subject: [PATCH 09/14] fix: use `latest` for `deepaas_ui` Docker image --- etc/try_me/nomad.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl index 924e5b4..c977fe8 100644 --- a/etc/try_me/nomad.hcl +++ b/etc/try_me/nomad.hcl @@ -121,7 +121,7 @@ job "try-${JOB_UUID}" { config { force_pull = true - image = "registry.services.ai4os.eu/ai4os/deepaas_ui" + image = "registry.services.ai4os.eu/ai4os/deepaas_ui:latest" ports = ["ui"] shm_size = 250000000 # 250MB memory_hard_limit = 500 # MB From 0659d54ff4f5dd88c5787ff4a795b08a877703d8 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Mon, 26 Aug 2024 12:48:28 +0200 Subject: [PATCH 10/14] fix: fix limit to 2 try-me deployments --- ai4papi/routers/v1/try_me/nomad.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py index 5425f1b..d76dc7f 100644 --- a/ai4papi/routers/v1/try_me/nomad.py +++ b/ai4papi/routers/v1/try_me/nomad.py @@ -89,10 +89,10 @@ def create_deployment( owner=auth_info['id'], prefix="try", ) - if len(jobs) > 2: + if len(jobs) >= 2: raise 
HTTPException( status_code=503, - detail="Sorry, but you seem to be currently running two `Try-me` environments already." \ + detail="Sorry, but you seem to be currently running two `Try-me` environments already. " \ "Before launching a new one, you will need to wait till one of your " \ "existing environments gets automatically deleted (ca. 10 min)." ) From 903615ae6c803b629d186c3e9e36e5206bbc1a68 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Mon, 26 Aug 2024 14:34:54 +0200 Subject: [PATCH 11/14] fix: increase module resources --- etc/try_me/nomad.hcl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl index c977fe8..7a36732 100644 --- a/etc/try_me/nomad.hcl +++ b/etc/try_me/nomad.hcl @@ -103,14 +103,15 @@ job "try-${JOB_UUID}" { command = "deep-start" args = ["--deepaas"] ports = ["api"] - shm_size = 500000000 # 500MB - memory_hard_limit = 1000 # 1GB + shm_size = 1000000000 # 1GB + memory_hard_limit = 2000 # 2GB } resources { cores = 1 - memory = 1000 # 1GB - memory_max = 1000 # 1GB + memory = 2000 # 2GB + memory_max = 2000 # 2GB + } } } From c7e093236291e785f03b1409507dabc9e8509f6e Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Mon, 26 Aug 2024 14:35:25 +0200 Subject: [PATCH 12/14] fix: avoid restarting module if download failure --- etc/try_me/nomad.hcl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl index 7a36732..f938aae 100644 --- a/etc/try_me/nomad.hcl +++ b/etc/try_me/nomad.hcl @@ -112,6 +112,14 @@ job "try-${JOB_UUID}" { memory = 2000 # 2GB memory_max = 2000 # 2GB } + + # Do not try to restart a try-me job if it fails to launch deepaas + # This is usually due to the fact that the Docker image took too long to download + # and failed with error: `Failed to pull ai4oshub/...: context deadline exceeded` + # Restarting in the same node won't fix the connectivity issues + restart { + attempts = 0 + mode = "fail" } } From
1c50aea1d6e9faf530a138944810355a72e1502f Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Tue, 27 Aug 2024 12:35:26 +0200 Subject: [PATCH 13/14] docs: add message to my future self on why try-me might break --- etc/try_me/nomad.hcl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl index f938aae..34ef287 100644 --- a/etc/try_me/nomad.hcl +++ b/etc/try_me/nomad.hcl @@ -107,6 +107,8 @@ job "try-${JOB_UUID}" { memory_hard_limit = 2000 # 2GB } + # (!) Keep in mind that if a module works locally but isn't working in Nomad, + # the reason is likely that these resources are too low and the module freezes resources { cores = 1 memory = 2000 # 2GB From 5325940b1b317071e66755b27481f0cfda05526f Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Thu, 29 Aug 2024 15:00:13 +0200 Subject: [PATCH 14/14] feat: deploy nomad jobs only in `tryme` nodes This is done because we want the Nomad jobs to launch very fast (smooth try experience), so we have created specific `tryme` nodes where the Docker images are being pulled continuously in the background --- ai4papi/routers/v1/stats/deployments.py | 3 ++- ai4papi/routers/v1/try_me/nomad.py | 11 ++++++----- etc/try_me/nomad.hcl | 13 +++---------- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py index 6c579f3..4383b04 100644 --- a/ai4papi/routers/v1/stats/deployments.py +++ b/ai4papi/routers/v1/stats/deployments.py @@ -223,7 +223,7 @@ def get_cluster_stats( for k, v in n_stats.items(): # Ignore keys - if k in ['name', 'namespaces', 'eligibility', 'status']: + if k in ['name', 'namespaces', 'eligibility', 'status', 'tags']: continue # Aggregate nested gpu_models dict @@ -286,6 +286,7 @@ def get_cluster_stats_bg(): n_stats['gpu_models'] = {} n_stats['namespaces'] = node['Meta'].get('namespace', '') n_stats['status'] = node['Meta'].get('status', '') + n_stats['tags'] = node['Meta'].get('tags',
'') if n['NodeResources']['Devices']: for devices in n['NodeResources']['Devices']: diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py index d76dc7f..ef560fb 100644 --- a/ai4papi/routers/v1/try_me/nomad.py +++ b/ai4papi/routers/v1/try_me/nomad.py @@ -63,8 +63,8 @@ def create_deployment( # Convert template to Nomad conf nomad_conf = nomad.load_job_conf(nomad_conf) - # Check that at least 20% of the candidate node resources (CPU nodes belonging to - # ai4eosc) are free, to avoid impacting too much on our real users. + # Check that the target node (ie. tag='tryme') resources are available because + # these jobs cannot be left queueing # We check for every resource metric (cpu, disk, ram) stats = get_cluster_stats(vo='vo.ai4eosc.eu') resources = ['cpu', 'ram', 'disk'] @@ -73,10 +73,11 @@ def create_deployment( for _, datacenter in stats['datacenters'].items(): for _, node in datacenter['nodes'].items(): - for k in keys: - status[k] += node[k] + if 'tryme' in node['tags']: + for k in keys: + status[k] += node[k] for r in resources: - if status[f"{r}_used"] / status[f"{r}_total"] > 0.8: + if status[f"{r}_used"] / status[f"{r}_total"] > 0.95: raise HTTPException( status_code=503, detail="Sorry, but there seem to be no resources available right " \ diff --git a/etc/try_me/nomad.hcl b/etc/try_me/nomad.hcl index 34ef287..d11b580 100644 --- a/etc/try_me/nomad.hcl +++ b/etc/try_me/nomad.hcl @@ -31,13 +31,6 @@ job "try-${JOB_UUID}" { value = "ready" } - # Only launch in compute nodes (to avoid clashing with system jobs, eg. 
Traefik) - constraint { - attribute = "${meta.compute}" - operator = "=" - value = "true" - } - # Only deploy in nodes serving that namespace (we use metadata instead of node-pools # because Nomad does not allow a node to belong to several node pools) constraint { @@ -46,12 +39,12 @@ job "try-${JOB_UUID}" { value = "${NAMESPACE}" } - # Force that try-me jobs land in CPU-only nodes to avoid impacting the GPU trainings - # of our real users + # Force that try-me jobs land in "tryme" nodes (that are the ones that have the docker + # images pre-fetched for fast deployment) constraint { attribute = "${meta.tags}" operator = "regexp" - value = "cpu" + value = "tryme" } group "usergroup" {