-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: support try-me endpoints in Nomad (#59)
* WIP * feat: add `get_deployment` * feat: limit job duration * feat: launch UI in a separate container * feat: make try-me deployments authenticated * feat: adapt try-me job to new federated cluster * feat: add additional safeguards for resource usage * fix: overwrite main endpoint * fix: use `latest` for `deepaas_ui` Docker image * fix: fix limit to 2 try-me deployments * fix: increase module resources * fix: avoid restarting module if download failure * docs: add message to my future self on why try-me might break * feat: deploy Nomad jobs only in `tryme` nodes This is done because we want the Nomad jobs to launch very fast (smooth try experience), so we have created specific `tryme` nodes where the Docker images are being pulled continuously in the background
- Loading branch information
1 parent
d8a3161
commit de9b373
Showing
7 changed files
with
311 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
import fastapi | ||
|
||
from . import nomad | ||
|
||
|
||
# Top-level router of the try-me API: the Nomad sub-router is mounted
# under the `/try_me` prefix.
app = fastapi.APIRouter()
app.include_router(nomad.router, prefix='/try_me')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
from copy import deepcopy | ||
import uuid | ||
|
||
from fastapi import APIRouter, Depends, HTTPException | ||
from fastapi.security import HTTPBearer | ||
|
||
from ai4papi import auth | ||
import ai4papi.conf as papiconf | ||
from ai4papi.routers.v1.catalog.modules import Modules | ||
from ai4papi.routers.v1.stats.deployments import get_cluster_stats | ||
import ai4papi.nomad.common as nomad | ||
|
||
|
||
# Bearer-token scheme, injected as a dependency into the endpoints of this module
security = HTTPBearer()

# Router holding the Nomad-backed try-me endpoints
router = APIRouter(
    prefix="/nomad",
    tags=["Nomad trials"],
    responses={404: {"description": "Not found"}},
)
|
||
|
||
@router.post("/")
def create_deployment(
    module_name: str,
    authorization=Depends(security),
):
    """
    Submit a try-me deployment to Nomad.

    The deployment will automatically kill itself after a short amount of time.
    The caller must be authenticated (bearer token); the job is registered under
    the authenticated user, who can run at most two try-me jobs at the same time.
    Jobs are always deployed in the AI4EOSC namespace.

    Parameters:
    * **module_name**: name of the catalog module to try

    Returns the response of `nomad.create_deployment` for the submitted job.

    Raises a 503 HTTPException when the `tryme` nodes have no spare resources
    (or no capacity is reported at all), or when the user already runs two
    try-me jobs.
    """
    # Retrieve authenticated user info
    auth_info = auth.get_user_info(token=authorization.credentials)

    # Retrieve docker_image from module_name
    meta = Modules.get_metadata(module_name)
    docker_image = meta['sources']['docker_registry_repo']

    # Load module configuration
    nomad_conf = deepcopy(papiconf.TRY_ME['nomad'])

    # Generate UUID from (MAC address+timestamp) so it's unique
    job_uuid = uuid.uuid1()

    # Replace the Nomad job template
    nomad_conf = nomad_conf.safe_substitute(
        {
            'JOB_UUID': job_uuid,
            'NAMESPACE': 'ai4eosc',  # (!) try-me jobs are always deployed in "ai4eosc"
            'OWNER': auth_info['id'],
            'OWNER_NAME': auth_info['name'],
            'OWNER_EMAIL': auth_info['email'],
            'BASE_DOMAIN': papiconf.MAIN_CONF['lb']['domain']['vo.ai4eosc.eu'],  # idem
            'HOSTNAME': job_uuid,
            'DOCKER_IMAGE': docker_image,
        }
    )

    # Convert template to Nomad conf
    nomad_conf = nomad.load_job_conf(nomad_conf)

    # Check that the target node (ie. tag='tryme') resources are available because
    # these jobs cannot be left queueing
    # We check for every resource metric (cpu, disk, ram)
    stats = get_cluster_stats(vo='vo.ai4eosc.eu')
    resources = ['cpu', 'ram', 'disk']
    keys = [f"{i}_used" for i in resources] + [f"{i}_total" for i in resources]
    status = {k: 0 for k in keys}

    # Aggregate usage/capacity over all the "tryme" nodes of every datacenter
    for datacenter in stats['datacenters'].values():
        for node in datacenter['nodes'].values():
            if 'tryme' in node['tags']:
                for k in keys:
                    status[k] += node[k]

    for r in resources:
        total = status[f"{r}_total"]
        # A zero total means that no "tryme" capacity is reported (eg. no node carries
        # the tag): treat it as "no resources" instead of dividing by zero
        if total <= 0 or status[f"{r}_used"] / total > 0.95:
            raise HTTPException(
                status_code=503,
                detail="Sorry, but there seem to be no resources available right "
                "now to test the module. Please try later.",
            )

    # Check that the user hasn't too many "try-me" jobs currently running
    jobs = nomad.get_deployments(
        namespace="ai4eosc",  # (!) try-me jobs are always deployed in "ai4eosc"
        owner=auth_info['id'],
        prefix="try",
    )
    if len(jobs) >= 2:
        raise HTTPException(
            status_code=503,
            detail="Sorry, but you seem to be currently running two `Try-me` environments already. "
            "Before launching a new one, you will need to wait till one of your "
            "existing environments gets automatically deleted (ca. 10 min)."
        )

    # Submit job
    r = nomad.create_deployment(nomad_conf)

    return r
|
||
|
||
@router.get("/{deployment_uuid}")
def get_deployment(
    deployment_uuid: str,
    authorization=Depends(security),
):
    """
    Retrieve the info of a try-me deployment, mainly to obtain its endpoint.

    The endpoint cannot be returned when the job is created, because the final
    endpoint depends on which datacenter the job ends up landing in.

    Parameters:
    * **deployment_uuid**: uuid of deployment to gather info about

    Returns a dict with info
    """
    # Only the id of the authenticated user is needed to look up the job
    owner_id = auth.get_user_info(token=authorization.credentials)['id']

    # (!) try-me jobs are always deployed in "ai4eosc"
    deployment = nomad.get_deployment(
        deployment_uuid=deployment_uuid,
        namespace="ai4eosc",
        owner=owner_id,
        full_info=True,
    )

    # Rewrite main endpoint, otherwise it automatically selects DEEPaaS API
    deployment['main_endpoint'] = 'ui'
    return deployment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
/* | ||
Convention: | ||
----------- | ||
* ${UPPERCASE} are replaced by the user | ||
* ${lowercase} are replaced by Nomad at launchtime | ||
* remaining is default, same for everybody | ||
When replacing user values we use safe_substitute() so that we don't get an error for not | ||
replacing Nomad values | ||
*/ | ||
|
||
job "try-${JOB_UUID}" { | ||
namespace = "${NAMESPACE}" | ||
type = "batch" # try-me jobs should not be redeployed when exit_code=0 | ||
region = "global" | ||
id = "${JOB_UUID}" | ||
priority = "0" # try-me jobs have low priority | ||
|
||
meta { | ||
owner = "${OWNER}" # user-id from OIDC | ||
owner_name = "${OWNER_NAME}" | ||
owner_email = "${OWNER_EMAIL}" | ||
title = "" | ||
description = "" | ||
} | ||
|
||
# Only use nodes that have successfully passed the ai4-nomad_tests (ie. meta.status=ready) | ||
constraint { | ||
attribute = "${meta.status}" | ||
operator = "regexp" | ||
value = "ready" | ||
} | ||
|
||
# Only deploy in nodes serving that namespace (we use metadata instead of node-pools | ||
# because Nomad does not allow a node to belong to several node pools) | ||
constraint { | ||
attribute = "${meta.namespace}" | ||
operator = "regexp" | ||
value = "${NAMESPACE}" | ||
} | ||
|
||
# Force that try-me jobs land in "tryme" nodes (that are the ones that have the docker | ||
# images pre-fetched for fast deployment) | ||
constraint { | ||
attribute = "${meta.tags}" | ||
operator = "regexp" | ||
value = "tryme" | ||
} | ||
|
||
group "usergroup" { | ||
|
||
# Do not try to restart a try-me job if it raised an error (eg. module incompatible | ||
# with Gradio UI) | ||
reschedule { | ||
attempts = 0 | ||
unlimited = false | ||
} | ||
|
||
network { | ||
|
||
port "ui" { | ||
to = 80 # -1 will assign random port | ||
} | ||
port "api" { | ||
to = 5000 # -1 will assign random port | ||
} | ||
} | ||
|
||
service { | ||
name = "${JOB_UUID}-ui" | ||
port = "ui" | ||
tags = [ | ||
"traefik.enable=true", | ||
"traefik.http.routers.${JOB_UUID}-ui.tls=true", | ||
"traefik.http.routers.${JOB_UUID}-ui.rule=Host(`ui-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.ui-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", | ||
] | ||
} | ||
|
||
ephemeral_disk { | ||
size = 300 # MB | ||
} | ||
|
||
task "main" { # DEEPaaS API | ||
|
||
# Run as a prestart task to make sure deepaas has already launched when launching the deepaas UI | ||
lifecycle { | ||
hook = "prestart" | ||
sidecar = true | ||
} | ||
|
||
driver = "docker" | ||
|
||
config { | ||
force_pull = true | ||
image = "${DOCKER_IMAGE}:latest" | ||
command = "deep-start" | ||
args = ["--deepaas"] | ||
ports = ["api"] | ||
shm_size = 1000000000 # 1GB | ||
memory_hard_limit = 2000 # 2GB | ||
} | ||
|
||
# (!) Keep in mind that if a module works locally but isn't working in Nomad, | ||
# the reason is likely that these resources are too low and the module freezes | ||
resources { | ||
cores = 1 | ||
memory = 2000 # 2GB | ||
memory_max = 2000 # 2GB | ||
} | ||
|
||
# Do not try to restart a try-me job if it fails to launch deepaas | ||
# This is usually due to the fact that the Docker image took too long to download | ||
# and failed with error: `Failed to pull ai4oshub/...: context deadline exceeded` | ||
# Restarting in the same node won't fix the connectivity issues | ||
restart { | ||
attempts = 0 | ||
mode = "fail" | ||
} | ||
|
||
} | ||
|
||
task "ui" { # DEEPaaS UI (Gradio) | ||
|
||
driver = "docker" | ||
|
||
config { | ||
force_pull = true | ||
image = "registry.services.ai4os.eu/ai4os/deepaas_ui:latest" | ||
ports = ["ui"] | ||
shm_size = 250000000 # 250MB | ||
memory_hard_limit = 500 # MB | ||
} | ||
|
||
env { | ||
DURATION = "10m" # kill job after 10 mins | ||
UI_PORT = 80 | ||
} | ||
|
||
resources { | ||
cpu = 500 # MHz | ||
memory = 500 # MB | ||
memory_max = 500 # MB | ||
} | ||
|
||
# Do not try to restart a try-me job if it raises error (module incompatible with Gradio UI) | ||
restart { | ||
attempts = 0 | ||
mode = "fail" | ||
} | ||
|
||
} | ||
|
||
} | ||
} |