Skip to content

Commit

Permalink
feat: allow listing/deleting try-me (#68)
Browse files Browse the repository at this point in the history
* feat: allow listing/deleting try-me

* feat: add titles to try-me deployments
  • Loading branch information
IgnacioHeredia authored Oct 28, 2024
1 parent 17563f5 commit 6fa977a
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 26 deletions.
137 changes: 112 additions & 25 deletions ai4papi/routers/v1/try_me/nomad.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
"""
This route is meant to be public for everyone authenticated to try (no VO membership
required). We deploy jobs by default in the AI4EOSC namespace.
"""

from copy import deepcopy
from typing import Tuple, Union
import types
import uuid

from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.security import HTTPBearer

from ai4papi import auth
Expand All @@ -18,19 +25,101 @@
)
security = HTTPBearer()

# (!) try-me jobs are always deployed in AI4EOSC
VO = "vo.ai4eosc.eu"
NAMESPACE = papiconf.MAIN_CONF['nomad']['namespaces'][VO]

@router.get("/")
def get_deployments(
full_info: bool = Query(default=False),
authorization=Depends(security),
):
"""
Returns a list of all deployments belonging to a user.
Parameters:
* **vo**: Virtual Organizations from where you want to retrieve your deployments.
If no vo is provided, it will retrieve the deployments of all VOs.
* **full_info**: retrieve the full information of each deployment.
Disabled by default, as it will increase latency too much if there are many
deployments.
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)

# Retrieve all jobs in namespace
jobs = nomad.get_deployments(
namespace=NAMESPACE,
owner=auth_info['id'],
prefix='try',
)
user_jobs = []
for j in jobs:
try:
job_info = get_deployment(
deployment_uuid=j['ID'],
full_info=full_info,
authorization=types.SimpleNamespace(
credentials=authorization.credentials # token
),
)
except HTTPException: # not a try-me
continue
except Exception as e: # unexpected error
raise(e)

user_jobs.append(job_info)

# Sort deployments by creation date
seq = [j['submit_time'] for j in user_jobs]
args = sorted(range(len(seq)), key=seq.__getitem__)[::-1]
sorted_jobs = [user_jobs[i] for i in args]

return sorted_jobs


@router.get("/{deployment_uuid}")
def get_deployment(
deployment_uuid: str,
full_info: bool = Query(default=True),
authorization=Depends(security),
):
"""
This function is used mainly to be able to retrieve the endpoint of the try_me job.
We cannot return the endpoint when creating the job, because the final endpoint will
on which datacenter the job ends up landing.
Parameters:
* **deployment_uuid**: uuid of deployment to gather info about
Returns a dict with info
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)

job = nomad.get_deployment(
deployment_uuid=deployment_uuid,
namespace=NAMESPACE,
owner=auth_info['id'],
full_info=full_info,
)

# Rewrite main endpoint, otherwise it automatically selects DEEPaaS API
job['main_endpoint'] = 'ui'

return job


@router.post("")
def create_deployment(
module_name: str,
title: str = Query(default=""),
authorization=Depends(security),
):
"""
Submit a try-me deployment to Nomad.
The deployment will automatically kill himself after a short amount of time.
This endpoint is meant to be public for everyone to try (no authorization required).
We deploy jobs by default in the AI4EOSC namespace.
Returns a string with the endpoint to access the API.
"""
# Retrieve authenticated user info
Expand All @@ -51,11 +140,12 @@ def create_deployment(
nomad_conf = nomad_conf.safe_substitute(
{
'JOB_UUID': job_uuid,
'NAMESPACE': 'ai4eosc', # (!) try-me jobs are always deployed in "ai4eosc"
'NAMESPACE': NAMESPACE,
'TITLE': title[:45],
'OWNER': auth_info['id'],
'OWNER_NAME': auth_info['name'],
'OWNER_EMAIL': auth_info['email'],
'BASE_DOMAIN': papiconf.MAIN_CONF['lb']['domain']['vo.ai4eosc.eu'], # idem
'BASE_DOMAIN': papiconf.MAIN_CONF['lb']['domain'][VO],
'HOSTNAME': job_uuid,
'DOCKER_IMAGE': docker_image,
}
Expand All @@ -67,7 +157,7 @@ def create_deployment(
# Check that the target node (ie. tag='tryme') resources are available because
# these jobs cannot be left queueing
# We check for every resource metric (cpu, disk, ram)
stats = get_cluster_stats(vo='vo.ai4eosc.eu')
stats = get_cluster_stats(vo=VO)
resources = ['cpu', 'ram', 'disk']
keys = [f"{i}_used" for i in resources] + [f"{i}_total" for i in resources]
status = {k: 0 for k in keys}
Expand All @@ -89,16 +179,17 @@ def create_deployment(

# Check that the user hasn't too many "try-me" jobs currently running
jobs = nomad.get_deployments(
namespace="ai4eosc", # (!) try-me jobs are always deployed in "ai4eosc"
namespace=NAMESPACE,
owner=auth_info['id'],
prefix="try",
)
if len(jobs) >= 2:
if len(jobs) >= 3:
raise HTTPException(
status_code=503,
detail="Sorry, but you seem to be currently running two `Try-me` environments already. " \
detail="Sorry, but you seem to be currently running 3 `try-me` environments already. " \
"Before launching a new one, you will need to wait till one of your " \
"existing environments gets automatically deleted (ca. 10 min)."
"existing environments gets automatically deleted (ca. 10 min) or delete it manually " \
"in the Dashboard."
)

# Submit job
Expand All @@ -107,32 +198,28 @@ def create_deployment(
return r


@router.get("/{deployment_uuid}")
def get_deployment(
@router.delete("/{deployment_uuid}")
def delete_deployment(
deployment_uuid: str,
authorization=Depends(security),
):
"""
This function is used mainly to be able to retrieve the endpoint of the try_me job.
We cannot return the endpoint when creating the job, because the final endpoint will
on which datacenter the job ends up landing.
Delete a deployment. Users can only delete their own deployments.
Parameters:
* **deployment_uuid**: uuid of deployment to gather info about
* **vo**: Virtual Organization where your deployment is located
* **deployment_uuid**: uuid of deployment to delete
Returns a dict with info
Returns a dict with status
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)

job = nomad.get_deployment(
# Delete deployment
r = nomad.delete_deployment(
deployment_uuid=deployment_uuid,
namespace="ai4eosc", # (!) try-me jobs are always deployed in "ai4eosc"
namespace=NAMESPACE,
owner=auth_info['id'],
full_info=True,
)

# Rewrite main endpoint, otherwise it automatically selects DEEPaaS API
job['main_endpoint'] = 'ui'

return job
return r
2 changes: 1 addition & 1 deletion etc/try_me/nomad.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ job "try-${JOB_UUID}" {
owner = "${OWNER}" # user-id from OIDC
owner_name = "${OWNER_NAME}"
owner_email = "${OWNER_EMAIL}"
title = ""
title = "${TITLE}"
description = ""
}

Expand Down
1 change: 1 addition & 0 deletions tests/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import catalog.tools
import deployments.modules
import deployments.tools
import try_me.test_nomad
import routes
import test_secrets
import test_stats
Expand Down
69 changes: 69 additions & 0 deletions tests/try_me/test_nomad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import os
import time
from types import SimpleNamespace

from ai4papi.routers.v1.try_me import nomad


# Retrieve EGI token (not generated on the fly in case the are rate limiting issues
# if too many queries)
token = os.getenv('TMP_EGI_TOKEN')
if not token:
raise Exception(
'Please remember to set a token as ENV variable before executing \
the tests! \n\n \
export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \
If running from VScode make sure to launch `code` from that terminal so it can access \
that ENV variable.'
)

# Create deployment
rcreate = nomad.create_deployment(
module_name="ai4os-demo-app",
authorization=SimpleNamespace(
credentials=token
),
)
assert isinstance(rcreate, dict)
assert 'job_ID' in rcreate.keys()

# Retrieve that deployment
rdep = nomad.get_deployment(
deployment_uuid=rcreate['job_ID'],
authorization=SimpleNamespace(
credentials=token
),
)
assert isinstance(rdep, dict)
assert 'job_ID' in rdep.keys()
assert rdep['job_ID']==rcreate['job_ID']

# Retrieve all deployments
rdeps = nomad.get_deployments(
authorization=SimpleNamespace(
credentials=token
),
)
assert isinstance(rdeps, list)
assert any([d['job_ID']==rcreate['job_ID'] for d in rdeps])

# Delete deployment
rdel = nomad.delete_deployment(
deployment_uuid=rcreate['job_ID'],
authorization=SimpleNamespace(
credentials=token
),
)
time.sleep(3) # Nomad takes some time to delete
assert isinstance(rdel, dict)
assert 'status' in rdel.keys()

# Check module no longer exists
rdeps3 = nomad.get_deployments(
authorization=SimpleNamespace(
credentials=token
),
)
assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps3])

print('Try-me (nomad) tests passed!')

0 comments on commit 6fa977a

Please sign in to comment.