Commit

Merge branch 'master' into ai4os-vault-egi-demo
IgnacioHeredia committed Nov 4, 2024
2 parents fc3365f + 111f20f commit 008af70
Showing 13 changed files with 245 additions and 193 deletions.
18 changes: 1 addition & 17 deletions ai4papi/routers/v1/deployments/modules.py
@@ -204,23 +204,7 @@ def create_deployment(
else:
priority = 50

# Remove non-compliant characters from hostname
base_domain = papiconf.MAIN_CONF['lb']['domain'][vo]
hostname = utils.safe_hostname(
hostname=user_conf['general']['hostname'],
job_uuid=job_uuid,
)

#TODO: reenable custom hostname, when we are able to parse all node metadata
# (domain key) to build the true domain
hostname = job_uuid

# # Check the hostname is available in all data-centers
# # (we don't know beforehand where the job will land)
# #TODO: make sure this does not break if the datacenter is unavailable
# #TODO: disallow custom hostname, pain in the ass, slower deploys
# for datacenter in papiconf.MAIN_CONF['nomad']['datacenters']:
# utils.check_domain(f"{hostname}.{datacenter}-{base_domain}")

# Replace the Nomad job template
nomad_conf = nomad_conf.safe_substitute(
@@ -234,7 +218,7 @@ def create_deployment(
'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters
'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters
'BASE_DOMAIN': base_domain,
'HOSTNAME': hostname,
'HOSTNAME': job_uuid,
'DOCKER_IMAGE': user_conf['general']['docker_image'],
'DOCKER_TAG': user_conf['general']['docker_tag'],
'SERVICE': user_conf['general']['service'],
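For reference, the 'HOSTNAME': job_uuid value above is consumed by nomad_conf.safe_substitute(...), which is Python's standard string.Template substitution. A minimal sketch of that mechanism, with a made-up template snippet and illustrative values (the real Nomad job template lives in the PAPI configuration, not here):

from string import Template

# Toy stand-in for the real Nomad job template (illustrative only)
nomad_conf = Template('job "module-${JOB_UUID}" { meta { domain = "${HOSTNAME}.${BASE_DOMAIN}" } }')

# safe_substitute() fills the placeholders it knows and leaves any unknown ${...}
# untouched, instead of raising a KeyError like substitute() would
rendered = nomad_conf.safe_substitute({
    'JOB_UUID': 'a1b2c3d4',        # hypothetical job UUID
    'HOSTNAME': 'a1b2c3d4',        # after this commit, the hostname is simply the job UUID
    'BASE_DOMAIN': 'example.org',  # hypothetical base domain for the VO
})
print(rendered)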
28 changes: 8 additions & 20 deletions ai4papi/routers/v1/deployments/tools.py
@@ -200,24 +200,8 @@ def create_deployment(
else:
priority = 50

# Remove non-compliant characters from hostname
base_domain = papiconf.MAIN_CONF['lb']['domain'][vo]
hostname = utils.safe_hostname(
hostname=user_conf['general']['hostname'],
job_uuid=job_uuid,
)

#TODO: reenable custom hostname, when we are able to parse all node metadata
# (domain key) to build the true domain
hostname = job_uuid

# # Check the hostname is available in all data-centers
# # (we don't know beforehand where the job will land)
# #TODO: make sure this does not break if the datacenter is unavailable
# #TODO: disallow custom hostname, pain in the ass, slower deploys
# for datacenter in papiconf.MAIN_CONF['nomad']['datacenters']:
# utils.check_domain(f"{hostname}.{datacenter}-{base_domain}")


# Create a default secret for the Federated Server
_ = ai4secrets.create_secret(
vo=vo,
@@ -247,7 +231,7 @@ def create_deployment(
'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters
'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters
'BASE_DOMAIN': base_domain,
'HOSTNAME': hostname,
'HOSTNAME': job_uuid,
'DOCKER_IMAGE': user_conf['general']['docker_image'],
'DOCKER_TAG': user_conf['general']['docker_tag'],
'CPU_NUM': user_conf['hardware']['cpu_num'],
@@ -263,8 +247,12 @@
'FEDERATED_MIN_AVAILABLE_CLIENTS': user_conf['configuration']['min_available_clients'],
'FEDERATED_STRATEGY': user_conf['configuration']['strategy'],
'MU_FEDPROX': user_conf['configuration']['mu'],
'FEDAVGM_SERVER_FL' : user_conf['configuration']['momentum'],
'FEDAVGM_SERVER_MOMENTUM': user_conf['configuration']['fl']
'FEDAVGM_SERVER_FL' : user_conf['configuration']['fl'],
'FEDAVGM_SERVER_MOMENTUM': user_conf['configuration']['momentum'],
'DP': user_conf['configuration']['dp'],
'NOISE_MULT': user_conf['configuration']['noise_mult'],
'SAMPLED_CLIENTS': user_conf['configuration']['sampled_clients'],
'CLIP_NORM': user_conf['configuration']['clip_norm']
}
)

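The substitution above maps fields from user_conf['configuration'] onto the Federated Learning server job. A hedged sketch of what that configuration block could look like, limited to the keys visible in this diff (the values are illustrative only, not the API defaults):

# Illustrative only: field names mirror the substitution keys above, values are made up
user_conf = {
    'configuration': {
        'min_available_clients': 2,   # FEDERATED_MIN_AVAILABLE_CLIENTS
        'strategy': 'fedavgm',        # FEDERATED_STRATEGY
        'mu': 1.0,                    # MU_FEDPROX (FedProx proximal term)
        'fl': 0.1,                    # FEDAVGM_SERVER_FL (these two keys were swapped before this fix)
        'momentum': 0.9,              # FEDAVGM_SERVER_MOMENTUM
        'dp': False,                  # DP: enable differential privacy
        'noise_mult': 1.0,            # NOISE_MULT (DP noise multiplier)
        'sampled_clients': 2,         # SAMPLED_CLIENTS (clients sampled per DP round)
        'clip_norm': 1.0,             # CLIP_NORM (DP gradient clipping norm)
    }
}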
6 changes: 3 additions & 3 deletions ai4papi/routers/v1/secrets.py
@@ -138,7 +138,7 @@ def get_secrets(
subpath += '/'

# Retrieve initial level-0 secrets
user_path = f"users/{auth_info['id']}"
user_path = f"users/{auth_info['id']}/{vo}"
try:
r = client.secrets.kv.v1.list_secrets(
path = user_path + subpath,
@@ -201,7 +201,7 @@ def create_secret(

# Create secret
client.secrets.kv.v1.create_or_update_secret(
path=f"users/{auth_info['id']}/{secret_path}",
path=f"users/{auth_info['id']}/{vo}/{secret_path}",
mount_point='/secrets/',
secret=secret_data,
)
@@ -237,7 +237,7 @@ def delete_secret(

# Delete secret
client.secrets.kv.v1.delete_secret(
path=f"users/{auth_info['id']}/{secret_path}",
path=f"users/{auth_info['id']}/{vo}/{secret_path}",
mount_point=VAULT_MOUNT_POINT,
)

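The three edits above change the Vault layout from users/<user_id>/<path> to users/<user_id>/<vo>/<path>, i.e. secrets are now namespaced per Virtual Organization. A minimal sketch of the resulting paths using hvac directly (URL, token, user id and secret path are all placeholders):

import hvac

client = hvac.Client(url='https://vault.example.org:8200', token='hvs.placeholder')

user_id = 'some-checkin-user-id'   # placeholder user id
vo = 'vo.ai4eosc.eu'

# Write a secret: note the extra /{vo}/ level introduced by this commit
client.secrets.kv.v1.create_or_update_secret(
    path=f"users/{user_id}/{vo}/some/secret/path",
    mount_point='/secrets/',
    secret={'token': 'super-secret-value'},
)

# List the level-0 secrets of that user, now scoped to a single VO
r = client.secrets.kv.v1.list_secrets(
    path=f"users/{user_id}/{vo}/",
    mount_point='/secrets/',
)
print(r['data']['keys'])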
137 changes: 112 additions & 25 deletions ai4papi/routers/v1/try_me/nomad.py
@@ -1,7 +1,13 @@
"""
This route is meant to be usable by everyone who is authenticated (no VO membership
required). We deploy jobs by default in the AI4EOSC namespace.
"""

from copy import deepcopy
import types
import uuid

from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.security import HTTPBearer

from ai4papi import auth
@@ -18,19 +24,102 @@
)
security = HTTPBearer()

# (!) try-me jobs are always deployed in AI4EOSC
VO = "vo.ai4eosc.eu"
NAMESPACE = papiconf.MAIN_CONF['nomad']['namespaces'][VO]
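# Illustrative only: the VO-to-namespace lookup above resolves against PAPI's main
# configuration. A made-up slice of that config for context (the 'ai4eosc' namespace
# is confirmed by the old code below; the domain value is a placeholder):
#
#   MAIN_CONF = {
#       'nomad': {'namespaces': {'vo.ai4eosc.eu': 'ai4eosc'}},
#       'lb': {'domain': {'vo.ai4eosc.eu': 'example-base-domain.eu'}},
#   }
#
#   NAMESPACE = MAIN_CONF['nomad']['namespaces']['vo.ai4eosc.eu']   # -> 'ai4eosc'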


@router.get("")
def get_deployments(
full_info: bool = Query(default=False),
authorization=Depends(security),
):
"""
Returns a list of all try-me deployments belonging to a user.
Try-me jobs always run in the AI4EOSC namespace, so no VO needs to be specified.
Parameters:
* **full_info**: retrieve the full information of each deployment.
Disabled by default, as it would increase latency too much if there are many
deployments.
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)

# Retrieve all jobs in namespace
jobs = nomad.get_deployments(
namespace=NAMESPACE,
owner=auth_info['id'],
prefix='try',
)
user_jobs = []
for j in jobs:
try:
job_info = get_deployment(
deployment_uuid=j['ID'],
full_info=full_info,
authorization=types.SimpleNamespace(
credentials=authorization.credentials # token
),
)
except HTTPException: # not a try-me
continue
except Exception as e: # unexpected error
raise(e)

user_jobs.append(job_info)

# Sort deployments by creation date
seq = [j['submit_time'] for j in user_jobs]
args = sorted(range(len(seq)), key=seq.__getitem__)[::-1]
sorted_jobs = [user_jobs[i] for i in args]

return sorted_jobs
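# Illustrative note: the index-based sort above is equivalent to sorting the job
# dicts directly by their submission time, newest first, e.g.:
#
#   sorted_jobs = sorted(user_jobs, key=lambda j: j['submit_time'], reverse=True)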


@router.get("/{deployment_uuid}")
def get_deployment(
deployment_uuid: str,
full_info: bool = Query(default=True),
authorization=Depends(security),
):
"""
This function is mainly used to retrieve the endpoint of the try-me job.
We cannot return the endpoint when creating the job, because the final endpoint depends
on which datacenter the job ends up landing in.
Parameters:
* **deployment_uuid**: uuid of deployment to gather info about
Returns a dict with info
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)

job = nomad.get_deployment(
deployment_uuid=deployment_uuid,
namespace=NAMESPACE,
owner=auth_info['id'],
full_info=full_info,
)

# Rewrite main endpoint, otherwise it automatically selects DEEPaaS API
job['main_endpoint'] = 'ui'

return job


@router.post("")
def create_deployment(
module_name: str,
title: str = Query(default=""),
authorization=Depends(security),
):
"""
Submit a try-me deployment to Nomad.
The deployment will automatically delete itself after a short amount of time.
This endpoint is meant to be usable by everyone who is authenticated (no VO membership
required). We deploy jobs by default in the AI4EOSC namespace.
Returns the Nomad response of the job submission.
"""
# Retrieve authenticated user info
@@ -51,11 +140,12 @@ def create_deployment(
nomad_conf = nomad_conf.safe_substitute(
{
'JOB_UUID': job_uuid,
'NAMESPACE': 'ai4eosc', # (!) try-me jobs are always deployed in "ai4eosc"
'NAMESPACE': NAMESPACE,
'TITLE': title[:45],
'OWNER': auth_info['id'],
'OWNER_NAME': auth_info['name'],
'OWNER_EMAIL': auth_info['email'],
'BASE_DOMAIN': papiconf.MAIN_CONF['lb']['domain']['vo.ai4eosc.eu'], # idem
'BASE_DOMAIN': papiconf.MAIN_CONF['lb']['domain'][VO],
'HOSTNAME': job_uuid,
'DOCKER_IMAGE': docker_image,
}
@@ -67,7 +157,7 @@
# Check that the target node (ie. tag='tryme') resources are available because
# these jobs cannot be left queueing
# We check for every resource metric (cpu, disk, ram)
stats = get_cluster_stats(vo='vo.ai4eosc.eu')
stats = get_cluster_stats(vo=VO)
resources = ['cpu', 'ram', 'disk']
keys = [f"{i}_used" for i in resources] + [f"{i}_total" for i in resources]
status = {k: 0 for k in keys}
@@ -89,16 +179,17 @@

# Check that the user isn't already running too many "try-me" jobs
jobs = nomad.get_deployments(
namespace="ai4eosc", # (!) try-me jobs are always deployed in "ai4eosc"
namespace=NAMESPACE,
owner=auth_info['id'],
prefix="try",
)
if len(jobs) >= 2:
if len(jobs) >= 3:
raise HTTPException(
status_code=503,
detail="Sorry, but you seem to be currently running two `Try-me` environments already. " \
detail="Sorry, but you seem to be currently running 3 `try-me` environments already. " \
"Before launching a new one, you will need to wait till one of your " \
"existing environments gets automatically deleted (ca. 10 min)."
"existing environments gets automatically deleted (ca. 10 min) or delete it manually " \
"in the Dashboard."
)

# Submit job
Expand All @@ -107,32 +198,28 @@ def create_deployment(
return r


@router.get("/{deployment_uuid}")
def get_deployment(
@router.delete("/{deployment_uuid}")
def delete_deployment(
deployment_uuid: str,
authorization=Depends(security),
):
"""
This function is used mainly to be able to retrieve the endpoint of the try_me job.
We cannot return the endpoint when creating the job, because the final endpoint will
on which datacenter the job ends up landing.
Delete a deployment. Users can only delete their own deployments.
Parameters:
* **deployment_uuid**: uuid of deployment to gather info about
* **vo**: Virtual Organization where your deployment is located
* **deployment_uuid**: uuid of deployment to delete
Returns a dict with info
Returns a dict with status
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)

job = nomad.get_deployment(
# Delete deployment
r = nomad.delete_deployment(
deployment_uuid=deployment_uuid,
namespace="ai4eosc", # (!) try-me jobs are always deployed in "ai4eosc"
namespace=NAMESPACE,
owner=auth_info['id'],
full_info=True,
)

# Rewrite main endpoint, otherwise it automatically selects DEEPaaS API
job['main_endpoint'] = 'ui'

return job
return r
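Taken together, the try_me router now exposes list, get, create and delete operations. A hedged usage sketch with the requests library; the PAPI URL and token are placeholders, and the /v1/try_me/nomad prefix is an assumption inferred from the file location rather than confirmed by this diff:

import requests

PAPI_URL = 'https://api.example.org'                        # placeholder PAPI instance
HEADERS = {'Authorization': 'Bearer <egi-checkin-token>'}   # placeholder access token
BASE = f"{PAPI_URL}/v1/try_me/nomad"                        # assumed route prefix

# Launch a try-me environment for a module (module_name and title are query parameters)
requests.post(BASE, params={'module_name': 'some-module', 'title': 'quick test'}, headers=HEADERS)

# List your try-me deployments (full_info is disabled by default)
deployments = requests.get(BASE, headers=HEADERS).json()

# Retrieve one deployment, mainly to learn its endpoint once the job has landed somewhere
deployment_uuid = '<deployment-uuid>'   # taken from the listing above
info = requests.get(f"{BASE}/{deployment_uuid}", headers=HEADERS).json()

# Delete it manually instead of waiting for the automatic cleanup
requests.delete(f"{BASE}/{deployment_uuid}", headers=HEADERS)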