From 8888527f9ee10932dc96854f273c0b0b96c53b2c Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Thu, 23 Nov 2023 15:27:45 +0100 Subject: [PATCH 01/21] feat: update to new IP of federated cluster --- docker/Dockerfile | 2 +- docker/docker-compose.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index a773326..47c081d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -28,7 +28,7 @@ RUN apt-get update && \ apt-get install nomad && \ rm -rf /var/lib/apt/lists/* -ENV NOMAD_ADDR=https://193.146.75.221:4646 +ENV NOMAD_ADDR=https://193.146.75.205:4646 ENV NOMAD_CACERT=/home/nomad-certs/nomad-ca.pem ENV NOMAD_CLIENT_CERT=/home/nomad-certs/cli.pem ENV NOMAD_CLIENT_KEY=/home/nomad-certs/cli-key.pem diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index bae0690..a70516b 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -6,7 +6,7 @@ services: image: "ignacioheredia/ai4-papi:prod" restart: always environment: - - NOMAD_ADDR=https://193.146.75.221:4646 + - NOMAD_ADDR=https://193.146.75.205:4646 volumes: - /home/ubuntu/nomad-certs/nomad-prod:/home/nomad-certs - /mnt/ai4os-logs/ai4-accounting:/home/ai4-accounting @@ -19,7 +19,7 @@ services: image: "ignacioheredia/ai4-papi:prod" restart: always environment: - - NOMAD_ADDR=https://193.146.75.221:4646 + - NOMAD_ADDR=https://193.146.75.205:4646 volumes: - /home/ubuntu/nomad-certs/nomad-prod:/home/nomad-certs - /mnt/ai4os-logs/ai4-accounting:/home/ai4-accounting From 712b7cbe0ae1368e07a7feb43b9c082114d31596 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Thu, 23 Nov 2023 15:29:30 +0100 Subject: [PATCH 02/21] feat: endpoints are now node-dependent --- ai4papi/nomad/common.py | 27 ++++++++++++-------- ai4papi/routers/v1/deployments/modules.py | 21 +++++++++++---- ai4papi/routers/v1/deployments/tools.py | 21 +++++++++++---- ai4papi/utils.py | 11 +++----- etc/modules/nomad.hcl | 6 ++--- etc/tools/deep-oc-federated-server/nomad.hcl | 4 +-- 6 files changed, 57 insertions(+), 33 deletions(-) diff --git a/ai4papi/nomad/common.py b/ai4papi/nomad/common.py index 9e991ee..a1c63ea 100644 --- a/ai4papi/nomad/common.py +++ b/ai4papi/nomad/common.py @@ -185,17 +185,6 @@ def get_deployment( except Exception: # return first endpoint info['main_endpoint'] = list(info['endpoints'].values())[0] - # Add active endpoints - if full_info: - info['active_endpoints'] = [] - for k, v in info['endpoints'].items(): - try: - r = session.get(v, timeout=2) - if r.status_code == 200: - info['active_endpoints'].append(k) - except requests.exceptions.Timeout: - continue - # Only fill resources if the job is allocated allocs = Nomad.job.get_allocations( id_=j['ID'], @@ -278,6 +267,22 @@ def get_deployment( 'disk_MB': a['AllocatedResources']['Shared']['DiskMB'], } + # Retrieve the node the jobs landed at in order to properly fill the endpoints + n = Nomad.node.get_node(a['NodeID']) + for k, v in info['endpoints'].items(): + info['endpoints'][k] = v.replace('${meta.domain}', n['Meta']['domain']) + + # Add active endpoints + if full_info: + info['active_endpoints'] = [] + for k, v in info['endpoints'].items(): + try: + r = session.get(v, timeout=2) + if r.status_code == 200: + info['active_endpoints'].append(k) + except requests.exceptions.Timeout: + continue + elif evals: # Something happened, job didn't deploy (eg. job needs port that's currently being used) # We have to return `placement failures message`. 
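In short, after the common.py changes above, the endpoint URLs stored for a job carry a `${meta.domain}` placeholder that can only be resolved once the allocation's node is known, and only resolved endpoints are probed. A minimal standalone sketch of that logic (the helper name `resolve_endpoints` is illustrative; the `requests` session and the 2-second probe timeout mirror the patch):

    import requests

    def resolve_endpoints(endpoints: dict, node_domain: str,
                          session: requests.Session, timeout: int = 2):
        # Substitute the landing node's domain into each endpoint template,
        # then probe which endpoints actually answer with HTTP 200.
        resolved = {name: url.replace('${meta.domain}', node_domain)
                    for name, url in endpoints.items()}
        active = []
        for name, url in resolved.items():
            try:
                if session.get(url, timeout=timeout).status_code == 200:
                    active.append(name)
            except requests.exceptions.Timeout:
                continue
        return resolved, active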
diff --git a/ai4papi/routers/v1/deployments/modules.py b/ai4papi/routers/v1/deployments/modules.py index 54c72fd..96939f2 100644 --- a/ai4papi/routers/v1/deployments/modules.py +++ b/ai4papi/routers/v1/deployments/modules.py @@ -192,13 +192,23 @@ def create_deployment( else: priority = 50 - # Generate a domain for user-app and check nothing is running there - domain = utils.generate_domain( + # Remove non-compliant characters from hostname + base_domain = papiconf.MAIN_CONF['lb']['domain'][vo] + hostname = utils.safe_hostname( hostname=user_conf['general']['hostname'], - base_domain=papiconf.MAIN_CONF['lb']['domain'][vo], job_uuid=job_uuid, ) - utils.check_domain(domain) + + #TODO: reenable custom hostname, when we are able to parse all node metadata + # (domain key) to build the true domain + hostname = job_uuid + + # # Check the hostname is available in all data-centers + # # (we don't know beforehand where the job will land) + # #TODO: make sure this does not break if the datacenter is unavailable + # #TODO: disallow custom hostname, pain in the ass, slower deploys + # for datacenter in papiconf.MAIN_CONF['nomad']['datacenters']: + # utils.check_domain(f"{hostname}.{datacenter}-{base_domain}") #TODO: remove when we solve disk issues # For now on we fix disk here because, if not fixed, jobs are not being deployed @@ -217,7 +227,8 @@ def create_deployment( 'OWNER_EMAIL': auth_info['email'], 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters - 'DOMAIN': domain, + 'BASE_DOMAIN': base_domain, + 'HOSTNAME': hostname, 'DOCKER_IMAGE': user_conf['general']['docker_image'], 'DOCKER_TAG': user_conf['general']['docker_tag'], 'SERVICE': user_conf['general']['service'], diff --git a/ai4papi/routers/v1/deployments/tools.py b/ai4papi/routers/v1/deployments/tools.py index 2deb310..a9b8119 100644 --- a/ai4papi/routers/v1/deployments/tools.py +++ b/ai4papi/routers/v1/deployments/tools.py @@ -198,13 +198,23 @@ def create_deployment( else: priority = 50 - # Generate a domain for user-app and check nothing is running there - domain = utils.generate_domain( + # Remove non-compliant characters from hostname + base_domain = papiconf.MAIN_CONF['lb']['domain'][vo] + hostname = utils.safe_hostname( hostname=user_conf['general']['hostname'], - base_domain=papiconf.MAIN_CONF['lb']['domain'][vo], job_uuid=job_uuid, ) - utils.check_domain(domain) + + #TODO: reenable custom hostname, when we are able to parse all node metadata + # (domain key) to build the true domain + hostname = job_uuid + + # # Check the hostname is available in all data-centers + # # (we don't know beforehand where the job will land) + # #TODO: make sure this does not break if the datacenter is unavailable + # #TODO: disallow custom hostname, pain in the ass, slower deploys + # for datacenter in papiconf.MAIN_CONF['nomad']['datacenters']: + # utils.check_domain(f"{hostname}.{datacenter}-{base_domain}") # Replace the Nomad job template nomad_conf = nomad_conf.safe_substitute( @@ -217,7 +227,8 @@ def create_deployment( 'OWNER_EMAIL': auth_info['email'], 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters - 'DOMAIN': domain, + 'BASE_DOMAIN': base_domain, + 'HOSTNAME': hostname, 'DOCKER_IMAGE': user_conf['general']['docker_image'], 'DOCKER_TAG': user_conf['general']['docker_tag'], 'CPU_NUM': user_conf['hardware']['cpu_num'], diff --git 
a/ai4papi/utils.py b/ai4papi/utils.py index 36ca27d..634e1d4 100644 --- a/ai4papi/utils.py +++ b/ai4papi/utils.py @@ -11,9 +11,8 @@ session = requests.Session() -def generate_domain( +def safe_hostname( hostname: str, - base_domain: str, job_uuid: str, ): @@ -47,12 +46,10 @@ def generate_domain( detail="Hostname should be shorter than 40 characters." ) - domain = f"{hostname}.{base_domain}" + return hostname - else: # we use job_ID as default subdomain - domain = f"{job_uuid}.{base_domain}" - - return domain + else: # we use job_ID as default hostname + return job_uuid def check_domain(base_url): diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index 35700d7..be607a5 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -68,7 +68,7 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-api.tls=true", - "traefik.http.routers.${JOB_UUID}-api.rule=Host(`api-${DOMAIN}`, `www.api-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-api.rule=Host(`api-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.api-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", ] } @@ -78,7 +78,7 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-monitor.tls=true", - "traefik.http.routers.${JOB_UUID}-monitor.rule=Host(`monitor-${DOMAIN}`, `www.monitor-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-monitor.rule=Host(`monitor-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.monitor-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", ] } @@ -88,7 +88,7 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-ide.tls=true", - "traefik.http.routers.${JOB_UUID}-ide.rule=Host(`ide-${DOMAIN}`, `www.ide-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-ide.rule=Host(`ide-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.ide-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", ] } diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index cb52a79..2efba4c 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -65,7 +65,7 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-fedserver.tls=true", - "traefik.http.routers.${JOB_UUID}-fedserver.rule=Host(`fedserver-${DOMAIN}`, `www.fedserver-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-fedserver.rule=Host(`fedserver-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.fedserver-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", "traefik.http.services.${JOB_UUID}-fedserver.loadbalancer.server.scheme=h2c", # grpc support ] } @@ -76,7 +76,7 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-ide.tls=true", - "traefik.http.routers.${JOB_UUID}-ide.rule=Host(`ide-${DOMAIN}`, `www.ide-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-ide.rule=Host(`ide-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.ide-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", ] } From 61bb92e97b751e19e27a28bc04987144763eafa7 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Thu, 23 Nov 2023 15:30:38 +0100 Subject: [PATCH 03/21] feat: deploy only on nodes serving that namespace --- etc/modules/nomad.hcl | 7 +++++++ etc/tools/deep-oc-federated-server/nomad.hcl | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index be607a5..ff8fadd 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -24,6 +24,13 @@ job "userjob-${JOB_UUID}" { 
description = "${DESCRIPTION}" } + # Only deploy in nodes serving that namespace + constraint { + attribute = "${meta.namespace}" + operator = "regexp" + value = "${NAMESPACE}" + } + # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid # overloading GPU clients with CPU-only jobs. affinity { diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index 2efba4c..49edaec 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -24,6 +24,13 @@ job "userjob-${JOB_UUID}" { description = "${DESCRIPTION}" } + # Only deploy in nodes serving that namespace + constraint { + attribute = "${meta.namespace}" + operator = "regexp" + value = "${NAMESPACE}" + } + # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid # overloading GPU clients with CPU-only jobs. affinity { From 18e37afd796a0d7ac9e10ae23d60a2781508be17 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Thu, 23 Nov 2023 15:31:50 +0100 Subject: [PATCH 04/21] feat: avoid deploying on same nodes as system jobs --- etc/modules/nomad.hcl | 7 +++++++ etc/tools/deep-oc-federated-server/nomad.hcl | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index ff8fadd..6b90f5e 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -24,6 +24,13 @@ job "userjob-${JOB_UUID}" { description = "${DESCRIPTION}" } + # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik) + constraint { + attribute = "${meta.compute}" + operator = "=" + value = "true" + } + # Only deploy in nodes serving that namespace constraint { attribute = "${meta.namespace}" diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index 49edaec..879d715 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -24,6 +24,13 @@ job "userjob-${JOB_UUID}" { description = "${DESCRIPTION}" } + # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik) + constraint { + attribute = "${meta.compute}" + operator = "=" + value = "true" + } + # Only deploy in nodes serving that namespace constraint { attribute = "${meta.namespace}" From d1f56e1345d240793527934724e99cd71ba7d731 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Fri, 24 Nov 2023 13:49:11 +0100 Subject: [PATCH 05/21] feat: properly configure disk limits --- ai4papi/routers/v1/deployments/modules.py | 6 ------ etc/modules/nomad.hcl | 3 +++ etc/modules/user.yaml | 2 +- etc/tools/deep-oc-federated-server/nomad.hcl | 3 +++ 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ai4papi/routers/v1/deployments/modules.py b/ai4papi/routers/v1/deployments/modules.py index 96939f2..c57e8fd 100644 --- a/ai4papi/routers/v1/deployments/modules.py +++ b/ai4papi/routers/v1/deployments/modules.py @@ -210,12 +210,6 @@ def create_deployment( # for datacenter in papiconf.MAIN_CONF['nomad']['datacenters']: # utils.check_domain(f"{hostname}.{datacenter}-{base_domain}") - #TODO: remove when we solve disk issues - # For now on we fix disk here because, if not fixed, jobs are not being deployed - # (ie. "resource disk exhausted"). 
- # In any case, this limit is useless because it has not yet been passed to docker - user_conf['hardware']['disk'] = 500 - # Replace the Nomad job template nomad_conf = nomad_conf.safe_substitute( { diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index 6b90f5e..11fd53b 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -154,6 +154,9 @@ job "userjob-${JOB_UUID}" { volumes = [ "/nomad-storage/${JOB_UUID}:/storage:shared", ] + storage_opt = { + size = "${DISK}M" + } } env { diff --git a/etc/modules/user.yaml b/etc/modules/user.yaml index 371948b..4dd646c 100644 --- a/etc/modules/user.yaml +++ b/etc/modules/user.yaml @@ -74,7 +74,7 @@ hardware: disk: name: Disk memory (in MB) value: 10000 - range: [1000, 20000] + range: [1000, 50000] storage: diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index 879d715..3e60b3f 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -106,6 +106,9 @@ job "userjob-${JOB_UUID}" { image = "${DOCKER_IMAGE}:${DOCKER_TAG}" ports = ["fedserver", "ide"] shm_size = ${SHARED_MEMORY} + storage_opt = { + size = "${DISK}M" + } } env { From a7599ce1985bc0cba2c85025445cd2f245d65580 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Fri, 24 Nov 2023 18:26:28 +0100 Subject: [PATCH 06/21] fix: disable endpoints if client disconnected --- ai4papi/nomad/common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ai4papi/nomad/common.py b/ai4papi/nomad/common.py index a1c63ea..677870f 100644 --- a/ai4papi/nomad/common.py +++ b/ai4papi/nomad/common.py @@ -251,10 +251,6 @@ def get_deployment( "the network is restored and you should be able to fully recover " \ "your deployment." - # Disable access to endpoints if there is a network cut - if info['status'] == 'down' and info['active_endpoints']: - info['active_endpoints'] = [] - # Add resources res = a['AllocatedResources']['Tasks']['usertask'] gpu = [d for d in res['Devices'] if d['Type'] == 'gpu'][0] if res['Devices'] else None @@ -283,6 +279,10 @@ def get_deployment( except requests.exceptions.Timeout: continue + # Disable access to endpoints if there is a network cut + if info['status'] == 'down' and info['active_endpoints']: + info['active_endpoints'] = [] + elif evals: # Something happened, job didn't deploy (eg. job needs port that's currently being used) # We have to return `placement failures message`. 
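The fix above is purely an ordering one: `active_endpoints` is now populated (by probing) before the network-cut check that empties it, whereas previously the check ran before the list was filled. A condensed sketch of the corrected flow (the `fill_active_endpoints` helper and `probe` callable are illustrative; `info['status']` is the deployment status computed earlier in `get_deployment`):

    def fill_active_endpoints(info: dict, probe) -> None:
        # Probe first, then revoke: if the client node is disconnected
        # ('down'), access to the endpoints is explicitly disabled even
        # if some of them still answered the probe.
        info['active_endpoints'] = [
            name for name, url in info['endpoints'].items() if probe(url)
        ]
        if info['status'] == 'down' and info['active_endpoints']:
            info['active_endpoints'] = []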
From 8be583ada5bd82d6bb6fbae8401a0941532d6ae0 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Fri, 24 Nov 2023 18:30:44 +0100 Subject: [PATCH 07/21] docs: improve comment --- etc/modules/nomad.hcl | 3 ++- etc/tools/deep-oc-federated-server/nomad.hcl | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index 11fd53b..4859d7e 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -31,7 +31,8 @@ job "userjob-${JOB_UUID}" { value = "true" } - # Only deploy in nodes serving that namespace + # Only deploy in nodes serving that namespace (we use metadata instead of node-pools + # because Nomad does not allow a node to belong to several node pools) constraint { attribute = "${meta.namespace}" operator = "regexp" diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index 3e60b3f..bda801c 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -31,7 +31,8 @@ job "userjob-${JOB_UUID}" { value = "true" } - # Only deploy in nodes serving that namespace + # Only deploy in nodes serving that namespace (we use metadata instead of node-pools + # because Nomad does not allow a node to belong to several node pools) constraint { attribute = "${meta.namespace}" operator = "regexp" value = "${NAMESPACE}" } From 08b3162166ccd27dff25ec23e150a96fffd37896 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Fri, 24 Nov 2023 18:31:30 +0100 Subject: [PATCH 08/21] fix: set max RAM memory --- etc/modules/nomad.hcl | 2 ++ etc/tools/deep-oc-federated-server/nomad.hcl | 2 ++ 2 files changed, 4 insertions(+) diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index 4859d7e..eee7eb7 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -152,6 +152,7 @@ job "userjob-${JOB_UUID}" { args = ["--${SERVICE}"] ports = ["api", "monitor", "ide"] shm_size = ${SHARED_MEMORY} + memory_hard_limit = ${RAM} volumes = [ "/nomad-storage/${JOB_UUID}:/storage:shared", ] @@ -173,6 +174,7 @@ job "userjob-${JOB_UUID}" { resources { cores = ${CPU_NUM} memory = ${RAM} + memory_max = ${RAM} device "gpu" { count = ${GPU_NUM} diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index bda801c..d683607 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -107,6 +107,7 @@ job "userjob-${JOB_UUID}" { image = "${DOCKER_IMAGE}:${DOCKER_TAG}" ports = ["fedserver", "ide"] shm_size = ${SHARED_MEMORY} + memory_hard_limit = ${RAM} storage_opt = { size = "${DISK}M" } @@ -124,6 +125,7 @@ job "userjob-${JOB_UUID}" { resources { cores = ${CPU_NUM} memory = ${RAM} + memory_max = ${RAM} } } } From 16b4183eec938e2d0bcab2701b27d9b1e02a8de6 Mon Sep 17 00:00:00 2001 From: MartaOB <151519478+MartaOB@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:13:14 +0200 Subject: [PATCH 09/21] feat: add job type (module/tool) to metadata (#43) * feat: add job type * fix: add job_type to info * fix: remove trailing comma The trailing comma caused the `job_type` to be read as a tuple, not a string --------- Co-authored-by: Ignacio Heredia --- ai4papi/nomad/common.py | 5 +++++ ai4papi/routers/v1/deployments/modules.py | 8 +++++++- ai4papi/routers/v1/deployments/tools.py | 8 +++++++- etc/modules/nomad.hcl | 1 + etc/tools/deep-oc-federated-server/nomad.hcl | 1 + 5 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ai4papi/nomad/common.py b/ai4papi/nomad/common.py index b33579a..628ee49 100644
--- a/ai4papi/nomad/common.py +++ b/ai4papi/nomad/common.py @@ -99,6 +99,7 @@ def get_deployment( 'owner': j['Meta']['owner'], 'title': j['Meta']['title'], 'description': j['Meta']['description'], + 'job_type': None, 'docker_image': None, 'docker_command': None, 'submit_time': datetime.fromtimestamp( @@ -112,6 +113,10 @@ def get_deployment( 'datacenter': None, } + # TODO: temporal fix until all jobs have job type + if 'job_type' in j['Meta']: + info['job_type'] = j['Meta']['job_type'] + # Retrieve tasks tasks = j['TaskGroups'][0]['Tasks'] usertask = [t for t in tasks if t['Name'] == 'usertask'][0] diff --git a/ai4papi/routers/v1/deployments/modules.py b/ai4papi/routers/v1/deployments/modules.py index a8f69f3..551b407 100644 --- a/ai4papi/routers/v1/deployments/modules.py +++ b/ai4papi/routers/v1/deployments/modules.py @@ -124,7 +124,13 @@ def get_deployment( '/(.*):', # remove dockerhub account and tag job['docker_image'], ).group(1) - if module_name in tool_list: + # TODO: temporal fix until all the jobs have job_type + if 'job_type' in job and job['job_type'] == 'tool': + raise HTTPException( + status_code=400, + detail="This deployment is a tool, not a module.", + ) + elif module_name in tool_list: raise HTTPException( status_code=400, detail="This deployment is a tool, not a module.", diff --git a/ai4papi/routers/v1/deployments/tools.py b/ai4papi/routers/v1/deployments/tools.py index 66b5a91..56a5fae 100644 --- a/ai4papi/routers/v1/deployments/tools.py +++ b/ai4papi/routers/v1/deployments/tools.py @@ -124,7 +124,13 @@ def get_deployment( '/(.*):', # remove dockerhub account and tag job['docker_image'], ).group(1) - if tool_name not in tool_list: + # TODO: temporal fix until all the jobs have job_type + if 'job_type' in job and job['job_type'] == 'module': + raise HTTPException( + status_code=400, + detail="This deployment is a module, not a tool.", + ) + elif tool_name not in tool_list: raise HTTPException( status_code=400, detail="This deployment is a module, not a tool.", diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index c16e6ed..d516370 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -22,6 +22,7 @@ job "userjob-${JOB_UUID}" { owner_email = "${OWNER_EMAIL}" title = "${TITLE}" description = "${DESCRIPTION}" + job_type = "module" } # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik) diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index e7f0828..9794399 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -22,6 +22,7 @@ job "userjob-${JOB_UUID}" { owner_email = "${OWNER_EMAIL}" title = "${TITLE}" description = "${DESCRIPTION}" + job_type = "tool" } # Only launch in compute nodes (to avoid clashing with system jobs, eg. 
Traefik) From dae82d2c29b5c740a9842f217f8e62fb53c6a5d3 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Wed, 24 Apr 2024 13:40:40 +0200 Subject: [PATCH 10/21] fix: available GPU models should be filtered by VO --- ai4papi/nomad/common.py | 11 +++++++++-- ai4papi/routers/v1/catalog/modules.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ai4papi/nomad/common.py b/ai4papi/nomad/common.py index cb271f6..c1dc743 100644 --- a/ai4papi/nomad/common.py +++ b/ai4papi/nomad/common.py @@ -17,6 +17,7 @@ from nomad.api import exceptions import requests +import ai4papi.conf as papiconf import ai4papi.nomad.patches as nomad_patches @@ -367,13 +368,19 @@ def delete_deployment( return {'status': 'success'} -def get_gpu_models(): +@cached(cache=TTLCache(maxsize=1024, ttl=1*60*60)) +def get_gpu_models(vo): """ - Retrieve available GPU models in the cluster. + Retrieve available GPU models in the cluster, filtering nodes by VO. """ gpu_models = set() nodes = Nomad.nodes.get_nodes(resources=True) for node in nodes: + # Discard nodes that don't belong to the requested VO + meta = Nomad.node.get_node(node['ID'])['Meta'] + if papiconf.MAIN_CONF['nomad']['namespaces'][vo] not in meta['namespace']: + continue + # Discard GPU models of nodes that are not eligible if node['SchedulingEligibility'] != 'eligible': continue diff --git a/ai4papi/routers/v1/catalog/modules.py b/ai4papi/routers/v1/catalog/modules.py index e842776..b2f562a 100644 --- a/ai4papi/routers/v1/catalog/modules.py +++ b/ai4papi/routers/v1/catalog/modules.py @@ -73,7 +73,7 @@ def get_config( ) # Fill with available GPU models in the cluster - models = nomad.common.get_gpu_models() + models = nomad.common.get_gpu_models(vo) if models: conf["hardware"]["gpu_type"]["options"] += models From 4cc053d72e66c576f4ce0788065d51ea54fae597 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Mon, 29 Apr 2024 14:41:34 +0200 Subject: [PATCH 11/21] feat!: update naming in jobs and tasks * `userjob-***` has been renamed into either `module-***` or `tool-***` * `usertask` has been renamed to `main` --- ai4papi/nomad/common.py | 14 ++++++-------- ai4papi/routers/v1/deployments/modules.py | 20 +++++--------------- ai4papi/routers/v1/deployments/tools.py | 20 +++++--------------- etc/modules/nomad.hcl | 8 ++++---- etc/tools/deep-oc-federated-server/nomad.hcl | 4 ++-- 5 files changed, 22 insertions(+), 44 deletions(-) diff --git a/ai4papi/nomad/common.py b/ai4papi/nomad/common.py index c1dc743..29a069a 100644 --- a/ai4papi/nomad/common.py +++ b/ai4papi/nomad/common.py @@ -43,13 +43,14 @@ def get_deployments( namespace: str, owner: str, + prefix: str = "", ): """ Returns a list of all deployments belonging to a user, in a given namespace. 
""" job_filter = \ 'Status != "dead" and ' + \ - 'Name matches "^userjob" and ' + \ + f'Name matches "^{prefix}" and ' + \ 'Meta is not empty and ' + \ f'Meta.owner == "{owner}"' jobs = Nomad.jobs.get_jobs(namespace=namespace, filter_=job_filter) @@ -96,6 +97,7 @@ def get_deployment( # Create job info dict info = { 'job_ID': j['ID'], + 'name': j['Name'], 'status': '', # do not use j['Status'] as misleading 'owner': j['Meta']['owner'], 'title': j['Meta']['title'], @@ -114,13 +116,9 @@ def get_deployment( 'datacenter': None, } - # TODO: temporal fix until all jobs have job type - if 'job_type' in j['Meta']: - info['job_type'] = j['Meta']['job_type'] - # Retrieve tasks tasks = j['TaskGroups'][0]['Tasks'] - usertask = [t for t in tasks if t['Name'] == 'usertask'][0] + usertask = [t for t in tasks if t['Name'] == 'main'][0] # Retrieve Docker image info['docker_image'] = usertask['Config']['image'] @@ -224,7 +222,7 @@ def get_deployment( # Add error messages if needed if info['status'] == 'failed': - info['error_msg'] = a['TaskStates']['usertask']['Events'][0]['Message'] + info['error_msg'] = a['TaskStates']['main']['Events'][0]['Message'] # Replace with clearer message if info['error_msg'] == 'Docker container exited with non-zero exit code: 1': @@ -241,7 +239,7 @@ def get_deployment( "your deployment." # Add resources - res = a['AllocatedResources']['Tasks']['usertask'] + res = a['AllocatedResources']['Tasks']['main'] gpu = [d for d in res['Devices'] if d['Type'] == 'gpu'][0] if res['Devices'] else None cpu_cores = res['Cpu']['ReservedCores'] info['resources'] = { diff --git a/ai4papi/routers/v1/deployments/modules.py b/ai4papi/routers/v1/deployments/modules.py index 551b407..fb468c9 100644 --- a/ai4papi/routers/v1/deployments/modules.py +++ b/ai4papi/routers/v1/deployments/modules.py @@ -57,6 +57,7 @@ def get_deployments( jobs = nomad.get_deployments( namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo], owner=auth_info['id'], + prefix='module', ) # Retrieve info for jobs in namespace @@ -119,21 +120,10 @@ def get_deployment( ) # Check the deployment is indeed a module - tool_list = papiconf.TOOLS.keys() - module_name = re.search( - '/(.*):', # remove dockerhub account and tag - job['docker_image'], - ).group(1) - # TODO: temporal fix until all the jobs have job_type - if 'job_type' in job and job['job_type'] == 'tool': + if not job['name'].startswith('module'): raise HTTPException( status_code=400, - detail="This deployment is a tool, not a module.", - ) - elif module_name in tool_list: - raise HTTPException( - status_code=400, - detail="This deployment is a tool, not a module.", + detail="This deployment is not a module.", ) return job @@ -264,7 +254,7 @@ def create_deployment( nomad_conf = nomad.load_job_conf(nomad_conf) tasks = nomad_conf['TaskGroups'][0]['Tasks'] - usertask = [t for t in tasks if t['Name']=='usertask'][0] + usertask = [t for t in tasks if t['Name']=='main'][0] # Apply patches if needed usertask = module_patches.patch_nextcloud_mount( @@ -283,7 +273,7 @@ def create_deployment( # If storage credentials not provided, remove storage-related tasks if not all(user_conf['storage'].values()): - tasks[:] = [t for t in tasks if t['Name'] not in {'storagetask', 'storagecleanup'}] + tasks[:] = [t for t in tasks if t['Name'] not in {'storage_mount', 'storage_cleanup'}] # Submit job r = nomad.create_deployment(nomad_conf) diff --git a/ai4papi/routers/v1/deployments/tools.py b/ai4papi/routers/v1/deployments/tools.py index 8c9c944..e5764fe 100644 --- 
a/ai4papi/routers/v1/deployments/tools.py +++ b/ai4papi/routers/v1/deployments/tools.py @@ -60,6 +60,7 @@ def get_deployments( jobs = nomad.get_deployments( namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo], owner=auth_info['id'], + prefix='tool', ) # Retrieve info for jobs in namespace @@ -122,21 +123,10 @@ def get_deployment( ) # Check the deployment is indeed a tool - tool_list = papiconf.TOOLS.keys() - tool_name = re.search( - '/(.*):', # remove dockerhub account and tag - job['docker_image'], - ).group(1) - # TODO: temporal fix until all the jobs have job_type - if 'job_type' in job and job['job_type'] == 'module': - raise HTTPException( - status_code=400, - detail="This deployment is a module, not a tool.", - ) - elif tool_name not in tool_list: + if not job['name'].startswith('tool'): raise HTTPException( status_code=400, - detail="This deployment is a module, not a tool.", + detail="This deployment is not a tool.", ) return job @@ -215,7 +205,7 @@ def create_deployment( ) #TODO: reenable custom hostname, when we are able to parse all node metadata - # (domain key) to build the true domain + # (domain key) to build the true domain hostname = job_uuid # # Check the hostname is available in all data-centers @@ -275,7 +265,7 @@ def create_deployment( nomad_conf = nomad.load_job_conf(nomad_conf) tasks = nomad_conf['TaskGroups'][0]['Tasks'] - usertask = [t for t in tasks if t['Name']=='usertask'][0] + usertask = [t for t in tasks if t['Name']=='main'][0] # Launch `deep-start` compatible service if needed service = user_conf['general']['service'] diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index d516370..27e24e7 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -9,7 +9,7 @@ When replacing user values we use safe_substitute() so that ge don't get an erro replacing Nomad values */ -job "userjob-${JOB_UUID}" { +job "module-${JOB_UUID}" { namespace = "${NAMESPACE}" type = "service" region = "global" @@ -112,7 +112,7 @@ job "userjob-${JOB_UUID}" { size = ${DISK} } - task "storagetask" { + task "storage_mount" { // Running task in charge of mounting storage driver = "docker" @@ -142,7 +142,7 @@ job "userjob-${JOB_UUID}" { } } - task "usertask" { + task "main" { // Task configured by the user (deepaas, jupyter, vscode) driver = "docker" @@ -192,7 +192,7 @@ job "userjob-${JOB_UUID}" { } } - task "storagecleanup" { + task "storage_cleanup" { // Unmount empty storage folder and delete it from host lifecycle { diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index b41b859..a5c02fe 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -9,7 +9,7 @@ When replacing user values we use safe_substitute() so that ge don't get an erro replacing Nomad values */ -job "userjob-${JOB_UUID}" { +job "tool-fl-${JOB_UUID}" { namespace = "${NAMESPACE}" type = "service" region = "global" @@ -113,7 +113,7 @@ job "userjob-${JOB_UUID}" { size = ${DISK} } - task "usertask" { + task "main" { driver = "docker" # Use default command defined in the Dockerfile From 13c1dddbed81f8afaae4c24630a1c902de2704c6 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Mon, 29 Apr 2024 15:29:30 +0200 Subject: [PATCH 12/21] refactor: remove dangling `job_type` --- ai4papi/nomad/common.py | 1 - etc/modules/nomad.hcl | 1 - etc/tools/deep-oc-federated-server/nomad.hcl | 1 - 3 files changed, 3 deletions(-) diff --git a/ai4papi/nomad/common.py b/ai4papi/nomad/common.py index 29a069a..a794f39 
100644 --- a/ai4papi/nomad/common.py +++ b/ai4papi/nomad/common.py @@ -102,7 +102,6 @@ def get_deployment( 'owner': j['Meta']['owner'], 'title': j['Meta']['title'], 'description': j['Meta']['description'], - 'job_type': None, 'docker_image': None, 'docker_command': None, 'submit_time': datetime.fromtimestamp( diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index 27e24e7..c2a1fd7 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -22,7 +22,6 @@ job "module-${JOB_UUID}" { owner_email = "${OWNER_EMAIL}" title = "${TITLE}" description = "${DESCRIPTION}" - job_type = "module" } # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik) diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index a5c02fe..0109b45 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -22,7 +22,6 @@ job "tool-fl-${JOB_UUID}" { owner_email = "${OWNER_EMAIL}" title = "${TITLE}" description = "${DESCRIPTION}" - job_type = "tool" } # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik) From a676ca6595efa92dc6e5892791950ffa9d6c8deb Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Fri, 24 May 2024 19:39:40 +0200 Subject: [PATCH 13/21] refactor: aggregate cluster gpu models at the end --- ai4papi/routers/v1/stats/deployments.py | 29 ++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py index b580c2d..6fd996b 100644 --- a/ai4papi/routers/v1/stats/deployments.py +++ b/ai4papi/routers/v1/stats/deployments.py @@ -224,11 +224,10 @@ def get_cluster_stats_bg(): 'datacenters' : datacenters, # aggregated datacenter usage 'cluster': {k: 0 for k in resources}, # aggregated cluster usage } - stats['cluster']['gpu_models'] = [] + stats['cluster']['gpu_models'] = {} # Load nodes nodes = Nomad.nodes.get_nodes(resources=True) - gpu_stats = {} nodes_dc = {} # dict(node, datacenter) # Get total stats for each node @@ -252,13 +251,9 @@ def get_cluster_stats_bg(): n_stats['gpu_total'] += len(devices['Instances']) # Track stats per GPU model type - if devices['Name'] not in gpu_stats.keys(): - gpu_stats[devices['Name']] = {'gpu_total': 0, 'gpu_used': 0} - if devices['Name'] not in n_stats['gpu_models'].keys(): n_stats['gpu_models'][devices['Name']] = {'gpu_total': 0, 'gpu_used': 0} - gpu_stats[devices['Name']]['gpu_total'] += len(devices['Instances']) n_stats['gpu_models'][devices['Name']]['gpu_total'] += len(devices['Instances']) # If datacenter is not in csv, load default info @@ -312,7 +307,6 @@ def get_cluster_stats_bg(): gpu = [d for d in res['Devices'] if d['Type'] == 'gpu'][0] gpu_num = len(gpu['DeviceIDs']) if gpu else 0 n_stats['gpu_used'] += gpu_num - gpu_stats[gpu['Name']]['gpu_used'] += gpu_num n_stats['gpu_models'][gpu['Name']]['gpu_used'] += gpu_num else: continue @@ -326,10 +320,25 @@ def get_cluster_stats_bg(): for dc_stats in stats['datacenters'].values(): for n_stats in dc_stats['nodes'].values(): for k, v in n_stats.items(): - if k not in ['name', 'jobs_num']: - stats['cluster'][k] += v - stats['cluster']['gpu_models'] = gpu_stats + # Ignore keys + if k in ['name', 'namespaces']: + continue + + # Aggregate nested gpu_models dict + elif k == 'gpu_models': + for k1, v1 in v.items(): + model_stats = stats['cluster']['gpu_models'].get( + k1, + {'gpu_total': 0, 'gpu_used': 0,} # init value + ) + for k2, v2 in v1.items(): + 
model_stats[k2] += v2 + stats['cluster']['gpu_models'][k1] = model_stats + + # Aggregate other resources + else: + stats['cluster'][k] += v # Set the new shared variable global cluster_stats From 2fa13725ed0cb4f3c876b753f8c82fa490a130ca Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Fri, 24 May 2024 19:41:07 +0200 Subject: [PATCH 14/21] fix: fix job_num in stats --- ai4papi/routers/v1/stats/deployments.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py index 6fd996b..03703de 100644 --- a/ai4papi/routers/v1/stats/deployments.py +++ b/ai4papi/routers/v1/stats/deployments.py @@ -210,6 +210,7 @@ def get_cluster_stats_bg(): """ resources = [ + 'jobs_num', 'cpu_total', 'cpu_used', 'gpu_total', @@ -235,7 +236,6 @@ def get_cluster_stats_bg(): node = Nomad.node.get_node(n['ID']) n_stats = {k: 0 for k in resources} n_stats['name'] = node['Name'] - n_stats['jobs_num'] = 0 n_stats['cpu_total'] = int(node['Attributes']['cpu.numcores']) n_stats['ram_total'] = int(node['Attributes']['memory.totalbytes']) / 2**20 n_stats['disk_total'] = int(node['Attributes']['unique.storage.bytestotal']) / 2**20 @@ -269,6 +269,7 @@ def get_cluster_stats_bg(): for namespace in namespaces: jobs = Nomad.jobs.get_jobs(namespace=namespace, filter_='Status == "running"') for j in jobs: + # Retrieve full job for meta job = Nomad.job.get_job( id_=j['ID'], @@ -288,10 +289,12 @@ def get_cluster_stats_bg(): # Add resources datacenter = nodes_dc[a['NodeID']] n_stats = stats['datacenters'][datacenter]['nodes'][a['NodeID']] - if 'userjob' in job['Name']: + + #TODO: we are ignoring resources consumed by other jobs + if job['Name'].startswith('module') or job['Name'].startswith('tool'): n_stats['jobs_num'] += 1 - #FIXME: we are ignoring resources consumed by other tasks + #TODO: we are ignoring resources consumed by other tasks if 'usertask' in a['AllocatedResources']['Tasks']: res = a['AllocatedResources']['Tasks']['usertask'] From 55961d6d70d05a58e528ef0731a24f60e3aa8375 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Fri, 24 May 2024 20:16:04 +0200 Subject: [PATCH 15/21] feat: filter node stats by VO --- ai4papi/routers/v1/stats/deployments.py | 77 ++++++++++++++----------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py index 03703de..d7237de 100644 --- a/ai4papi/routers/v1/stats/deployments.py +++ b/ai4papi/routers/v1/stats/deployments.py @@ -181,6 +181,7 @@ def load_datacenters(): return datacenters +@cached(cache=TTLCache(maxsize=1024, ttl=30)) @router.get("/cluster") def get_cluster_stats( vo: str, @@ -192,13 +193,49 @@ def get_cluster_stats( """ global cluster_stats + stats = cluster_stats - #TODO: filter cluster stats to only return stats of the nodes that support a - # given VO. This is blocked until we move to the federated cluster where VO support - # is specified in the node metadata. - # (!) 
Total cluster resources will need to be computed after this filtering is done + namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo] - return cluster_stats + for k, v in stats['datacenters'].copy().items(): + + # Filter out nodes that do not support the given VO + nodes = {} + for n_id, n_stats in v['nodes'].items(): + if namespace in n_stats['namespaces']: + nodes[n_id] = n_stats + + # Ignore datacenters with no nodes + if not nodes: + del stats['datacenters'][k] + else: + stats['datacenters'][k]['nodes'] = nodes + + # Compute cluster stats after node filtering is done + for dc_stats in stats['datacenters'].values(): + for n_stats in dc_stats['nodes'].values(): + for k, v in n_stats.items(): + + # Ignore keys + if k in ['name', 'namespaces']: + continue + + # Aggregate nested gpu_models dict + elif k == 'gpu_models': + for k1, v1 in v.items(): + model_stats = stats['cluster']['gpu_models'].get( + k1, + {'gpu_total': 0, 'gpu_used': 0,} # init value + ) + for k2, v2 in v1.items(): + model_stats[k2] += v2 + stats['cluster']['gpu_models'][k1] = model_stats + + # Aggregate other resources + else: + stats['cluster'][k] += v + + return stats @cached(cache=TTLCache(maxsize=1024, ttl=30)) @@ -244,6 +281,7 @@ def get_cluster_stats_bg(): - int(node['Attributes']['unique.storage.bytesfree'])) \ / 2**20 n_stats['gpu_models'] = {} + n_stats['namespaces'] = node['Meta']['namespace'] if n['NodeResources']['Devices']: for devices in n['NodeResources']['Devices']: @@ -314,35 +352,6 @@ def get_cluster_stats_bg(): else: continue - # Ignore datacenters with no nodes - for k, v in stats['datacenters'].copy().items(): - if not v['nodes']: - del stats['datacenters'][k] - - # Compute cluster stats - for dc_stats in stats['datacenters'].values(): - for n_stats in dc_stats['nodes'].values(): - for k, v in n_stats.items(): - - # Ignore keys - if k in ['name', 'namespaces']: - continue - - # Aggregate nested gpu_models dict - elif k == 'gpu_models': - for k1, v1 in v.items(): - model_stats = stats['cluster']['gpu_models'].get( - k1, - {'gpu_total': 0, 'gpu_used': 0,} # init value - ) - for k2, v2 in v1.items(): - model_stats[k2] += v2 - stats['cluster']['gpu_models'][k1] = model_stats - - # Aggregate other resources - else: - stats['cluster'][k] += v - # Set the new shared variable global cluster_stats cluster_stats = stats From a0f6a64b1d57e190e889943fe865f5f46bf36b69 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Wed, 19 Jun 2024 13:23:46 +0200 Subject: [PATCH 16/21] fix(stats): avoid overwriting global stats var --- ai4papi/routers/v1/stats/deployments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py index d7237de..52de16e 100644 --- a/ai4papi/routers/v1/stats/deployments.py +++ b/ai4papi/routers/v1/stats/deployments.py @@ -193,7 +193,7 @@ def get_cluster_stats( """ global cluster_stats - stats = cluster_stats + stats = copy.deepcopy(cluster_stats) namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo] From 654d65a996750eb62c9cb99e8466b79c60720b6a Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Thu, 20 Jun 2024 15:30:59 +0200 Subject: [PATCH 17/21] feat: add anti affinity for `ai4eosc` nodes Try to deploy iMagine jobs on nodes that are iMagine-exclusive. In this way, we leave AI4EOSC nodes for AI4EOSC users and for iMagine users only when iMagine nodes are fully booked. 
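As a rough model of the resulting placement bias (this is not Nomad's scheduler code, just an illustrative sketch of how a weight=-50 anti-affinity ranks otherwise-feasible nodes):

    import re

    def rank_nodes(nodes: list) -> list:
        # Nodes whose meta.namespace matches "ai4eosc" receive the -50
        # penalty from the jobspec, so iMagine-exclusive nodes are ranked
        # first and ai4eosc nodes are only chosen once those are full.
        def score(node: dict) -> int:
            namespace = node.get('Meta', {}).get('namespace', '')
            return -50 if re.search('ai4eosc', namespace) else 0
        return sorted(nodes, key=score, reverse=True)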
--- etc/modules/nomad.hcl | 10 ++++++++++ etc/tools/deep-oc-federated-server/nomad.hcl | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index c2a1fd7..7e38c5f 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -39,6 +39,16 @@ job "module-${JOB_UUID}" { value = "${NAMESPACE}" } + # Try to deploy iMagine jobs on nodes that are iMagine-exclusive + # In this way, we leave AI4EOSC nodes for AI4EOSC users and for iMagine users only + # when iMagine nodes are fully booked. + affinity { + attribute = "${meta.namespace}" + operator = "regexp" + value = "ai4eosc" + weight = -50 # anti-affinity for ai4eosc clients + } + # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid # overloading GPU clients with CPU-only jobs. affinity { diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index 0109b45..c9bca58 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -39,6 +39,16 @@ job "tool-fl-${JOB_UUID}" { value = "${NAMESPACE}" } + # Try to deploy iMagine jobs on nodes that are iMagine-exclusive + # In this way, we leave AI4EOSC nodes for AI4EOSC users and for iMagine users only + # when iMagine nodes are fully booked. + affinity { + attribute = "${meta.namespace}" + operator = "regexp" + value = "ai4eosc" + weight = -50 # anti-affinity for ai4eosc clients + } + # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid # overloading GPU clients with CPU-only jobs. affinity { From eafa4fda1d344493efb195eeadca97c0aba76abd Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Thu, 20 Jun 2024 16:03:25 +0200 Subject: [PATCH 18/21] feat: increase RAM limit --- etc/modules/user.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/etc/modules/user.yaml b/etc/modules/user.yaml index b822f96..e0db1f5 100644 --- a/etc/modules/user.yaml +++ b/etc/modules/user.yaml @@ -9,6 +9,16 @@ # options: restricted set of values that the parameter can take (optional) # description: some comments on the parameter to be displayed to the end user (optional) +# CPU/RAM limits are based on the GPU flavours we (IFCA) currently have deployed: +# * g13-gpuib-8-86: +# - 8 GPUs Nvidia Tesla T4 +# - 86 VCPUs --> ~9.6 cores / gpu (reserving 10% for the node) +# - 351 GB RAM --> ~40 GB / gpu (reserving 10% for the node) +# * g12-gpuib-2-64: +# - 2 GPUs Nvidia Tesla V100 +# - 64 VCPUs --> ~28 cores / gpu (reserving 10% for the node) +# - 127 GB RAM --> ~57 GB / gpu (reserving 10% for the node) + general: title: @@ -69,7 +79,7 @@ hardware: ram: name: RAM memory (in MB) value: 8000 - range: [2000, 25000] + range: [2000, 40000] disk: name: Disk memory (in MB) From bb8ff7e89a66f077d608a38066cabeadf7d6e128 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Tue, 9 Jul 2024 15:54:20 +0200 Subject: [PATCH 19/21] feat: enforce `node.meta.status=ready` --- etc/modules/nomad.hcl | 7 +++++++ etc/tools/deep-oc-federated-server/nomad.hcl | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index 9b0d2a7..21066d3 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -24,6 +24,13 @@ job "module-${JOB_UUID}" { description = "${DESCRIPTION}" } + # Only use nodes that have successfully passed the ai4-nomad_tests (ie.
meta.status=ready) + constraint { + attribute = "${meta.status}" + operator = "regexp" + value = "ready" + } + # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik) constraint { attribute = "${meta.compute}" diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index c9bca58..b825659 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -24,6 +24,13 @@ job "tool-fl-${JOB_UUID}" { description = "${DESCRIPTION}" } + # Only use nodes that have successfully passed the ai4-nomad_tests (ie. meta.status=ready) + constraint { + attribute = "${meta.status}" + operator = "regexp" + value = "ready" + } + # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik) constraint { attribute = "${meta.compute}" From 0709f3b49c16c6ab0b4142f28da4f0feb12cfe08 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Wed, 10 Jul 2024 13:52:56 +0200 Subject: [PATCH 20/21] feat(stats): add node status to stats --- ai4papi/routers/v1/stats/deployments.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py index 2b91652..b971903 100644 --- a/ai4papi/routers/v1/stats/deployments.py +++ b/ai4papi/routers/v1/stats/deployments.py @@ -282,7 +282,8 @@ def get_cluster_stats_bg(): - int(node['Attributes']['unique.storage.bytesfree'])) \ / 2**20 n_stats['gpu_models'] = {} - n_stats['namespaces'] = node['Meta']['namespace'] + n_stats['namespaces'] = node['Meta'].get('namespace', '') + n_stats['status'] = node['Meta'].get('status', '') if n['NodeResources']['Devices']: for devices in n['NodeResources']['Devices']: From a95b9572b9977dff1bb0c655704b623012b5cc7f Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Wed, 17 Jul 2024 10:38:13 +0200 Subject: [PATCH 21/21] fix(stats): do not aggregate node status --- ai4papi/routers/v1/stats/deployments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py index b971903..29b701b 100644 --- a/ai4papi/routers/v1/stats/deployments.py +++ b/ai4papi/routers/v1/stats/deployments.py @@ -217,7 +217,7 @@ def get_cluster_stats( for k, v in n_stats.items(): # Ignore keys - if k in ['name', 'namespaces', 'eligibility']: + if k in ['name', 'namespaces', 'eligibility', 'status']: continue # Aggregate nested gpu_models dict
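Taken together, the stats patches above (13-16 and 20-21) leave the per-VO stats endpoint following one pattern: deep-copy the shared background variable, filter nodes by namespace, then aggregate. A condensed, illustrative sketch (the `aggregate_cluster_stats` name and the input shapes are assumptions; the logic follows the diffs):

    import copy

    def aggregate_cluster_stats(cluster_stats: dict, namespace: str) -> dict:
        # Work on a deep copy so the shared background variable is never
        # mutated across requests (PATCH 16).
        stats = copy.deepcopy(cluster_stats)

        # Keep only nodes whose metadata advertises the requested namespace,
        # dropping datacenters that end up empty (PATCH 15).
        for dc, dc_stats in list(stats['datacenters'].items()):
            nodes = {nid: ns for nid, ns in dc_stats['nodes'].items()
                     if namespace in ns['namespaces']}
            if nodes:
                dc_stats['nodes'] = nodes
            else:
                del stats['datacenters'][dc]

        # Aggregate after filtering: skip descriptive keys (PATCHES 20-21)
        # and merge the nested per-GPU-model counters (PATCH 13).
        for dc_stats in stats['datacenters'].values():
            for n_stats in dc_stats['nodes'].values():
                for key, value in n_stats.items():
                    if key in ('name', 'namespaces', 'eligibility', 'status'):
                        continue
                    elif key == 'gpu_models':
                        for model, counters in value.items():
                            agg = stats['cluster']['gpu_models'].setdefault(
                                model, {'gpu_total': 0, 'gpu_used': 0})
                            for counter, count in counters.items():
                                agg[counter] += count
                    else:
                        stats['cluster'][key] += value
        return stats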