diff --git a/ai4papi/nomad/common.py b/ai4papi/nomad/common.py index 2250c31..83a50e3 100644 --- a/ai4papi/nomad/common.py +++ b/ai4papi/nomad/common.py @@ -17,6 +17,7 @@ from nomad.api import exceptions import requests +import ai4papi.conf as papiconf import ai4papi.nomad.patches as nomad_patches @@ -42,13 +43,14 @@ def get_deployments( namespace: str, owner: str, + prefix: str = "", ): """ Returns a list of all deployments belonging to a user, in a given namespace. """ job_filter = \ 'Status != "dead" and ' + \ - 'Name matches "^userjob" and ' + \ + f'Name matches "^{prefix}" and ' + \ 'Meta is not empty and ' + \ f'Meta.owner == "{owner}"' jobs = Nomad.jobs.get_jobs(namespace=namespace, filter_=job_filter) @@ -95,6 +97,7 @@ def get_deployment( # Create job info dict info = { 'job_ID': j['ID'], + 'name': j['Name'], 'status': '', # do not use j['Status'] as misleading 'owner': j['Meta']['owner'], 'title': j['Meta']['title'], @@ -114,7 +117,7 @@ def get_deployment( # Retrieve tasks tasks = j['TaskGroups'][0]['Tasks'] - usertask = [t for t in tasks if t['Name'] == 'usertask'][0] + usertask = [t for t in tasks if t['Name'] == 'main'][0] # Retrieve Docker image info['docker_image'] = usertask['Config']['image'] @@ -165,17 +168,6 @@ def get_deployment( except Exception: # return first endpoint info['main_endpoint'] = list(info['endpoints'].values())[0] - # Add active endpoints - if full_info: - info['active_endpoints'] = [] - for k, v in info['endpoints'].items(): - try: - r = session.get(v, timeout=2) - if r.status_code == 200: - info['active_endpoints'].append(k) - except (requests.exceptions.Timeout, requests.exceptions.ConnectionError): - continue - # Only fill resources if the job is allocated allocs = Nomad.job.get_allocations( id_=j['ID'], @@ -229,7 +221,7 @@ def get_deployment( # Add error messages if needed if info['status'] == 'failed': - info['error_msg'] = a['TaskStates']['usertask']['Events'][0]['Message'] + info['error_msg'] = a['TaskStates']['main']['Events'][0]['Message'] # Replace with clearer message if info['error_msg'] == 'Docker container exited with non-zero exit code: 1': @@ -245,12 +237,8 @@ def get_deployment( "the network is restored and you should be able to fully recover " \ "your deployment." - # Disable access to endpoints if there is a network cut - if info['status'] == 'down' and info['active_endpoints']: - info['active_endpoints'] = [] - # Add resources - res = a['AllocatedResources']['Tasks']['usertask'] + res = a['AllocatedResources']['Tasks']['main'] gpu = [d for d in res['Devices'] if d['Type'] == 'gpu'][0] if res['Devices'] else None cpu_cores = res['Cpu']['ReservedCores'] info['resources'] = { @@ -261,6 +249,26 @@ def get_deployment( 'disk_MB': a['AllocatedResources']['Shared']['DiskMB'], } + # Retrieve the node the jobs landed at in order to properly fill the endpoints + n = Nomad.node.get_node(a['NodeID']) + for k, v in info['endpoints'].items(): + info['endpoints'][k] = v.replace('${meta.domain}', n['Meta']['domain']) + + # Add active endpoints + if full_info: + info['active_endpoints'] = [] + for k, v in info['endpoints'].items(): + try: + r = session.get(v, timeout=2) + if r.status_code == 200: + info['active_endpoints'].append(k) + except (requests.exceptions.Timeout, requests.exceptions.ConnectionError): + continue + + # Disable access to endpoints if there is a network cut + if info['status'] == 'down' and info['active_endpoints']: + info['active_endpoints'] = [] + elif evals: # Something happened, job didn't deploy (eg. 
job needs port that's currently being used) # We have to return `placement failures message`. @@ -357,13 +365,19 @@ def delete_deployment( return {'status': 'success'} -def get_gpu_models(): +@cached(cache=TTLCache(maxsize=1024, ttl=1*60*60)) +def get_gpu_models(vo): """ - Retrieve available GPU models in the cluster. + Retrieve available GPU models in the cluster, filtering nodes by VO. """ gpu_models = set() nodes = Nomad.nodes.get_nodes(resources=True) for node in nodes: + # Discard nodes that don't belong to the requested VO + meta = Nomad.node.get_node(node['ID'])['Meta'] + if papiconf.MAIN_CONF['nomad']['namespaces'][vo] not in meta['namespace']: + continue + # Discard GPU models of nodes that are not eligible if node['SchedulingEligibility'] != 'eligible': continue diff --git a/ai4papi/routers/v1/catalog/modules.py b/ai4papi/routers/v1/catalog/modules.py index 065fdab..53544c9 100644 --- a/ai4papi/routers/v1/catalog/modules.py +++ b/ai4papi/routers/v1/catalog/modules.py @@ -87,7 +87,7 @@ def get_config( ) # Fill with available GPU models in the cluster - models = nomad.common.get_gpu_models() + models = nomad.common.get_gpu_models(vo) if models: conf["hardware"]["gpu_type"]["options"] += models diff --git a/ai4papi/routers/v1/deployments/modules.py b/ai4papi/routers/v1/deployments/modules.py index 10e2e9d..7a1f748 100644 --- a/ai4papi/routers/v1/deployments/modules.py +++ b/ai4papi/routers/v1/deployments/modules.py @@ -57,6 +57,7 @@ def get_deployments( jobs = nomad.get_deployments( namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo], owner=auth_info['id'], + prefix='module', ) # Retrieve info for jobs in namespace @@ -119,15 +120,10 @@ def get_deployment( ) # Check the deployment is indeed a module - tool_list = papiconf.TOOLS.keys() - module_name = re.search( - '/(.*):', # remove dockerhub account and tag - job['docker_image'], - ).group(1) - if module_name in tool_list: + if not job['name'].startswith('module'): raise HTTPException( status_code=400, - detail="This deployment is a tool, not a module.", + detail="This deployment is not a module.", ) return job @@ -207,19 +203,23 @@ def create_deployment( else: priority = 50 - # Generate a domain for user-app and check nothing is running there - domain = utils.generate_domain( + # Remove non-compliant characters from hostname + base_domain = papiconf.MAIN_CONF['lb']['domain'][vo] + hostname = utils.safe_hostname( hostname=user_conf['general']['hostname'], - base_domain=papiconf.MAIN_CONF['lb']['domain'][vo], job_uuid=job_uuid, ) - utils.check_domain(domain) - #TODO: remove when we solve disk issues - # For now on we fix disk here because, if not fixed, jobs are not being deployed - # (ie. "resource disk exhausted"). 
- # In any case, this limit is useless because it has not yet been passed to docker - user_conf['hardware']['disk'] = 500 + #TODO: reenable custom hostname, when we are able to parse all node metadata + # (domain key) to build the true domain + hostname = job_uuid + + # # Check the hostname is available in all data-centers + # # (we don't know beforehand where the job will land) + # #TODO: make sure this does not break if the datacenter is unavailable + # #TODO: disallow custom hostname, pain in the ass, slower deploys + # for datacenter in papiconf.MAIN_CONF['nomad']['datacenters']: + # utils.check_domain(f"{hostname}.{datacenter}-{base_domain}") # Replace the Nomad job template nomad_conf = nomad_conf.safe_substitute( @@ -232,7 +232,8 @@ def create_deployment( 'OWNER_EMAIL': auth_info['email'], 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters - 'DOMAIN': domain, + 'BASE_DOMAIN': base_domain, + 'HOSTNAME': hostname, 'DOCKER_IMAGE': user_conf['general']['docker_image'], 'DOCKER_TAG': user_conf['general']['docker_tag'], 'SERVICE': user_conf['general']['service'], @@ -256,7 +257,7 @@ def create_deployment( nomad_conf = nomad.load_job_conf(nomad_conf) tasks = nomad_conf['TaskGroups'][0]['Tasks'] - usertask = [t for t in tasks if t['Name']=='usertask'][0] + usertask = [t for t in tasks if t['Name']=='main'][0] # Apply patches if needed usertask = module_patches.patch_nextcloud_mount( diff --git a/ai4papi/routers/v1/deployments/tools.py b/ai4papi/routers/v1/deployments/tools.py index edb7064..e5764fe 100644 --- a/ai4papi/routers/v1/deployments/tools.py +++ b/ai4papi/routers/v1/deployments/tools.py @@ -60,6 +60,7 @@ def get_deployments( jobs = nomad.get_deployments( namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo], owner=auth_info['id'], + prefix='tool', ) # Retrieve info for jobs in namespace @@ -122,15 +123,10 @@ def get_deployment( ) # Check the deployment is indeed a tool - tool_list = papiconf.TOOLS.keys() - tool_name = re.search( - '/(.*):', # remove dockerhub account and tag - job['docker_image'], - ).group(1) - if tool_name not in tool_list: + if not job['name'].startswith('tool'): raise HTTPException( status_code=400, - detail="This deployment is a module, not a tool.", + detail="This deployment is not a tool.", ) return job @@ -201,13 +197,23 @@ def create_deployment( else: priority = 50 - # Generate a domain for user-app and check nothing is running there - domain = utils.generate_domain( + # Remove non-compliant characters from hostname + base_domain = papiconf.MAIN_CONF['lb']['domain'][vo] + hostname = utils.safe_hostname( hostname=user_conf['general']['hostname'], - base_domain=papiconf.MAIN_CONF['lb']['domain'][vo], job_uuid=job_uuid, ) - utils.check_domain(domain) + + #TODO: reenable custom hostname, when we are able to parse all node metadata + # (domain key) to build the true domain + hostname = job_uuid + + # # Check the hostname is available in all data-centers + # # (we don't know beforehand where the job will land) + # #TODO: make sure this does not break if the datacenter is unavailable + # #TODO: disallow custom hostname, pain in the ass, slower deploys + # for datacenter in papiconf.MAIN_CONF['nomad']['datacenters']: + # utils.check_domain(f"{hostname}.{datacenter}-{base_domain}") # Create a default secret for the Federated Server _ = ai4secrets.create_secret( @@ -237,7 +243,8 @@ def create_deployment( 'OWNER_EMAIL': auth_info['email'], 'TITLE': 
user_conf['general']['title'][:45], # keep only 45 first characters 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters - 'DOMAIN': domain, + 'BASE_DOMAIN': base_domain, + 'HOSTNAME': hostname, 'DOCKER_IMAGE': user_conf['general']['docker_image'], 'DOCKER_TAG': user_conf['general']['docker_tag'], 'CPU_NUM': user_conf['hardware']['cpu_num'], @@ -258,7 +265,7 @@ def create_deployment( nomad_conf = nomad.load_job_conf(nomad_conf) tasks = nomad_conf['TaskGroups'][0]['Tasks'] - usertask = [t for t in tasks if t['Name']=='usertask'][0] + usertask = [t for t in tasks if t['Name']=='main'][0] # Launch `deep-start` compatible service if needed service = user_conf['general']['service'] diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py index c0cc764..29b701b 100644 --- a/ai4papi/routers/v1/stats/deployments.py +++ b/ai4papi/routers/v1/stats/deployments.py @@ -181,6 +181,7 @@ def load_datacenters(): return datacenters +@cached(cache=TTLCache(maxsize=1024, ttl=30)) @router.get("/cluster") def get_cluster_stats( vo: str, @@ -192,13 +193,49 @@ def get_cluster_stats( """ global cluster_stats + stats = copy.deepcopy(cluster_stats) - #TODO: filter cluster stats to only return stats of the nodes that support a - # given VO. This is blocked until we move to the federated cluster where VO support - # is specified in the node metadata. - # (!) Total cluster resources will need to be computed after this filtering is done + namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo] - return cluster_stats + for k, v in stats['datacenters'].copy().items(): + + # Filter out nodes that do not support the given VO + nodes = {} + for n_id, n_stats in v['nodes'].items(): + if namespace in n_stats['namespaces']: + nodes[n_id] = n_stats + + # Ignore datacenters with no nodes + if not nodes: + del stats['datacenters'][k] + else: + stats['datacenters'][k]['nodes'] = nodes + + # Compute cluster stats after node filtering is done + for dc_stats in stats['datacenters'].values(): + for n_stats in dc_stats['nodes'].values(): + for k, v in n_stats.items(): + + # Ignore keys + if k in ['name', 'namespaces', 'eligibility', 'status']: + continue + + # Aggregate nested gpu_models dict + elif k == 'gpu_models': + for k1, v1 in v.items(): + model_stats = stats['cluster']['gpu_models'].get( + k1, + {'gpu_total': 0, 'gpu_used': 0,} # init value + ) + for k2, v2 in v1.items(): + model_stats[k2] += v2 + stats['cluster']['gpu_models'][k1] = model_stats + + # Aggregate other resources + else: + stats['cluster'][k] += v + + return stats @cached(cache=TTLCache(maxsize=1024, ttl=30)) @@ -229,7 +266,6 @@ def get_cluster_stats_bg(): # Load nodes nodes = Nomad.nodes.get_nodes(resources=True) - gpu_stats = {} nodes_dc = {} # dict(node, datacenter) # Get total stats for each node @@ -238,7 +274,6 @@ def get_cluster_stats_bg(): n_stats = {k: 0 for k in resources} n_stats['name'] = node['Name'] n_stats['eligibility'] = node['SchedulingEligibility'] - n_stats['jobs_num'] = 0 n_stats['cpu_total'] = int(node['Attributes']['cpu.numcores']) n_stats['ram_total'] = int(node['Attributes']['memory.totalbytes']) / 2**20 n_stats['disk_total'] = int(node['Attributes']['unique.storage.bytestotal']) / 2**20 @@ -247,6 +282,8 @@ def get_cluster_stats_bg(): - int(node['Attributes']['unique.storage.bytesfree'])) \ / 2**20 n_stats['gpu_models'] = {} + n_stats['namespaces'] = node['Meta'].get('namespace', '') + n_stats['status'] = node['Meta'].get('status', '') if n['NodeResources']['Devices']: for 
devices in n['NodeResources']['Devices']: @@ -254,13 +291,9 @@ def get_cluster_stats_bg(): n_stats['gpu_total'] += len(devices['Instances']) # Track stats per GPU model type - if devices['Name'] not in gpu_stats.keys(): - gpu_stats[devices['Name']] = {'gpu_total': 0, 'gpu_used': 0} - if devices['Name'] not in n_stats['gpu_models'].keys(): n_stats['gpu_models'][devices['Name']] = {'gpu_total': 0, 'gpu_used': 0} - gpu_stats[devices['Name']]['gpu_total'] += len(devices['Instances']) n_stats['gpu_models'][devices['Name']]['gpu_total'] += len(devices['Instances']) # If datacenter is not in csv, load default info @@ -276,6 +309,7 @@ def get_cluster_stats_bg(): for namespace in namespaces: jobs = Nomad.jobs.get_jobs(namespace=namespace, filter_='Status == "running"') for j in jobs: + # Retrieve full job for meta job = Nomad.job.get_job( id_=j['ID'], @@ -295,10 +329,12 @@ def get_cluster_stats_bg(): # Add resources datacenter = nodes_dc[a['NodeID']] n_stats = stats['datacenters'][datacenter]['nodes'][a['NodeID']] - if 'userjob' in job['Name']: + + #TODO: we are ignoring resources consumed by other jobs + if job['Name'].startswith('module') or job['Name'].startswith('tool'): n_stats['jobs_num'] += 1 - #FIXME: we are ignoring resources consumed by other tasks + #TODO: we are ignoring resources consumed by other tasks if 'usertask' in a['AllocatedResources']['Tasks']: res = a['AllocatedResources']['Tasks']['usertask'] @@ -314,12 +350,11 @@ def get_cluster_stats_bg(): gpu = [d for d in res['Devices'] if d['Type'] == 'gpu'][0] gpu_num = len(gpu['DeviceIDs']) if gpu else 0 - # Sometime the node fails and GPUs are not detected [1]. + # Sometimes the node fails and GPUs are not detected [1]. # In that case, avoid counting that GPU in the stats. # [1]: https://docs.ai4os.eu/en/latest/user/others/faq.html#my-gpu-just-disappeared-from-my-deployment if n_stats['gpu_models']: n_stats['gpu_used'] += gpu_num - gpu_stats[gpu['Name']]['gpu_used'] += gpu_num n_stats['gpu_models'][gpu['Name']]['gpu_used'] += gpu_num else: continue @@ -333,36 +368,7 @@ def get_cluster_stats_bg(): for r in ['cpu', 'gpu', 'ram', 'disk']: n_stats[f'{r}_total'] = n_stats[f'{r}_used'] for g_stats in n_stats['gpu_models'].values(): - g_stats[f'gpu_total'] = n_stats[f'gpu_used'] - - # Ignore datacenters with no nodes - for k, v in stats['datacenters'].copy().items(): - if not v['nodes']: - del stats['datacenters'][k] - - # Compute cluster stats - for dc_stats in stats['datacenters'].values(): - for n_stats in dc_stats['nodes'].values(): - for k, v in n_stats.items(): - - # Ignore keys - if k in ['name', 'eligibility']: - continue - - # Aggregate nested gpu_models dict - elif k == 'gpu_models': - for k1, v1 in v.items(): - model_stats = stats['cluster']['gpu_models'].get( - k1, - {'gpu_total': 0, 'gpu_used': 0,} # init value - ) - for k2, v2 in v1.items(): - model_stats[k2] += v2 - stats['cluster']['gpu_models'][k1] = model_stats - - # Aggregate other resources - else: - stats['cluster'][k] += v + g_stats['gpu_total'] = n_stats['gpu_used'] # Set the new shared variable global cluster_stats diff --git a/ai4papi/utils.py b/ai4papi/utils.py index a05f3bc..a16e727 100644 --- a/ai4papi/utils.py +++ b/ai4papi/utils.py @@ -11,9 +11,8 @@ session = requests.Session() -def generate_domain( +def safe_hostname( hostname: str, - base_domain: str, job_uuid: str, ): @@ -47,12 +46,10 @@ def generate_domain( detail="Hostname should be shorter than 40 characters." 
) - domain = f"{hostname}.{base_domain}" + return hostname - else: # we use job_ID as default subdomain - domain = f"{job_uuid}.{base_domain}" - - return domain + else: # we use job_ID as default hostname + return job_uuid def check_domain(base_url): diff --git a/docker/Dockerfile b/docker/Dockerfile index 3127da1..2e698b7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -20,7 +20,7 @@ RUN apt-get update && \ apt-get install nomad && \ rm -rf /var/lib/apt/lists/* -ENV NOMAD_ADDR=https://193.146.75.221:4646 +ENV NOMAD_ADDR=https://193.146.75.205:4646 ENV NOMAD_CACERT=/home/nomad-certs/nomad-ca.pem ENV NOMAD_CLIENT_CERT=/home/nomad-certs/cli.pem ENV NOMAD_CLIENT_KEY=/home/nomad-certs/cli-key.pem diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 13b7047..432e32c 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -6,7 +6,7 @@ services: image: "registry.services.ai4os.eu/ai4os/ai4-papi:prod" restart: always environment: - - NOMAD_ADDR=https://193.146.75.221:4646 + - NOMAD_ADDR=https://193.146.75.205:4646 - ACCOUNTING_PTH=/home/ai4-accounting - ZENODO_TOKEN=************************* volumes: @@ -21,7 +21,7 @@ services: image: "registry.services.ai4os.eu/ai4os/ai4-papi:prod" restart: always environment: - - NOMAD_ADDR=https://193.146.75.221:4646 + - NOMAD_ADDR=https://193.146.75.205:4646 - ACCOUNTING_PTH=/home/ai4-accounting - ZENODO_TOKEN=************************* volumes: diff --git a/etc/modules/nomad.hcl b/etc/modules/nomad.hcl index 4c2426f..21066d3 100644 --- a/etc/modules/nomad.hcl +++ b/etc/modules/nomad.hcl @@ -9,7 +9,7 @@ When replacing user values we use safe_substitute() so that ge don't get an erro replacing Nomad values */ -job "userjob-${JOB_UUID}" { +job "module-${JOB_UUID}" { namespace = "${NAMESPACE}" type = "service" region = "global" @@ -24,6 +24,38 @@ job "userjob-${JOB_UUID}" { description = "${DESCRIPTION}" } + # Only use nodes that have succesfully passed the ai4-nomad_tests (ie. meta.status=ready) + constraint { + attribute = "${meta.status}" + operator = "regexp" + value = "ready" + } + + # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik) + constraint { + attribute = "${meta.compute}" + operator = "=" + value = "true" + } + + # Only deploy in nodes serving that namespace (we use metadata instead of node-pools + # because Nomad does not allow a node to belong to several node pools) + constraint { + attribute = "${meta.namespace}" + operator = "regexp" + value = "${NAMESPACE}" + } + + # Try to deploy iMagine jobs on nodes that are iMagine-exclusive + # In this way, we leave AI4EOSC nodes for AI4EOSC users and for iMagine users only + # when iMagine nodes are fully booked. + affinity { + attribute = "${meta.namespace}" + operator = "regexp" + value = "ai4eosc" + weight = -50 # anti-affinity for ai4eosc clients + } + # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid # overloading GPU clients with CPU-only jobs. 
affinity { @@ -68,7 +100,7 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-api.tls=true", - "traefik.http.routers.${JOB_UUID}-api.rule=Host(`api-${DOMAIN}`, `www.api-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-api.rule=Host(`api-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.api-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", ] } @@ -78,7 +110,7 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-monitor.tls=true", - "traefik.http.routers.${JOB_UUID}-monitor.rule=Host(`monitor-${DOMAIN}`, `www.monitor-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-monitor.rule=Host(`monitor-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.monitor-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", ] } @@ -88,7 +120,7 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-ide.tls=true", - "traefik.http.routers.${JOB_UUID}-ide.rule=Host(`ide-${DOMAIN}`, `www.ide-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-ide.rule=Host(`ide-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.ide-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", ] } @@ -96,7 +128,7 @@ job "userjob-${JOB_UUID}" { size = ${DISK} } - task "storagetask" { + task "storage_mount" { // Running task in charge of mounting storage lifecycle { @@ -162,7 +194,7 @@ job "userjob-${JOB_UUID}" { } - task "usertask" { + task "main" { // Task configured by the user (deepaas, jupyter, vscode) driver = "docker" @@ -178,6 +210,9 @@ job "userjob-${JOB_UUID}" { volumes = [ "/nomad-storage/${JOB_UUID}:/storage:shared", ] + storage_opt = { + size = "${DISK}M" + } } env { @@ -209,7 +244,7 @@ job "userjob-${JOB_UUID}" { } } - task "storagecleanup" { + task "storage_cleanup" { // Unmount empty storage folder and delete it from host lifecycle { diff --git a/etc/modules/user.yaml b/etc/modules/user.yaml index b9196c1..d4cafba 100644 --- a/etc/modules/user.yaml +++ b/etc/modules/user.yaml @@ -9,6 +9,16 @@ # options: restricted set of values that the parameter can take (optional) # description: some comments on the parameter to be displayed to the end user (optional) +# CPU/RAM limits are based on the current have GPU flavours we (IFCA) have deployed: +# * g13-gpuib-8-86: +# - 8 GPUs Nvidia Tesla T4 +# - 86 VCPUs --> ~9.6 cores / gpu (reserving 10% for the node) +# - 351 GB RAM --> ~40 GB / gpu (reserving 10% for the node) +# * g12-gpuib-2-64: +# - 2 GPUs Nvidia Tesla V100 +# - 64 VCPUs --> ~28 cores / gpu (reserving 10% for the node) +# - 127 GB RAM --> ~57 GB / gpu (reserving 10% for the node) + general: title: @@ -69,12 +79,12 @@ hardware: ram: name: RAM memory (in MB) value: 8000 - range: [2000, 25000] + range: [2000, 40000] disk: name: Disk memory (in MB) value: 10000 - range: [1000, 20000] + range: [1000, 50000] storage: diff --git a/etc/tools/deep-oc-federated-server/nomad.hcl b/etc/tools/deep-oc-federated-server/nomad.hcl index dc320a0..b825659 100644 --- a/etc/tools/deep-oc-federated-server/nomad.hcl +++ b/etc/tools/deep-oc-federated-server/nomad.hcl @@ -9,7 +9,7 @@ When replacing user values we use safe_substitute() so that ge don't get an erro replacing Nomad values */ -job "userjob-${JOB_UUID}" { +job "tool-fl-${JOB_UUID}" { namespace = "${NAMESPACE}" type = "service" region = "global" @@ -24,6 +24,38 @@ job "userjob-${JOB_UUID}" { description = "${DESCRIPTION}" } + # Only use nodes that have succesfully passed the ai4-nomad_tests (ie. 
meta.status=ready) + constraint { + attribute = "${meta.status}" + operator = "regexp" + value = "ready" + } + + # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik) + constraint { + attribute = "${meta.compute}" + operator = "=" + value = "true" + } + + # Only deploy in nodes serving that namespace (we use metadata instead of node-pools + # because Nomad does not allow a node to belong to several node pools) + constraint { + attribute = "${meta.namespace}" + operator = "regexp" + value = "${NAMESPACE}" + } + + # Try to deploy iMagine jobs on nodes that are iMagine-exclusive + # In this way, we leave AI4EOSC nodes for AI4EOSC users and for iMagine users only + # when iMagine nodes are fully booked. + affinity { + attribute = "${meta.namespace}" + operator = "regexp" + value = "ai4eosc" + weight = -50 # anti-affinity for ai4eosc clients + } + # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid # overloading GPU clients with CPU-only jobs. affinity { @@ -68,7 +100,7 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-fedserver.tls=true", - "traefik.http.routers.${JOB_UUID}-fedserver.rule=Host(`fedserver-${DOMAIN}`, `www.fedserver-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-fedserver.rule=Host(`fedserver-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.fedserver-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", "traefik.http.services.${JOB_UUID}-fedserver.loadbalancer.server.scheme=h2c", # grpc support ] } @@ -79,7 +111,7 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-monitor.tls=true", - "traefik.http.routers.${JOB_UUID}-monitor.rule=Host(`monitor-${DOMAIN}`, `www.monitor-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-monitor.rule=Host(`monitor-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.monitor-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", ] } @@ -89,7 +121,7 @@ job "userjob-${JOB_UUID}" { tags = [ "traefik.enable=true", "traefik.http.routers.${JOB_UUID}-ide.tls=true", - "traefik.http.routers.${JOB_UUID}-ide.rule=Host(`ide-${DOMAIN}`, `www.ide-${DOMAIN}`)", + "traefik.http.routers.${JOB_UUID}-ide.rule=Host(`ide-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`, `www.ide-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}`)", ] } @@ -97,7 +129,7 @@ job "userjob-${JOB_UUID}" { size = ${DISK} } - task "usertask" { + task "main" { driver = "docker" # Use default command defined in the Dockerfile @@ -107,6 +139,9 @@ job "userjob-${JOB_UUID}" { ports = ["fedserver", "monitor", "ide"] shm_size = ${SHARED_MEMORY} memory_hard_limit = ${RAM} + storage_opt = { + size = "${DISK}M" + } } env {
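
The sketches below are not part of the patch; they restate, in plain Python, the main pieces of logic the diff introduces. Owner IDs, URLs and dictionary layouts in them are illustrative placeholders unless they appear verbatim in the diff.

Deployments are now told apart by their Nomad job name prefix (module-${JOB_UUID} vs tool-fl-${JOB_UUID}) rather than by parsing the Docker image, so get_deployments() gains a prefix argument that is folded into the Nomad jobs filter. A minimal sketch of the filter it builds:

def build_job_filter(owner: str, prefix: str = "") -> str:
    # Mirrors the filter assembled in ai4papi/nomad/common.py:get_deployments()
    return (
        'Status != "dead" and '
        f'Name matches "^{prefix}" and '
        'Meta is not empty and '
        f'Meta.owner == "{owner}"'
    )

# Example with a placeholder owner ID
print(build_job_filter(owner="some-egi-checkin-id", prefix="module"))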
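
get_gpu_models() becomes VO-aware and is cached for an hour: nodes whose metadata does not list the VO's namespace are skipped before their GPU devices are collected. A sketch assuming a configured python-nomad client and the ai4papi.conf.MAIN_CONF mapping; the device-collection step is reconstructed from the surrounding code rather than copied from it:

from cachetools import TTLCache, cached
import nomad

Nomad = nomad.Nomad()  # NOMAD_ADDR and client certs taken from the environment, as in the Dockerfile
MAIN_CONF = {'nomad': {'namespaces': {'vo.ai4eosc.eu': 'ai4eosc'}}}  # illustrative mapping

@cached(cache=TTLCache(maxsize=1024, ttl=1 * 60 * 60))  # refresh at most hourly per VO
def get_gpu_models(vo: str) -> set:
    # Return the GPU models available on eligible nodes serving the given VO
    gpu_models = set()
    namespace = MAIN_CONF['nomad']['namespaces'][vo]
    for node in Nomad.nodes.get_nodes(resources=True):
        # Discard nodes that don't belong to the requested VO
        meta = Nomad.node.get_node(node['ID'])['Meta']
        if namespace not in meta.get('namespace', ''):
            continue
        # Discard GPU models of nodes that are not eligible for scheduling
        if node['SchedulingEligibility'] != 'eligible':
            continue
        devices = node['NodeResources']['Devices'] or []
        gpu_models.update(d['Name'] for d in devices if d['Type'] == 'gpu')
    return gpu_models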
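
The active_endpoints probe moves after the allocation lookup so that the ${meta.domain} placeholder in each endpoint URL can first be resolved from the metadata of the node the job landed on. A standalone sketch of that step; the endpoint URL in the usage example is made up:

import requests

session = requests.Session()

def resolve_and_probe(endpoints: dict, node_domain: str, status: str):
    # Substitute the node-specific domain into every endpoint URL
    resolved = {
        name: url.replace('${meta.domain}', node_domain)
        for name, url in endpoints.items()
    }
    # Keep only endpoints answering HTTP 200 within 2 seconds
    active = []
    for name, url in resolved.items():
        try:
            if session.get(url, timeout=2).status_code == 200:
                active.append(name)
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
            continue
    # Disable access to endpoints if there is a network cut
    if status == 'down':
        active = []
    return resolved, active

# Hypothetical endpoint built from the Traefik rule pattern ide-${HOSTNAME}.${meta.domain}-${BASE_DOMAIN}
endpoints = {'ide': 'https://ide-abc123.${meta.domain}-deployments.example.eu'}
resolved, active = resolve_and_probe(endpoints, node_domain='ifca', status='running')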
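
The /cluster stats route now filters the background snapshot per VO and re-aggregates cluster totals at request time (cached for 30 s with TTLCache), instead of aggregating once for everyone inside get_cluster_stats_bg(). A sketch of that request-time pass, assuming the snapshot has the node-level layout built by the background task and zero-initialised counters under the 'cluster' key:

import copy

def filter_vo_stats(snapshot: dict, namespace: str) -> dict:
    stats = copy.deepcopy(snapshot)  # never mutate the shared background copy

    # Keep only nodes whose metadata lists the VO namespace; drop empty datacenters
    for dc, dc_stats in stats['datacenters'].copy().items():
        nodes = {n_id: n for n_id, n in dc_stats['nodes'].items()
                 if namespace in n['namespaces']}
        if nodes:
            stats['datacenters'][dc]['nodes'] = nodes
        else:
            del stats['datacenters'][dc]

    # Re-aggregate cluster totals from the surviving nodes
    for dc_stats in stats['datacenters'].values():
        for n_stats in dc_stats['nodes'].values():
            for key, value in n_stats.items():
                if key in ('name', 'namespaces', 'eligibility', 'status'):
                    continue  # descriptive fields, not resources
                if key == 'gpu_models':
                    for model, counts in value.items():
                        agg = stats['cluster']['gpu_models'].setdefault(
                            model, {'gpu_total': 0, 'gpu_used': 0})
                        for k, v in counts.items():
                            agg[k] += v
                else:
                    stats['cluster'][key] += value

    return stats

Because the route itself is wrapped in @cached(cache=TTLCache(maxsize=1024, ttl=30)), repeated requests for the same VO within that window reuse the aggregated result rather than re-walking every node.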