diff --git a/ai4papi/conf.py b/ai4papi/conf.py index 6c35a2f..1ec5bda 100644 --- a/ai4papi/conf.py +++ b/ai4papi/conf.py @@ -88,6 +88,15 @@ def load_yaml_conf(fpath): } } +# For tools, map the Nomad job name prefixes to tool IDs +tools_nomad2id = { + 'fl': 'ai4os-federated-server', + 'cvat': 'ai4os-cvat', +} +for tool in TOOLS.keys(): + if tool not in tools_nomad2id.values(): + raise Exception(f"The tool {tool} is missing from the mapping dictionary.") + # OSCAR template with open(paths['conf'] / 'oscar.yaml', 'r') as f: OSCAR_TMPL = Template(f.read()) diff --git a/ai4papi/nomad/common.py b/ai4papi/nomad/common.py index 77491c3..ea33750 100644 --- a/ai4papi/nomad/common.py +++ b/ai4papi/nomad/common.py @@ -133,7 +133,7 @@ def get_deployment( # Iterate through tags to find `Host` tag for t in s['Tags']: try: - url = re.search('Host\(`(.+?)`', t).group(1) + url = re.search(r'Host\(`(.+?)`', t).group(1) break except Exception: url = "missing-endpoint" @@ -166,7 +166,7 @@ def get_deployment( info['main_endpoint'] = service2endpoint[service] except Exception: # return first endpoint - info['main_endpoint'] = list(info['endpoints'].values())[0] + info['main_endpoint'] = list(info['endpoints'].keys())[0] # Only fill resources if the job is allocated allocs = Nomad.job.get_allocations( @@ -274,7 +274,7 @@ def get_deployment( # Something happened, job didn't deploy (eg. job needs port that's currently being used) # We have to return `placement failures message`. info['status'] = 'error' - info['error_msg'] = f"{evals[0]['FailedTGAllocs']}" + info['error_msg'] = f"{evals[0].get('FailedTGAllocs', '')}" else: # info['error_msg'] = f"Job has not been yet evaluated. Contact with support sharing your job ID: {j['ID']}." diff --git a/ai4papi/routers/v1/__init__.py b/ai4papi/routers/v1/__init__.py index 9aa741a..2a1b846 100644 --- a/ai4papi/routers/v1/__init__.py +++ b/ai4papi/routers/v1/__init__.py @@ -1,6 +1,6 @@ import fastapi -from . import catalog, deployments, inference, secrets, stats, try_me +from . import catalog, deployments, inference, secrets, stats, storage, try_me router = fastapi.APIRouter() @@ -9,6 +9,7 @@ router.include_router(inference.router) router.include_router(secrets.router) router.include_router(stats.router) +router.include_router(storage.router) router.include_router(try_me.router) diff --git a/ai4papi/routers/v1/catalog/common.py b/ai4papi/routers/v1/catalog/common.py index e91f060..f2c82e1 100644 --- a/ai4papi/routers/v1/catalog/common.py +++ b/ai4papi/routers/v1/catalog/common.py @@ -22,6 +22,7 @@ This means you cannot name your modules like those names (eg. tags, detail, etc) """ +import configparser import re from typing import Tuple, Union import yaml @@ -32,12 +33,17 @@ import requests from ai4papi import utils +import ai4papi.conf as papiconf class Catalog: - def __init__(self) -> None: - pass + def __init__(self, repo: str) -> None: + """ + Parameters: + * repo: Github repo where the catalog is hosted (via git submodules) + """ + self.repo = repo @cached(cache=TTLCache(maxsize=1024, ttl=6*60*60)) @@ -57,8 +63,26 @@ def get_items( This is implemented in a separate function as many functions from this router are using this function, so we need to avoid infinite recursions. 
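+        The item list is read from the `.gitmodules` file of the catalog repo
+        (`self.repo`).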
""" - return {} + gitmodules_url = f"https://raw.githubusercontent.com/{self.repo}/master/.gitmodules" + r = requests.get(gitmodules_url) + + cfg = configparser.ConfigParser() + cfg.read_string(r.text) + modules = {} + for section in cfg.sections(): + items = dict(cfg.items(section)) + key = items.pop('path').lower() + items['url'] = items['url'].replace('.git', '') # remove `.git`, if present + modules[key] = items + + # In the case of the tools repo, make sure to remove any tool that is not yet + # supported by PAPI (use the ^ operator to only keep common items) + if 'tool' in self.repo: + for tool_name in papiconf.TOOLS.keys() ^ modules.keys(): + _ = modules.pop(tool_name) + + return modules @cached(cache=TTLCache(maxsize=1024, ttl=6*60*60)) def get_filtered_list( diff --git a/ai4papi/routers/v1/catalog/modules.py b/ai4papi/routers/v1/catalog/modules.py index 3ee1825..339f48d 100644 --- a/ai4papi/routers/v1/catalog/modules.py +++ b/ai4papi/routers/v1/catalog/modules.py @@ -1,35 +1,14 @@ -import configparser from copy import deepcopy import types -from cachetools import cached, TTLCache from fastapi import APIRouter, HTTPException from natsort import natsorted -import requests from ai4papi import quotas, nomad import ai4papi.conf as papiconf from .common import Catalog, retrieve_docker_tags -@cached(cache=TTLCache(maxsize=1024, ttl=6*60*60)) -def get_items(self): - gitmodules_url = "https://raw.githubusercontent.com/ai4os-hub/modules-catalog/master/.gitmodules" - r = requests.get(gitmodules_url) - - cfg = configparser.ConfigParser() - cfg.read_string(r.text) - - modules = {} - for section in cfg.sections(): - items = dict(cfg.items(section)) - key = items.pop('path').lower() - items['url'] = items['url'].replace('.git', '') # remove `.git`, if present - modules[key] = items - - return modules - - def get_config( self, item_name: str, @@ -91,8 +70,9 @@ def get_config( return conf -Modules = Catalog() -Modules.get_items = types.MethodType(get_items, Modules) +Modules = Catalog( + repo='ai4os-hub/modules-catalog', +) Modules.get_config = types.MethodType(get_config, Modules) diff --git a/ai4papi/routers/v1/catalog/tools.py b/ai4papi/routers/v1/catalog/tools.py index 473415f..22a0446 100644 --- a/ai4papi/routers/v1/catalog/tools.py +++ b/ai4papi/routers/v1/catalog/tools.py @@ -1,31 +1,15 @@ from copy import deepcopy import types -from cachetools import cached, TTLCache from fastapi import APIRouter, HTTPException +from fastapi.security import HTTPBearer from ai4papi import quotas import ai4papi.conf as papiconf from .common import Catalog, retrieve_docker_tags -@cached(cache=TTLCache(maxsize=1024, ttl=6*60*60)) -def get_items(self): - # Set default branch manually (because we are not yet reading this from submodules) - # TODO: start reading from submodules (only accept the submodules that have been - # integrated in papiconf.TOOLS) - tools_branches= { - 'ai4os-federated-server': 'main', - } - - tools = {} - for k in papiconf.TOOLS.keys(): - tools[k] = { - 'url': f'https://github.com/ai4os/{k}', - 'branch': tools_branches[k], - } - - return tools +security = HTTPBearer() def get_config( @@ -33,6 +17,12 @@ def get_config( item_name: str, vo: str, ): + """ + Returns the default configuration (dict) for creating a deployment + for a specific item. It is prefilled with the appropriate + docker image and the available docker tags. 
+ """ + # Retrieve tool configuration try: conf = deepcopy(papiconf.TOOLS[item_name]['user']['full']) @@ -51,25 +41,27 @@ def get_config( if repo not in ['deephdc', 'ai4oshub']: repo = 'ai4oshub' - # Fill with correct Docker image - conf["general"]["docker_image"]["value"] = f"{repo}/{image}" + # Fill with correct Docker image and tags (not needed for CVAT because hardcoded) + if item_name != 'ai4os-cvat': + conf["general"]["docker_image"]["value"] = f"{repo}/{image}" - # Add available Docker tags - tags = retrieve_docker_tags(image=image, repo=repo) - conf["general"]["docker_tag"]["options"] = tags - conf["general"]["docker_tag"]["value"] = tags[0] + tags = retrieve_docker_tags(image=image, repo=repo) + conf["general"]["docker_tag"]["options"] = tags + conf["general"]["docker_tag"]["value"] = tags[0] # Modify the resources limits for a given user or VO - conf["hardware"] = quotas.limit_resources( - item_name=item_name, - vo=vo, - ) + if conf.get("hardware", None): + conf["hardware"] = quotas.limit_resources( + item_name=item_name, + vo=vo, + ) return conf -Tools = Catalog() -Tools.get_items = types.MethodType(get_items, Tools) +Tools = Catalog( + repo='ai4os/tools-catalog', +) Tools.get_config = types.MethodType(get_config, Tools) diff --git a/ai4papi/routers/v1/deployments/tools.py b/ai4papi/routers/v1/deployments/tools.py index 82a16a0..49cf252 100644 --- a/ai4papi/routers/v1/deployments/tools.py +++ b/ai4papi/routers/v1/deployments/tools.py @@ -1,4 +1,5 @@ from copy import deepcopy +from datetime import datetime import re import secrets import types @@ -12,6 +13,7 @@ from ai4papi import auth, quotas, utils import ai4papi.conf as papiconf import ai4papi.nomad.common as nomad +from ai4papi.routers.v1.catalog.tools import Tools as Tools_catalog from ai4papi.routers.v1 import secrets as ai4secrets @@ -128,12 +130,28 @@ def get_deployment( detail="This deployment is not a tool.", ) + # Add an additional field with the tool type + # We map name from Nomad job to tool ID + match = re.search(r'tool-(.*?)-[a-f0-9-]{36}', job['name']) + nomad_name = match.group(1) if match else '' + tool_id = papiconf.tools_nomad2id.get(nomad_name, '') + job['tool_name'] = tool_id + + # Additional checks + if tool_id == 'ai4os-cvat': + # Remove useless endpoints (they all point to same url) + ignore = ['server', 'grafana'] + job['endpoints'] = {k: v for k, v in job['endpoints'].items() if k not in ignore} + if job['active_endpoints']: + job['active_endpoints'] = [k for k in job['active_endpoints'] if k not in ignore] + return job @router.post("") def create_deployment( vo: str, + tool_name: str, conf: Union[dict, None] = None, authorization=Depends(security), ): @@ -164,11 +182,12 @@ def create_deployment( auth_info = auth.get_user_info(token=authorization.credentials) auth.check_vo_membership(vo, auth_info['vos']) - # Retrieve toolname from configuration, else deploy first tool in the list - try: - tool_name = conf["general"]["docker_image"].split('/')[1] # deephdc/* - except Exception: - tool_name = list(papiconf.TOOLS.keys())[0] + # Check tool_ID + if tool_name not in Tools_catalog.get_items().keys(): + raise HTTPException( + status_code=400, + detail="This ID does not correspond to an available tool.", + ) # Load tool configuration nomad_conf = deepcopy(papiconf.TOOLS[tool_name]['nomad']) @@ -185,10 +204,12 @@ def create_deployment( user_conf = utils.validate_conf(user_conf) # Check if the provided configuration is within the job quotas - quotas.check_jobwise( - conf=user_conf, - vo=vo, - ) + # Skip this 
check with CVAT because it does not have a "hardware" section in the conf + if tool_name not in ['ai4os-cvat']: + quotas.check_jobwise( + conf=user_conf, + vo=vo, + ) # Generate UUID from (MAC address+timestamp) so it's unique job_uuid = uuid.uuid1() @@ -201,71 +222,127 @@ def create_deployment( base_domain = papiconf.MAIN_CONF['lb']['domain'][vo] - # Create a default secret for the Federated Server - _ = ai4secrets.create_secret( - vo=vo, - secret_path=f"deployments/{job_uuid}/federated/default", - secret_data={'token': secrets.token_hex()}, - authorization=SimpleNamespace( - credentials=authorization.credentials, - ), - ) + # Deploy a Federated server + if tool_name == 'ai4os-federated-server': - # Create a Vault token so that the deployment can access the Federated secret - vault_token = ai4secrets.create_vault_token( - jwt=authorization.credentials, - issuer=auth_info['issuer'], - ttl='365d', # 1 year expiration date - ) + # Create a default secret for the Federated Server + _ = ai4secrets.create_secret( + vo=vo, + secret_path=f"deployments/{job_uuid}/federated/default", + secret_data={'token': secrets.token_hex()}, + authorization=SimpleNamespace( + credentials=authorization.credentials, + ), + ) - # Replace the Nomad job template - nomad_conf = nomad_conf.safe_substitute( - { - 'JOB_UUID': job_uuid, - 'NAMESPACE': papiconf.MAIN_CONF['nomad']['namespaces'][vo], - 'PRIORITY': priority, - 'OWNER': auth_info['id'], - 'OWNER_NAME': auth_info['name'], - 'OWNER_EMAIL': auth_info['email'], - 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters - 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters - 'BASE_DOMAIN': base_domain, - 'HOSTNAME': job_uuid, - 'DOCKER_IMAGE': user_conf['general']['docker_image'], - 'DOCKER_TAG': user_conf['general']['docker_tag'], - 'CPU_NUM': user_conf['hardware']['cpu_num'], - 'RAM': user_conf['hardware']['ram'], - 'DISK': user_conf['hardware']['disk'], - 'SHARED_MEMORY': user_conf['hardware']['ram'] * 10**6 * 0.5, - # Limit at 50% of RAM memory, in bytes - 'JUPYTER_PASSWORD': user_conf['general']['jupyter_password'], - 'VAULT_TOKEN': vault_token, - 'FEDERATED_ROUNDS': user_conf['configuration']['rounds'], - 'FEDERATED_METRIC': user_conf['configuration']['metric'], - 'FEDERATED_MIN_FIT_CLIENTS': user_conf['configuration']['min_fit_clients'], - 'FEDERATED_MIN_AVAILABLE_CLIENTS': user_conf['configuration']['min_available_clients'], - 'FEDERATED_STRATEGY': user_conf['configuration']['strategy'], - 'MU_FEDPROX': user_conf['configuration']['mu'], - 'FEDAVGM_SERVER_FL' : user_conf['configuration']['fl'], - 'FEDAVGM_SERVER_MOMENTUM': user_conf['configuration']['momentum'], - 'DP': user_conf['configuration']['dp'], - 'NOISE_MULT': user_conf['configuration']['noise_mult'], - 'SAMPLED_CLIENTS': user_conf['configuration']['sampled_clients'], - 'CLIP_NORM': user_conf['configuration']['clip_norm'] - } - ) + # Create a Vault token so that the deployment can access the Federated secret + vault_token = ai4secrets.create_vault_token( + jwt=authorization.credentials, + issuer=auth_info['issuer'], + ttl='365d', # 1 year expiration date + ) + + # Replace the Nomad job template + nomad_conf = nomad_conf.safe_substitute( + { + 'JOB_UUID': job_uuid, + 'NAMESPACE': papiconf.MAIN_CONF['nomad']['namespaces'][vo], + 'PRIORITY': priority, + 'OWNER': auth_info['id'], + 'OWNER_NAME': auth_info['name'], + 'OWNER_EMAIL': auth_info['email'], + 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters + 'DESCRIPTION': 
user_conf['general']['desc'][:1000], # limit to 1K characters + 'BASE_DOMAIN': base_domain, + 'HOSTNAME': job_uuid, + 'DOCKER_IMAGE': user_conf['general']['docker_image'], + 'DOCKER_TAG': user_conf['general']['docker_tag'], + 'CPU_NUM': user_conf['hardware']['cpu_num'], + 'RAM': user_conf['hardware']['ram'], + 'DISK': user_conf['hardware']['disk'], + 'SHARED_MEMORY': user_conf['hardware']['ram'] * 10**6 * 0.5, + # Limit at 50% of RAM memory, in bytes + 'JUPYTER_PASSWORD': user_conf['general']['jupyter_password'], + 'VAULT_TOKEN': vault_token, + 'FEDERATED_ROUNDS': user_conf['configuration']['rounds'], + 'FEDERATED_METRIC': user_conf['configuration']['metric'], + 'FEDERATED_MIN_FIT_CLIENTS': user_conf['configuration']['min_fit_clients'], + 'FEDERATED_MIN_AVAILABLE_CLIENTS': user_conf['configuration']['min_available_clients'], + 'FEDERATED_STRATEGY': user_conf['configuration']['strategy'], + 'MU_FEDPROX': user_conf['configuration']['mu'], + 'FEDAVGM_SERVER_FL' : user_conf['configuration']['fl'], + 'FEDAVGM_SERVER_MOMENTUM': user_conf['configuration']['momentum'], + 'DP': user_conf['configuration']['dp'], + 'NOISE_MULT': user_conf['configuration']['noise_mult'], + 'SAMPLED_CLIENTS': user_conf['configuration']['sampled_clients'], + 'CLIP_NORM': user_conf['configuration']['clip_norm'] + } + ) - # Convert template to Nomad conf - nomad_conf = nomad.load_job_conf(nomad_conf) + # Convert template to Nomad conf + nomad_conf = nomad.load_job_conf(nomad_conf) - tasks = nomad_conf['TaskGroups'][0]['Tasks'] - usertask = [t for t in tasks if t['Name']=='main'][0] + tasks = nomad_conf['TaskGroups'][0]['Tasks'] + usertask = [t for t in tasks if t['Name']=='main'][0] + + # Launch `deep-start` compatible service if needed + service = user_conf['general']['service'] + if service in ['deepaas', 'jupyter', 'vscode']: + usertask['Config']['command'] = 'deep-start' + usertask['Config']['args'] = [f'--{service}'] + + # Deploy a CVAT tool + elif tool_name == 'ai4os-cvat': + + # Enforce defining CVAT username and password + cvat = {k: v for k, v in user_conf['general'].items() if k in ['cvat_username', 'cvat_password']} + if not all(cvat.values()): + raise HTTPException( + status_code=400, + detail="You must fill all CVAT-related variables.", + ) + + # Enforce all rclone vars are defined + rclone = {k: v for k, v in user_conf['storage'].items() if k.startswith('rclone')} + if not all(rclone.values()): + raise HTTPException( + status_code=400, + detail="You must fill all RCLONE-related variables.", + ) + + # Replace the Nomad job template + job_title = re.sub( + r'[<>:"/\\|?* ]', + '_', + user_conf['general']['title'][:45], + ) # make title foldername-friendly + + nomad_conf = nomad_conf.safe_substitute( + { + 'JOB_UUID': job_uuid, + 'NAMESPACE': papiconf.MAIN_CONF['nomad']['namespaces'][vo], + 'PRIORITY': priority, + 'OWNER': auth_info['id'], + 'OWNER_NAME': auth_info['name'], + 'OWNER_EMAIL': auth_info['email'], + 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters + 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters + 'BASE_DOMAIN': base_domain, + 'HOSTNAME': job_uuid, + 'CVAT_USERNAME': user_conf['general']['cvat_username'], + 'CVAT_PASSWORD': user_conf['general']['cvat_password'], + 'RESTORE_FROM': user_conf['storage']['cvat_backup'], + 'BACKUP_NAME': f'{job_title}', + 'RCLONE_CONFIG_RSHARE_URL': user_conf['storage']['rclone_url'], + 'RCLONE_CONFIG_RSHARE_VENDOR': user_conf['storage']['rclone_vendor'], + 'RCLONE_CONFIG_RSHARE_USER': 
user_conf['storage']['rclone_user'],
+                'RCLONE_CONFIG_RSHARE_PASS': user_conf['storage']['rclone_password'],
+                'RCLONE_CONFIG': user_conf['storage']['rclone_conf'],
+            }
+        )
 
-    # Launch `deep-start` compatible service if needed
-    service = user_conf['general']['service']
-    if service in ['deepaas', 'jupyter', 'vscode']:
-        usertask['Config']['command'] = 'deep-start'
-        usertask['Config']['args'] = [f'--{service}']
+        # Convert template to Nomad conf
+        nomad_conf = nomad.load_job_conf(nomad_conf)
 
     # Submit job
     r = nomad.create_deployment(nomad_conf)
@@ -300,15 +377,15 @@
     )
 
     # Remove Vault secrets belonging to that deployment
-    r = ai4secrets.get_secrets(
+    secrets = ai4secrets.get_secrets(
         vo=vo,
         subpath=f"/deployments/{deployment_uuid}",
         authorization=SimpleNamespace(
             credentials=authorization.credentials,
         ),
     )
-    for path in r.keys():
-        r = ai4secrets.delete_secret(
+    for path in secrets.keys():
+        _ = ai4secrets.delete_secret(
             vo=vo,
             secret_path=path,
             authorization=SimpleNamespace(
diff --git a/ai4papi/routers/v1/storage.py b/ai4papi/routers/v1/storage.py
new file mode 100644
index 0000000..09ceacf
--- /dev/null
+++ b/ai4papi/routers/v1/storage.py
@@ -0,0 +1,85 @@
+"""
+Misc utilities for AI4OS-compatible storages.
+"""
+
+import json
+import subprocess
+import types
+
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.security import HTTPBearer
+
+from ai4papi import auth
+from ai4papi.routers.v1 import secrets as ai4secrets
+
+
+router = APIRouter(
+    prefix="/storage",
+    tags=["Storage utilities"],
+    responses={404: {"description": "Not found"}},
+)
+security = HTTPBearer()
+
+
+@router.get("/{storage_name}/ls")
+def storage_ls(
+    vo: str,
+    storage_name: str,
+    subpath: str = '',
+    authorization=Depends(security),
+    ):
+    """
+    Returns a list of files/folders inside a given subpath of the specified storage.
+    It uses rclone under the hood.
+
+    Parameters:
+    * **vo**: Virtual Organization you belong to (used to retrieve your storage credentials)
+    * **storage_name**: storage to parse.
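+      It should match the name of one of your storage secrets
+      (eg. `share.services.ai4os.eu`).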
+    * **subpath**: subpath to query
+    """
+    # Retrieve authenticated user info
+    auth_info = auth.get_user_info(token=authorization.credentials)
+    auth.check_vo_membership(vo, auth_info['vos'])
+
+    # Retrieve storage credentials
+    if storage_name:
+        # Retrieve the rclone credentials
+        secrets = ai4secrets.get_secrets(
+            vo=vo,
+            subpath='/services/storage/',
+            authorization=types.SimpleNamespace(
+                credentials=authorization.credentials,
+            ),
+        )
+        storage = secrets.get(f'/services/storage/{storage_name}', None)
+        if not storage:
+            raise HTTPException(
+                status_code=401,
+                detail="Invalid storage name.",
+            )
+
+    # Use rclone to list the contents of the given subpath (eg. the existing CVAT
+    # backups to restore from)
+    result = subprocess.run(
+        f"export RCLONE_CONFIG_RSHARE_VENDOR={storage['vendor']} && "
+        f"export RCLONE_CONFIG_RSHARE_URL={storage['server']}/remote.php/dav/files/{storage['loginName']} && "
+        "export RCLONE_CONFIG_RSHARE_TYPE=webdav && "
+        f"export RCLONE_CONFIG_RSHARE_USER={storage['loginName']} && "
+        f"export RCLONE_CONFIG_RSHARE_PASS={storage['appPassword']} && "
+        "export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $RCLONE_CONFIG_RSHARE_PASS) && "
+        f"rclone lsjson rshare:/{subpath} ;"
+        "for var in $(env | grep '^RCLONE_CONFIG_RSHARE_' | awk -F= '{print $1}'); do unset $var; done",
+        shell=True,
+        capture_output=True,
+        text=True,
+    )
+
+    # Parse the JSON output
+    try:
+        json_output = json.loads(result.stdout)
+        return json_output
+    except Exception:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Error retrieving information from storage. \n \n {result.stderr}",
+        )
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 2e698b7..8478df0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -4,9 +4,12 @@ FROM ubuntu:22.04
 # Install Nomad
 # Updated commands: https://developer.hashicorp.com/nomad/tutorials/get-started/gs-install
+# curl, unzip: needed to install the rclone command
 RUN apt-get update && \
     apt-get install -y \
         wget \
+        curl \
+        unzip \
         gpg \
         coreutils \
         lsb-release \
@@ -30,6 +33,9 @@ ENV NOMAD_CLIENT_KEY=/home/nomad-certs/cli-key.pem
 # ENV FORWARDED_ALLOW_IPS="*"
 ENV FORWARDED_ALLOW_IPS=172.16.44.228
 
+# Install latest rclone (for checking backups in CVAT; "/storage" route)
+RUN curl https://rclone.org/install.sh | bash
+
 # Install API
 WORKDIR /home/ai4-papi
 COPY . .
diff --git a/etc/tools/ai4os-cvat/nomad.hcl b/etc/tools/ai4os-cvat/nomad.hcl
new file mode 100644
index 0000000..f22af87
--- /dev/null
+++ b/etc/tools/ai4os-cvat/nomad.hcl
@@ -0,0 +1,1059 @@
+/*
+Convention:
+-----------
+* ${UPPERCASE} are replaced by the user
+* ${lowercase} are replaced by Nomad at launch time
+* the remainder are defaults, the same for everybody
+
+When replacing user values we use safe_substitute() so that we don't get an error for
+not replacing Nomad values
+*/
+
+/*
+Main changes with respect to the reference job located in [1]:
+
+- added preliminary constraints and affinities
+- adapted meta field
+- group renamed to 'usergroup'
+- $$ replaced with $$$$ to avoid escaping in Python Template [2]
+- replaced ${BASE} with ${JOB_UUID}
+- renamed task "server" to "main" (to share the same info-retrieving pattern)
+
+I also had to replace the following meta fields, otherwise when retrieving the
+job info the ${env_var} were not being replaced. I'm having to do something similar
+with ${meta.domain} but I don't want to extend it to env_vars just to support CVAT.
+
+To avoid too much disruption, I'm only changing this inside the service field:
+- ${NOMAD_META_job_uuid} --> ${JOB_UUID}
+- ${NOMAD_META_cvat_hostname} --> ${meta.domain}-${BASE_DOMAIN}
+
+To avoid too much disruption, I'm only changing this in the "main" task (parameter `image`):
+- ${NOMAD_META_server_image}:${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom} --> registry.services.ai4os.eu/ai4os/ai4-cvat-server:v2.7.3-AI4OS
+
+[1]: https://github.com/ai4os/ai4os-cvat/blob/v2.7.3-AI4OS/nomad/ai4-cvat.jobspec.nomad.hcl
+[2]: https://stackoverflow.com/a/56957750/18471590
+
+Note:
+In several parts of the job we use the old name of the repo (ai4os/ai4-cvat), which
+should redirect fine to the new repo name (ai4os/ai4os-cvat).
+But it is important nevertheless to keep this in mind, just in case.
+
+*/
+
+
+job "tool-cvat-${JOB_UUID}" {
+  namespace = "${NAMESPACE}"
+  type = "service"
+  region = "global"
+  id = "${JOB_UUID}"
+  priority = "${PRIORITY}"
+
+  meta {
+    owner = "${OWNER}"  # user-id from OIDC
+    owner_name = "${OWNER_NAME}"
+    owner_email = "${OWNER_EMAIL}"
+    title = "${TITLE}"
+    description = "${DESCRIPTION}"
+
+    # CVAT-specific metadata
+    force_pull_img_cvat_server = true
+    force_pull_img_cvat_ui = true
+    cvat_version = "v2.7.3"
+    cvat_version_custom = "-AI4OS"
+    cvat_hostname = "${meta.domain}-${BASE_DOMAIN}"
+    job_uuid = "${JOB_UUID}"
+    restore_from = "${RESTORE_FROM}"
+    backup_name = "${BACKUP_NAME}"
+
+    grafana_clickhouse_plugin_version = "3.3.0"
+    smokescreen_opts = ""
+    clickhouse_image = "clickhouse/clickhouse-server:22.3-alpine"
+    db_image = "postgres:16.4-alpine"
+    grafana_image = "grafana/grafana-oss:9.3.6"
+    redis_image = "eqalpha/keydb:x86_64_v6.3.2"
+    ui_image = "registry.services.ai4os.eu/ai4os/ai4-cvat-ui"
+    opa_image = "openpolicyagent/opa:0.45.0-rootless"
+    vector_image = "timberio/vector:0.26.0-alpine"
+    server_image = "registry.services.ai4os.eu/ai4os/ai4-cvat-server"
+    su_username = "${CVAT_USERNAME}"
+    su_password = "${CVAT_PASSWORD}"
+
+    RCLONE_CONFIG = "${RCLONE_CONFIG}"
+    RCLONE_CONFIG_RSHARE_TYPE = "webdav"
+    RCLONE_CONFIG_RSHARE_URL = "${RCLONE_CONFIG_RSHARE_URL}"
+    RCLONE_CONFIG_RSHARE_VENDOR = "${RCLONE_CONFIG_RSHARE_VENDOR}"
+    RCLONE_CONFIG_RSHARE_USER = "${RCLONE_CONFIG_RSHARE_USER}"
+    RCLONE_CONFIG_RSHARE_PASS = "${RCLONE_CONFIG_RSHARE_PASS}"
+
+    # remote path common for CVAT instances, without trailing /
+    RCLONE_REMOTE_PATH = "/ai4os-storage/tools/cvat"
+  }
+
+  # Only use nodes that have successfully passed the ai4-nomad_tests (ie. meta.status=ready)
+  constraint {
+    attribute = "${meta.status}"
+    operator = "regexp"
+    value = "ready"
+  }
+
+  # Only launch in compute nodes (to avoid clashing with system jobs, eg. Traefik)
+  constraint {
+    attribute = "${meta.compute}"
+    operator = "="
+    value = "true"
+  }
+
+  # Only deploy in nodes serving that namespace (we use metadata instead of node-pools
+  # because Nomad does not allow a node to belong to several node pools)
+  constraint {
+    attribute = "${meta.namespace}"
+    operator = "regexp"
+    value = "${NAMESPACE}"
+  }
+
+  # Try to deploy iMagine jobs on nodes that are iMagine-exclusive.
+  # In this way, we leave AI4EOSC nodes for AI4EOSC users, and iMagine users only
+  # spill over to AI4EOSC nodes when iMagine nodes are fully booked.
+  affinity {
+    attribute = "${meta.namespace}"
+    operator = "regexp"
+    value = "ai4eosc"
+    weight = -50  # anti-affinity for ai4eosc clients
+  }
+
+  # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid
+  # overloading GPU clients with CPU-only jobs.
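+  # (in Nomad, a positive affinity weight attracts the job to matching nodes, while a
+  # negative weight, as in the affinity above, repels it)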
+ affinity { + attribute = "${meta.tags}" + operator = "regexp" + value = "cpu" + weight = 50 + } + + # Avoid rescheduling the job on **other** nodes during a network cut + # Command not working due to https://github.com/hashicorp/nomad/issues/16515 + reschedule { + attempts = 0 + unlimited = false + } + + group "usergroup" { + + # Recover the job in the **original** node when the network comes back + # (after a network cut). + # If network cut lasts more than 10 days (240 hrs), job is restarted anyways. + # Do not increase too much this limit because we want to still be able to notice + # when nodes are truly removed from the cluster (not just temporarily lost). + max_client_disconnect = "240h" + + ephemeral_disk { + size = 4096 + } + + network { + port "ui" { + to = 80 + } + port "server" { + to = 8080 + } + port "utils" { + to = 8080 + } + port "worker-import" { + to = 8080 + } + port "worker-export" { + to = 8080 + } + port "worker-annotation" { + to = 8080 + } + port "worker-webhooks" { + to = 8080 + } + port "worker-quality-reports" { + to = 8080 + } + port "worker-analytics-reports" { + to = 8080 + } + port "opa" { + to = 8181 + } + port "grafana" { + to = 3000 + } + port "db" { + to = 5432 + } + port "redis" { + to = 6379 + } + port "clickhouse_native" { + to = 9000 + } + port "clickhouse_http" { + to = 8123 + } + port "clickhouse_inter_server" { + to = 9009 + } + port "vector" { + to = 80 + } + } + + service { + name = "${JOB_UUID}-ui" + port = "ui" + tags = [ + "traefik.enable=true", + "traefik.http.routers.${NOMAD_META_job_uuid}-ui.tls=true", + "traefik.http.routers.${NOMAD_META_job_uuid}-ui.entrypoints=websecure", + "traefik.http.routers.${NOMAD_META_job_uuid}-ui.rule=Host(`${JOB_UUID}.${meta.domain}-${BASE_DOMAIN}`)" + ] + } + + service { + name = "${JOB_UUID}-server" + port = "server" + tags = [ + "traefik.enable=true", + "traefik.http.routers.${NOMAD_META_job_uuid}-server.tls=true", + "traefik.http.routers.${NOMAD_META_job_uuid}-server.entrypoints=websecure", + "traefik.http.routers.${NOMAD_META_job_uuid}-server.rule=Host(`${JOB_UUID}.${meta.domain}-${BASE_DOMAIN}`) && PathPrefix(`/api/`, `/static/`, `/admin`, `/documentation/`, `/django-rq`)" + ] + } + + service { + name = "${JOB_UUID}-grafana" + port = "grafana" + tags = [ + "traefik.enable=true", + "traefik.http.routers.${NOMAD_META_job_uuid}-grafana.tls=true", + "traefik.http.routers.${NOMAD_META_job_uuid}-grafana.entrypoints=websecure", + "traefik.http.routers.${NOMAD_META_job_uuid}-grafana.rule=Host(`${JOB_UUID}.${meta.domain}-${BASE_DOMAIN}`) && PathPrefix(`/analytics`)", + "traefik.http.middlewares.${NOMAD_META_job_uuid}-grafana-analytics-auth.forwardauth.address=http://${NOMAD_HOST_ADDR_server}/analytics", + "traefik.http.middlewares.${NOMAD_META_job_uuid}-grafana-analytics-auth.forwardauth.authRequestHeaders=Cookie,Authorization", + "traefik.http.middlewares.${NOMAD_META_job_uuid}-grafana-analytics-strip-prefix.stripprefix.prefixes=/analytics", + "traefik.http.routers.${NOMAD_META_job_uuid}-grafana.middlewares=${NOMAD_META_job_uuid}-grafana-analytics-auth@consulcatalog,${NOMAD_META_job_uuid}-grafana-analytics-strip-prefix@consulcatalog", + "traefik.services.${NOMAD_META_job_uuid}-grafana.loadbalancer.servers.url=${NOMAD_HOST_ADDR_grafana}", + "traefik.services.${NOMAD_META_job_uuid}-grafana.loadbalancer.passHostHeader=false" + ] + } + + task "share" { + lifecycle { + hook = "prestart" + sidecar = "true" + } + driver = "docker" + kill_timeout = "30s" + env { + RCLONE_CONFIG = "${NOMAD_META_RCLONE_CONFIG}" + 
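+        # The rclone credentials are read back from the job meta fields, so the same
+        # values are shared by the `share`, `synclocal` and `syncremote` tasks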
RCLONE_CONFIG_RSHARE_TYPE = "webdav" + RCLONE_CONFIG_RSHARE_URL = "${NOMAD_META_RCLONE_CONFIG_RSHARE_URL}" + RCLONE_CONFIG_RSHARE_VENDOR = "${NOMAD_META_RCLONE_CONFIG_RSHARE_VENDOR}" + RCLONE_CONFIG_RSHARE_USER = "${NOMAD_META_RCLONE_CONFIG_RSHARE_USER}" + RCLONE_CONFIG_RSHARE_PASS = "${NOMAD_META_RCLONE_CONFIG_RSHARE_PASS}" + REMOTE_PATH = "rshare:${NOMAD_META_RCLONE_REMOTE_PATH}" + LOCAL_PATH = "/mnt" + } + config { + force_pull = true + image = "registry.services.ai4os.eu/ai4os/docker-storage:latest" + privileged = true + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/share:/mnt/share:rshared" + ] + mount { + type = "bind" + target = "/srv/.rclone/rclone.conf" + source = "local/rclone.conf" + readonly = false + } + mount { + type = "bind" + target = "/entrypoint.sh" + source = "local/entrypoint.sh" + readonly = false + } + entrypoint = [ + "/bin/bash", + "-c", + "chmod +x /entrypoint.sh; /entrypoint.sh" + ] + } + template { + data = <<-EOF + [ai4eosc-share] + type = webdav + url = https://share.services.ai4os.eu/remote.php/dav + vendor = nextcloud + user = ${NOMAD_META_RCLONE_CONFIG_RSHARE_USER} + pass = ${NOMAD_META_RCLONE_CONFIG_RSHARE_PASS} + EOF + destination = "local/rclone.conf" + } + template { + data = <<-EOF + #!/usr/bin/env bash + export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $$RCLONE_CONFIG_RSHARE_PASS) + rm -rf $LOCAL_PATH/share + mkdir -p $LOCAL_PATH/share + rclone mkdir $REMOTE_PATH/share + chown 1000:1000 $LOCAL_PATH/share + chmod 750 $LOCAL_PATH/share + rclone --log-level INFO mount $REMOTE_PATH/share $LOCAL_PATH/share \ + --uid 1000 \ + --gid 1000 \ + --dir-perms 0750 \ + --allow-non-empty \ + --allow-other \ + --vfs-cache-mode full + EOF + destination = "local/entrypoint.sh" + } + resources { + cpu = 50 # minimum number of CPU MHz is 2 + memory = 2000 + } + } + + task "synclocal" { + lifecycle { + hook = "prestart" + sidecar = "false" + } + driver = "docker" + kill_timeout = "30s" + env { + RCLONE_CONFIG = "${NOMAD_META_RCLONE_CONFIG}" + RCLONE_CONFIG_RSHARE_TYPE = "webdav" + RCLONE_CONFIG_RSHARE_URL = "${NOMAD_META_RCLONE_CONFIG_RSHARE_URL}" + RCLONE_CONFIG_RSHARE_VENDOR = "${NOMAD_META_RCLONE_CONFIG_RSHARE_VENDOR}" + RCLONE_CONFIG_RSHARE_USER = "${NOMAD_META_RCLONE_CONFIG_RSHARE_USER}" + RCLONE_CONFIG_RSHARE_PASS = "${NOMAD_META_RCLONE_CONFIG_RSHARE_PASS}" + REMOTE_PATH = "rshare:${NOMAD_META_RCLONE_REMOTE_PATH}/backups" + LOCAL_PATH = "/alloc/data" + RESTORE_FROM = "${NOMAD_META_restore_from}" + } + config { + force_pull = true + image = "registry.services.ai4os.eu/ai4os/docker-storage:latest" + mount { + type = "bind" + target = "/srv/.rclone/rclone.conf" + source = "local/rclone.conf" + readonly = false + } + mount { + type = "bind" + target = "/sync_local.sh" + source = "local/sync_local.sh" + readonly = false + } + entrypoint = [ + "/bin/bash", + "-c", + "chmod +x /sync_local.sh; /sync_local.sh" + ] + } + template { + data = <<-EOF + [ai4eosc-share] + type = webdav + url = https://share.services.ai4os.eu/remote.php/dav + vendor = nextcloud + user = ${NOMAD_META_RCLONE_CONFIG_RSHARE_USER} + pass = ${NOMAD_META_RCLONE_CONFIG_RSHARE_PASS} + EOF + destination = "local/rclone.conf" + } + template { + data = <<-EOF + #!/usr/bin/env bash + tarbals='db data events redis' + export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $$RCLONE_CONFIG_RSHARE_PASS) + for tarbal in $tarbals; do + rm -rf $LOCAL_PATH/$tarbal + mkdir -p $LOCAL_PATH/$tarbal + if [[ $tarbal == "data" ]]; then + chown -R 1000 $LOCAL_PATH/data + chgrp -R 1000 $LOCAL_PATH/data + chmod -R 750 $LOCAL_PATH/data + 
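+          # (uid/gid 1000 is assumed to match the django user inside the CVAT containers)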
fi + done + if [ -z "$${RESTORE_FROM}" ]; then + echo "CVAT backup not specified, a clean start will be performed" + elif [[ $(rclone lsd $REMOTE_PATH/$$RESTORE_FROM; echo $?) == 0 ]]; then + echo "found a CVAT backup '$$RESTORE_FROM', syncing ..." + rm -rf $LOCAL_PATH/$$RESTORE_FROM + mkdir -p $LOCAL_PATH/$$RESTORE_FROM + rclone sync $REMOTE_PATH/$$RESTORE_FROM $LOCAL_PATH/$$RESTORE_FROM --progress + for tarbal in $tarbals; do + if [ -f $LOCAL_PATH/$$RESTORE_FROM/$tarbal.tar.gz ]; then + echo -n "extracting $tarbal.tar.gz ... " + cd $LOCAL_PATH/$tarbal && tar -xf $LOCAL_PATH/$$RESTORE_FROM/$tarbal.tar.gz --strip 1 + if [[ $? == 0 ]]; then echo "OK"; else echo "ERROR"; fi + else + echo "file not found: $LOCAL_PATH/$$RESTORE_FROM/$tarbal.tar.gz" + fi + done + else + echo "CVAT backup '$$RESTORE_FROM' not found, a clean start will be performed" + fi + EOF + destination = "local/sync_local.sh" + } + resources { + cpu = 50 # minimum number of CPU MHz is 2 + memory = 2000 + } + } + + task "syncremote" { + lifecycle { + hook = "poststop" + sidecar = "false" + } + driver = "docker" + kill_timeout = "30s" + env { + RCLONE_CONFIG = "${NOMAD_META_RCLONE_CONFIG}" + RCLONE_CONFIG_RSHARE_TYPE = "webdav" + RCLONE_CONFIG_RSHARE_URL = "${NOMAD_META_RCLONE_CONFIG_RSHARE_URL}" + RCLONE_CONFIG_RSHARE_VENDOR = "${NOMAD_META_RCLONE_CONFIG_RSHARE_VENDOR}" + RCLONE_CONFIG_RSHARE_USER = "${NOMAD_META_RCLONE_CONFIG_RSHARE_USER}" + RCLONE_CONFIG_RSHARE_PASS = "${NOMAD_META_RCLONE_CONFIG_RSHARE_PASS}" + REMOTE_PATH = "rshare:${NOMAD_META_RCLONE_REMOTE_PATH}/backups" + LOCAL_PATH = "/alloc/data" + BACKUP_NAME = "${NOMAD_META_backup_name}" + } + config { + force_pull = true + image = "registry.services.ai4os.eu/ai4os/docker-storage:latest" + mount { + type = "bind" + target = "/srv/.rclone/rclone.conf" + source = "local/rclone.conf" + readonly = false + } + mount { + type = "bind" + target = "/sync_remote.sh" + source = "local/sync_remote.sh" + readonly = false + } + entrypoint = [ + "/bin/bash", + "-c", + "chmod +x /sync_remote.sh; /sync_remote.sh" + ] + } + template { + data = <<-EOF + [ai4eosc-share] + type = webdav + url = https://share.services.ai4os.eu/remote.php/dav + vendor = nextcloud + user = ${NOMAD_META_RCLONE_CONFIG_RSHARE_USER} + pass = ${NOMAD_META_RCLONE_CONFIG_RSHARE_PASS} + EOF + destination = "local/rclone.conf" + } + template { + data = <<-EOF + #!/usr/bin/env bash + TS=$(date +"%Y-%m-%d-%H-%M-%S-%N") + BACKUP_NAME="$${BACKUP_NAME}_$${TS}" + tarbals='db data events redis' + export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $$RCLONE_CONFIG_RSHARE_PASS) + echo "creating a CVAT backup $$BACKUP_NAME ..." + if [[ -d $LOCAL_PATH/$$BACKUP_NAME ]]; then + echo "ERROR: local backup folder $LOCAL_PATH/$$BACKUP_NAME already exists" + exit 1 + fi + rm -rf $LOCAL_PATH/$$BACKUP_NAME + mkdir -p $LOCAL_PATH/$$BACKUP_NAME + cd $LOCAL_PATH + for tarbal in $tarbals; do + echo -n "creating $tarbal.tar.gz ..." + tar -czf $LOCAL_PATH/$$BACKUP_NAME/$tarbal.tar.gz $tarbal + if [ -f $LOCAL_PATH/$$BACKUP_NAME/$tarbal.tar.gz ]; then echo "OK"; else echo "ERROR"; fi + done + if [[ $(rclone lsd $REMOTE_PATH/$$BACKUP_NAME; echo $?) 
== 0 ]]; then + echo "ERROR: remote backup folder $REMOTE_PATH/$$BACKUP_NAME already exists" + exit 1 + fi + rclone mkdir $REMOTE_PATH/$$BACKUP_NAME + rclone sync $LOCAL_PATH/$$BACKUP_NAME $REMOTE_PATH/$$BACKUP_NAME --progress + EOF + destination = "local/sync_remote.sh" + } + resources { + cpu = 50 # minimum number of CPU MHz is 2 + memory = 2000 + } + } + + task "clickhouse" { + driver = "docker" + kill_timeout = "30s" + resources { + memory = 4096 + } + env { + CLICKHOUSE_DB = "cvat" + CLICKHOUSE_USER = "user" + CLICKHOUSE_PASSWORD = "user" + } + config { + image = "${NOMAD_META_clickhouse_image}" + ports = ["clickhouse_native", "clickhouse_http", "clickhouse_inter_server"] + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/events:/var/lib/clickhouse" + ] + mount { + type = "bind" + target = "/docker-entrypoint-initdb.d/init.sh" + source = "local/docker-entrypoint-initdb.d/init.sh" + readonly = false + } + } + template { + data = <<-EOF + #!/bin/bash + CLICKHOUSE_DB="$$$${CLICKHOUSE_DB:-cvat}"; + clickhouse-client --query "CREATE DATABASE IF NOT EXISTS $$$${CLICKHOUSE_DB};" + echo " + CREATE TABLE IF NOT EXISTS $$$${CLICKHOUSE_DB}.events + ( + \`scope\` String NOT NULL, + \`obj_name\` String NULL, + \`obj_id\` UInt64 NULL, + \`obj_val\` String NULL, + \`source\` String NOT NULL, + \`timestamp\` DateTime64(3, 'Etc/UTC') NOT NULL, + \`count\` UInt16 NULL, + \`duration\` UInt32 DEFAULT toUInt32(0), + \`project_id\` UInt64 NULL, + \`task_id\` UInt64 NULL, + \`job_id\` UInt64 NULL, + \`user_id\` UInt64 NULL, + \`user_name\` String NULL, + \`user_email\` String NULL, + \`org_id\` UInt64 NULL, + \`org_slug\` String NULL, + \`payload\` String NULL + ) + ENGINE = MergeTree + PARTITION BY toYYYYMM(timestamp) + ORDER BY (timestamp) + SETTINGS index_granularity = 8192 + ;" | clickhouse-client + EOF + destination = "local/docker-entrypoint-initdb.d/init.sh" + } + } + + task "grafana" { + driver = "docker" + kill_timeout = "30s" + env { + GF_PATHS_PROVISIONING = "/etc/grafana/provisioning" + GF_AUTH_BASIC_ENABLED = false + GF_AUTH_ANONYMOUS_ENABLED = true + GF_AUTH_ANONYMOUS_ORG_ROLE = "Admin" + GF_AUTH_DISABLE_LOGIN_FORM = true + GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS = "grafana-clickhouse-datasource" + GF_SERVER_ROOT_URL = "http://${NOMAD_META_job_uuid}.${NOMAD_META_cvat_hostname}/analytics" + GF_INSTALL_PLUGINS = "https://github.com/grafana/clickhouse-datasource/releases/download/v${NOMAD_META_grafana_clickhouse_plugin_version}/grafana-clickhouse-datasource-${NOMAD_META_grafana_clickhouse_plugin_version}.linux_amd64.zip;grafana-clickhouse-datasource" + GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH = "/var/lib/grafana/dashboards/all_events.json" + } + config { + image = "${NOMAD_META_grafana_image}" + ports = ["grafana"] + mount { + type = "bind" + target = "/var/lib/grafana/dashboards/all_events.json" + source = "local/var/lib/grafana/dashboards/all_events.json" + readonly = false + } + mount { + type = "bind" + target = "/var/lib/grafana/dashboards/management.json" + source = "local/var/lib/grafana/dashboards/management.json" + readonly = false + } + mount { + type = "bind" + target = "/var/lib/grafana/dashboards/monitoring.json" + source = "local/var/lib/grafana/dashboards/monitoring.json" + readonly = false + } + mount { + type = "bind" + target = "/etc/grafana/provisioning/dashboards/dashboard.yaml" + source = "local/etc/grafana/provisioning/dashboards/dashboard.yaml" + readonly = false + } + mount { + type = "bind" + target = "/etc/grafana/provisioning/datasources/ds.yaml" + source = 
"local/etc/grafana/provisioning/datasources/ds.yaml" + readonly = false + } + command = "exec" + args = [ + "/run.sh" + ] + } + artifact { + source = "https://github.com/ai4os/ai4-cvat/raw/${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}/components/analytics/grafana/dashboards/all_events.json" + destination = "local/var/lib/grafana/dashboards/" + } + artifact { + source = "https://github.com/ai4os/ai4-cvat/raw/${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}/components/analytics/grafana/dashboards/management.json" + destination = "local/var/lib/grafana/dashboards/" + } + artifact { + source = "https://github.com/ai4os/ai4-cvat/raw/${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}/components/analytics/grafana/dashboards/monitoring.json" + destination = "local/var/lib/grafana/dashboards/" + } + template { + data = <<-EOF + apiVersion: 1 + providers: + - name: cvat-logs + type: file + updateIntervalSeconds: 30 + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: true + EOF + destination = "local/etc/grafana/provisioning/dashboards/dashboard.yaml" + } + template { + data = <<-EOF + apiVersion: 1 + datasources: + - name: ClickHouse + type: grafana-clickhouse-datasource + isDefault: true + jsonData: + defaultDatabase: cvat + port: ${NOMAD_HOST_PORT_clickhouse_native} + server: ${NOMAD_HOST_IP_clickhouse_native} + username: user + tlsSkipVerify: false + secureJsonData: + password: user + editable: true + EOF + destination = "local/etc/grafana/provisioning/datasources/ds.yaml" + } + } + + task "db" { + driver = "docker" + kill_timeout = "30s" + env { + POSTGRES_USER = "root" + POSTGRES_DB = "cvat" + POSTGRES_HOST_AUTH_METHOD = "trust" + PGDATA = "/var/lib/postgresql/data/pgdata" + } + config { + image = "${NOMAD_META_db_image}" + privileged = true + force_pull = "false" + ports = ["db"] + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/db:/var/lib/postgresql/data" + ] + } + } + + task "redis" { + driver = "docker" + kill_timeout = "30s" + resources { + cores = 1 + memory = 5120 + } + config { + image = "${NOMAD_META_redis_image}" + ports = ["redis"] + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/redis:/data" + ] + command = "keydb-server" + args = [ + "/etc/keydb/keydb.conf", + "--storage-provider", "flash", "/data/flash", + "--maxmemory", "5G", + "--maxmemory-policy", "allkeys-lfu" + ] + } + } + + task "vector" { + driver = "docker" + kill_timeout = "30s" + resources { + memory = 1024 + } + env { + CLICKHOUSE_DB = "cvat" + CLICKHOUSE_USER = "user" + CLICKHOUSE_PASSWORD = "user" + CLICKHOUSE_HOST = "${NOMAD_HOST_IP_clickhouse_http}" + CLICKHOUSE_PORT = "${NOMAD_HOST_PORT_clickhouse_http}" + } + config { + image = "${NOMAD_META_vector_image}" + ports = ["vector"] + mount { + type = "bind" + target = "/etc/vector/vector.toml" + source = "local/etc/vector/vector.toml" + readonly = false + } + } + artifact { + source = "https://github.com/ai4os/ai4-cvat/raw/${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}/components/analytics/vector/vector.toml" + destination = "local/etc/vector/" + } + } + + task "main" { + driver = "docker" + kill_timeout = "30s" + resources { + cores = 1 + memory = 4096 + } + env { + DJANGO_LOG_LEVEL = "INFO" + DJANGO_MODWSGI_EXTRA_ARGS = "" + ALLOWED_HOSTS = "*" + CVAT_REDIS_HOST = "${NOMAD_HOST_IP_redis}" + CVAT_REDIS_PORT = "${NOMAD_HOST_PORT_redis}" + CVAT_REDIS_PASSWORD = "" + CVAT_POSTGRES_HOST = "${NOMAD_HOST_IP_db}" + CVAT_POSTGRES_PORT = "${NOMAD_HOST_PORT_db}" + ADAPTIVE_AUTO_ANNOTATION = "false" + IAM_OPA_ADDR = 
"${NOMAD_HOST_ADDR_opa}" + IAM_OPA_HOST = "${NOMAD_HOST_IP_opa}" + IAM_OPA_PORT = "${NOMAD_HOST_PORT_opa}" + IAM_OPA_BUNDLE = "1" + NUMPROCS = "2" + DJANGO_LOG_SERVER_HOST = "${NOMAD_HOST_IP_vector}" + DJANGO_LOG_SERVER_PORT = "${NOMAD_HOST_PORT_vector}" + DJANGO_SUPERUSER_USERNAME = "${NOMAD_META_su_username}" + DJANGO_SUPERUSER_PASSWORD = "${NOMAD_META_su_password}" + CLICKHOUSE_HOST = "${NOMAD_HOST_IP_clickhouse_http}" + CLICKHOUSE_PORT = "${NOMAD_HOST_PORT_clickhouse_http}" + CVAT_ANALYTICS = "1" + CVAT_BASE_URL = "" + CVAT_HOST = "${NOMAD_META_job_uuid}.${NOMAD_META_cvat_hostname}" + SMOKESCREEN_OPTS = "${NOMAD_META_smokescreen_opts}" + } + config { + image = "registry.services.ai4os.eu/ai4os/ai4-cvat-server:v2.7.3-AI4OS" + force_pull = "${NOMAD_META_force_pull_img_cvat_server}" + ports = ["server"] + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/data:/home/django/data", + "..${NOMAD_ALLOC_DIR}/data/share:/home/django/share" + ] + command = "init" + args = [ + "ensuresuperuser", + "run", + "server" + ] + } + } + + task "utils" { + lifecycle { + hook = "poststart" + sidecar = "true" + } + driver = "docker" + kill_timeout = "30s" + resources { + cores = 1 + memory = 1024 + } + env { + CVAT_REDIS_HOST = "${NOMAD_HOST_IP_redis}" + CVAT_REDIS_PORT = "${NOMAD_HOST_PORT_redis}" + CVAT_REDIS_PASSWORD = "" + CVAT_POSTGRES_HOST = "${NOMAD_HOST_IP_db}" + CVAT_POSTGRES_PORT = "${NOMAD_HOST_PORT_db}" + CLICKHOUSE_HOST = "${NOMAD_HOST_IP_clickhouse_http}" + CLICKHOUSE_PORT = "${NOMAD_HOST_PORT_clickhouse_http}" + DJANGO_LOG_SERVER_HOST = "${NOMAD_HOST_IP_vector}" + DJANGO_LOG_SERVER_PORT = "${NOMAD_HOST_PORT_vector}" + NUMPROCS = "1" + } + config { + image = "${NOMAD_META_server_image}:${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}" + force_pull = "${NOMAD_META_force_pull_img_cvat_server}" + ports = ["utils"] + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/data:/home/django/data", + "..${NOMAD_ALLOC_DIR}/data/share:/home/django/share", + ] + command = "run" + args = [ + "utils" + ] + } + } + + task "worker-import" { + lifecycle { + hook = "poststart" + sidecar = "true" + } + driver = "docker" + kill_timeout = "30s" + resources { + cores = 1 + memory = 1024 + } + env { + CVAT_REDIS_HOST = "${NOMAD_HOST_IP_redis}" + CVAT_REDIS_PORT = "${NOMAD_HOST_PORT_redis}" + CVAT_REDIS_PASSWORD = "" + CVAT_POSTGRES_HOST = "${NOMAD_HOST_IP_db}" + CVAT_POSTGRES_PORT = "${NOMAD_HOST_PORT_db}" + DJANGO_LOG_SERVER_HOST = "${NOMAD_HOST_IP_vector}" + DJANGO_LOG_SERVER_PORT = "${NOMAD_HOST_PORT_vector}" + NUMPROCS = "2" + SMOKESCREEN_OPTS = "${NOMAD_META_smokescreen_opts}" + } + config { + image = "${NOMAD_META_server_image}:${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}" + force_pull = "${NOMAD_META_force_pull_img_cvat_server}" + ports = ["worker-import"] + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/data:/home/django/data", + "..${NOMAD_ALLOC_DIR}/data/share:/home/django/share", + ] + command = "run" + args = [ + "worker.import" + ] + } + } + + task "worker-export" { + lifecycle { + hook = "poststart" + sidecar = "true" + } + driver = "docker" + kill_timeout = "30s" + resources { + cores = 1 + memory = 1024 + } + env { + CVAT_REDIS_HOST = "${NOMAD_HOST_IP_redis}" + CVAT_REDIS_PORT = "${NOMAD_HOST_PORT_redis}" + CVAT_REDIS_PASSWORD = "" + CVAT_POSTGRES_HOST = "${NOMAD_HOST_IP_db}" + CVAT_POSTGRES_PORT = "${NOMAD_HOST_PORT_db}" + DJANGO_LOG_SERVER_HOST = "${NOMAD_HOST_IP_vector}" + DJANGO_LOG_SERVER_PORT = "${NOMAD_HOST_PORT_vector}" + NUMPROCS = "2" + } + config { + image = 
"${NOMAD_META_server_image}:${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}" + force_pull = "${NOMAD_META_force_pull_img_cvat_server}" + ports = ["worker-export"] + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/data:/home/django/data", + "..${NOMAD_ALLOC_DIR}/data/share:/home/django/share", + ] + command = "run" + args = [ + "worker.export" + ] + } + } + + task "worker-annotation" { + lifecycle { + hook = "poststart" + sidecar = "true" + } + driver = "docker" + kill_timeout = "30s" + resources { + cores = 1 + memory = 1024 + } + env { + CVAT_REDIS_HOST = "${NOMAD_HOST_IP_redis}" + CVAT_REDIS_PORT = "${NOMAD_HOST_PORT_redis}" + CVAT_REDIS_PASSWORD = "" + CVAT_POSTGRES_HOST = "${NOMAD_HOST_IP_db}" + CVAT_POSTGRES_PORT = "${NOMAD_HOST_PORT_db}" + DJANGO_LOG_SERVER_HOST = "${NOMAD_HOST_IP_vector}" + DJANGO_LOG_SERVER_PORT = "${NOMAD_HOST_PORT_vector}" + NUMPROCS = "1" + } + config { + image = "${NOMAD_META_server_image}:${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}" + force_pull = "${NOMAD_META_force_pull_img_cvat_server}" + ports = ["worker-annotation"] + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/data:/home/django/data", + "..${NOMAD_ALLOC_DIR}/data/share:/home/django/share", + ] + command = "run" + args = [ + "worker.annotation" + ] + } + } + + task "worker-webhooks" { + lifecycle { + hook = "poststart" + sidecar = "true" + } + driver = "docker" + kill_timeout = "30s" + env { + CVAT_REDIS_HOST = "${NOMAD_HOST_IP_redis}" + CVAT_REDIS_PORT = "${NOMAD_HOST_PORT_redis}" + CVAT_REDIS_PASSWORD = "" + CVAT_POSTGRES_HOST = "${NOMAD_HOST_IP_db}" + CVAT_POSTGRES_PORT = "${NOMAD_HOST_PORT_db}" + DJANGO_LOG_SERVER_HOST = "${NOMAD_HOST_IP_vector}" + DJANGO_LOG_SERVER_PORT = "${NOMAD_HOST_PORT_vector}" + NUMPROCS = "1" + SMOKESCREEN_OPTS = "${NOMAD_META_smokescreen_opts}" + } + config { + image = "${NOMAD_META_server_image}:${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}" + force_pull = "${NOMAD_META_force_pull_img_cvat_server}" + ports = ["worker-webhooks"] + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/data:/home/django/data", + ] + command = "run" + args = [ + "worker.webhooks" + ] + } + } + + task "worker-quality-reports" { + lifecycle { + hook = "poststart" + sidecar = "true" + } + driver = "docker" + kill_timeout = "30s" + env { + CVAT_REDIS_HOST = "${NOMAD_HOST_IP_redis}" + CVAT_REDIS_PORT = "${NOMAD_HOST_PORT_redis}" + CVAT_REDIS_PASSWORD = "" + CVAT_POSTGRES_HOST = "${NOMAD_HOST_IP_db}" + CVAT_POSTGRES_PORT = "${NOMAD_HOST_PORT_db}" + DJANGO_LOG_SERVER_HOST = "${NOMAD_HOST_IP_vector}" + DJANGO_LOG_SERVER_PORT = "${NOMAD_HOST_PORT_vector}" + NUMPROCS = "1" + } + config { + image = "${NOMAD_META_server_image}:${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}" + force_pull = "${NOMAD_META_force_pull_img_cvat_server}" + ports = ["worker-quality-reports"] + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/data:/home/django/data", + ] + command = "run" + args = [ + "worker.quality_reports" + ] + } + } + + task "worker-analytics-reports" { + lifecycle { + hook = "poststart" + sidecar = "true" + } + driver = "docker" + kill_timeout = "30s" + resources { + cores = 1 + memory = 1024 + } + env { + CVAT_REDIS_HOST = "${NOMAD_HOST_IP_redis}" + CVAT_REDIS_PORT = "${NOMAD_HOST_PORT_redis}" + CVAT_REDIS_PASSWORD = "" + CVAT_POSTGRES_HOST = "${NOMAD_HOST_IP_db}" + CVAT_POSTGRES_PORT = "${NOMAD_HOST_PORT_db}" + DJANGO_LOG_SERVER_HOST = "${NOMAD_HOST_IP_vector}" + DJANGO_LOG_SERVER_PORT = "${NOMAD_HOST_PORT_vector}" + NUMPROCS = "2" + } + config { + image = 
"${NOMAD_META_server_image}:${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}" + force_pull = "${NOMAD_META_force_pull_img_cvat_server}" + ports = ["worker-analytics-reports"] + volumes = [ + "..${NOMAD_ALLOC_DIR}/data/data:/home/django/data", + ] + command = "run" + args = [ + "worker.analytics_reports" + ] + } + } + + task "opa" { + driver = "docker" + kill_timeout = "30s" + config { + image = "${NOMAD_META_opa_image}" + ports = ["opa"] + command = "run" + args = [ + "--server", + "--log-level=info", + "--set=services.cvat.url=http://${NOMAD_HOST_ADDR_server}", + "--set=bundles.cvat.service=cvat", + "--set=bundles.cvat.resource=/api/auth/rules", + "--set=bundles.cvat.polling.min_delay_seconds=5", + "--set=bundles.cvat.polling.max_delay_seconds=15" + ] + } + } + + task "ui" { + lifecycle { + hook = "poststart" + sidecar = "true" + } + driver = "docker" + kill_timeout = "30s" + config { + image = "${NOMAD_META_ui_image}:${NOMAD_META_cvat_version}${NOMAD_META_cvat_version_custom}" + force_pull = "${NOMAD_META_force_pull_img_cvat_ui}" + ports = ["ui"] + } + } + } +} diff --git a/etc/tools/ai4os-cvat/user.yaml b/etc/tools/ai4os-cvat/user.yaml new file mode 100644 index 0000000..afb9fd4 --- /dev/null +++ b/etc/tools/ai4os-cvat/user.yaml @@ -0,0 +1,66 @@ +--- +# User customizable configuration to make a deployment in Nomad. +# Additional non-customizable values (eg. ports) are hardcoded in `job.nomad`. + +# All conf parameters follow the same structure: +# varname: +# name: name of the parameter to be displayed to end user (mandatory) +# value: (default) value of the parameter (mandatory) +# options: restricted set of values that the parameter can take (optional) +# description: some comments on the parameter to be displayed to the end user (optional) + + +general: + + title: + name: Deployment title + value: '' + description: Provide short title for this deployment (less than 45 characters). Useful when you have lots of different active deployments. + + desc: + name: Deployment description + value: '' + description: Provide some additional extended information about this deployment. + + cvat_username: + name: CVAT superuser name + value: '' + description: Select a username (or email) for your CVAT instance superuser. + + cvat_password: + name: CVAT superuser password + value: '' + description: Select a password for your CVAT instance superuser. + +storage: + + rclone_conf: + name: RCLONE configuration + value: '/srv/.rclone/rclone.conf' + description: rclone.conf location + + rclone_url: + name: Storage URL + value: '' + description: Remote storage link to be accessed via rclone (webdav). 
+
+  rclone_vendor:
+    name: RCLONE vendor
+    value: 'nextcloud'
+    options: ['nextcloud']
+    description: RCLONE vendor (webdav)
+
+  rclone_user:
+    name: RCLONE user
+    value: ''
+    description: rclone user to access a webdav remote storage
+
+  rclone_password:
+    name: RCLONE user password
+    value: ''
+
+  cvat_backup:
+    name: CVAT backups
+    value: ''
+    options: ['']
+    description: CVAT backup to restore from
diff --git a/tests/catalog/tools.py b/tests/catalog/tools.py
index a9e2aa4..78f8e6e 100644
--- a/tests/catalog/tools.py
+++ b/tests/catalog/tools.py
@@ -1,6 +1,22 @@
+
+import os
+from types import SimpleNamespace
+
 from ai4papi.routers.v1.catalog.tools import Tools
 
+# Retrieve EGI token (not generated on the fly, in case there are rate-limiting
+# issues when too many queries are made)
+token = os.getenv('TMP_EGI_TOKEN')
+if not token:
+    raise Exception(
+'Please remember to set a token as ENV variable before executing \
+the tests! \n\n \
+   export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \
+If running from VS Code, make sure to launch `code` from that terminal so it can access \
+that ENV variable.'
+    )
+
 # List tools
 tools_list = list(Tools.get_items().keys())
 
@@ -32,23 +48,27 @@
 tools_tags = Tools.get_tags()
 assert isinstance(tools_tags, list)  # empty list; deprecated method
 
-# Explore an individual tool
-tool_name = tools_list[0]
+# Explore individual tools
+# Unlike with modules, we do this for all tools, because each tool has its own
+# particular configuration
+for tool_name in tools_list:
 
-# Get tool config
-tool_conf = Tools.get_config(
-    item_name=tool_name,
-    vo='vo.ai4eosc.eu',
-)
-assert isinstance(tool_conf, dict)
-assert 'general' in tool_conf.keys()
+    print(f'  - Testing {tool_name}')
 
-# Get tool metadata
-tool_meta = Tools.get_metadata(
-    item_name=tool_name,
-)
-assert isinstance(tool_meta, dict)
-assert 'title' in tool_meta.keys()
+    # Get tool config
+    tool_conf = Tools.get_config(
+        item_name=tool_name,
+        vo='vo.ai4eosc.eu',
+    )
+    assert isinstance(tool_conf, dict)
+    assert 'general' in tool_conf.keys()
+
+    # Get tool metadata
+    tool_meta = Tools.get_metadata(
+        item_name=tool_name,
+    )
+    assert isinstance(tool_meta, dict)
+    assert 'title' in tool_meta.keys()
 
 #TODO: we should not be able to get config or metadata for a module_name
diff --git a/tests/deployments/modules.py b/tests/deployments/modules.py
index 9b6a266..28b5315 100644
--- a/tests/deployments/modules.py
+++ b/tests/deployments/modules.py
@@ -29,6 +29,8 @@
 assert isinstance(rcreate, dict)
 assert 'job_ID' in rcreate.keys()
 
+time.sleep(0.2)  # Nomad takes some time to allocate the deployment
+
 # Retrieve that module
 rdep = modules.get_deployment(
     vo='vo.ai4eosc.eu',
@@ -40,6 +42,7 @@
 assert isinstance(rdep, dict)
 assert 'job_ID' in rdep.keys()
 assert rdep['job_ID']==rcreate['job_ID']
+assert rdep['status']!='error'
 
 # Retrieve all modules
 rdeps = modules.get_deployments(
@@ -50,6 +53,7 @@
 )
 assert isinstance(rdeps, list)
 assert any([d['job_ID']==rcreate['job_ID'] for d in rdeps])
+assert all([d['status']!='error' for d in rdeps])
 
 # Check that we cannot retrieve that module from tools
 # This should break!
@@ -79,10 +83,11 @@
         credentials=token
     ),
 )
-time.sleep(3)  # Nomad takes some time to delete
 assert isinstance(rdel, dict)
 assert 'status' in rdel.keys()
 
+time.sleep(3)  # Nomad takes some time to delete
+
 # Check module no longer exists
 rdeps3 = modules.get_deployments(
     vos=['vo.ai4eosc.eu'],
diff --git a/tests/deployments/tools.py b/tests/deployments/tools.py
index be366b3..ade3da3 100644
--- a/tests/deployments/tools.py
+++ b/tests/deployments/tools.py
@@ -18,9 +18,12 @@
 that ENV variable.'
     )
 
+print('  Testing FL server')
+
 # Create tool
 rcreate = tools.create_deployment(
     vo='vo.ai4eosc.eu',
+    tool_name='ai4os-federated-server',
     conf={},
     authorization=SimpleNamespace(
         credentials=token
@@ -29,6 +32,8 @@
 assert isinstance(rcreate, dict)
 assert 'job_ID' in rcreate.keys()
 
+time.sleep(0.2)  # Nomad takes some time to allocate the deployment
+
 # Retrieve that tool
 rdep = tools.get_deployment(
     vo='vo.ai4eosc.eu',
@@ -40,6 +45,7 @@
 assert isinstance(rdep, dict)
 assert 'job_ID' in rdep.keys()
 assert rdep['job_ID']==rcreate['job_ID']
+assert rdep['status']!='error'
 
 # Retrieve all tools
 rdeps = tools.get_deployments(
@@ -50,6 +56,7 @@
 )
 assert isinstance(rdeps, list)
 assert any([d['job_ID']==rcreate['job_ID'] for d in rdeps])
+assert all([d['status']!='error' for d in rdeps])
 
 # Check that we cannot retrieve that tool from modules
 # This should break!
@@ -79,10 +86,11 @@
         credentials=token
     ),
 )
-time.sleep(3)  # Nomad takes some time to delete
 assert isinstance(rdel, dict)
 assert 'status' in rdel.keys()
 
+time.sleep(3)  # Nomad takes some time to delete
+
 # Check tool no longer exists
 rdeps3 = tools.get_deployments(
     vos=['vo.ai4eosc.eu'],
@@ -92,4 +100,49 @@
 )
 assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps3])
 
+############################################################
+# Additionally, test the creation of the remaining tools   #
+############################################################
+
+print('  Testing CVAT')
+
+# Create tool
+rcreate = tools.create_deployment(
+    vo='vo.ai4eosc.eu',
+    tool_name='ai4os-cvat',
+    conf={
+        'general':{
+            'title': 'CVAT test',
+            'cvat_username': 'mock_user',
+            'cvat_password': 'mock_password',
+        },
+        'storage': {
+            'rclone_conf': '/srv/.rclone/rclone.conf',
+            'rclone_url': 'https://share.services.ai4os.eu/remote.php/webdav',
+            'rclone_vendor': 'nextcloud',
+            'rclone_user': 'mock_user',
+            'rclone_password': 'mock_password',
+        }
+    },
+    authorization=SimpleNamespace(
+        credentials=token
+    ),
+)
+assert isinstance(rcreate, dict)
+assert 'job_ID' in rcreate.keys()
+
+time.sleep(0.2)  # Nomad takes some time to allocate the deployment
+
+# Delete tool
+rdel = tools.delete_deployment(
+    vo='vo.ai4eosc.eu',
+    deployment_uuid=rcreate['job_ID'],
+    authorization=SimpleNamespace(
+        credentials=token
+    ),
+)
+assert isinstance(rdel, dict)
+assert 'status' in rdel.keys()
+
 print('Deployments (tools) tests passed!')
diff --git a/tests/main.py b/tests/main.py
index a676bc2..3605e32 100644
--- a/tests/main.py
+++ b/tests/main.py
@@ -9,6 +9,7 @@
 #TODO: move to proper testing package
 #TODO: rename test script: modules --> test_modules
+#TODO: add spinners
 
 import ai4papi.conf as papiconf
 
@@ -25,4 +26,5 @@
 import routes
 import test_secrets
 import test_stats
+import test_storage
 import test_launch
diff --git a/tests/routes.py b/tests/routes.py
index 456cbdd..4635316 100644
--- a/tests/routes.py
+++ b/tests/routes.py
@@ -43,4 +43,6 @@
 assert ('/v1/try_me/nomad/{deployment_uuid}', {'GET'}) in routes
 assert ('/v1/try_me/nomad/{deployment_uuid}', {'DELETE'}) in routes
+assert ('/v1/storage/{storage_name}/ls', {'GET'}) in routes
+
 print('Checks for API routes passed!')
diff --git a/tests/test_storage.py b/tests/test_storage.py
new file mode 100644
index 0000000..11c8e28
--- /dev/null
+++ b/tests/test_storage.py
@@ -0,0 +1,28 @@
+import os
+from types import SimpleNamespace
+
+from ai4papi.routers.v1 import storage
+
+
+# Retrieve EGI token (not generated on the fly, in case there are rate-limiting
+# issues when too many queries are made)
+token = os.getenv('TMP_EGI_TOKEN')
+if not token:
+    raise Exception(
+'Please remember to set a token as ENV variable before executing \
+the tests! \n\n \
+   export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \
+If running from VS Code, make sure to launch `code` from that terminal so it can access \
+that ENV variable.'
+    )
+
+r = storage.storage_ls(
+    vo='vo.ai4eosc.eu',
+    storage_name='share.services.ai4os.eu',
+    subpath='ai4os-storage',
+    authorization=SimpleNamespace(
+        credentials=token
+    ),
+)
+
+print('Storage tests passed!')
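+
+# Note: a stricter check could also assert on the response content. `rclone lsjson`
+# prints a JSON array, so `storage_ls` is expected to return a list of dicts, one per
+# file/folder (the 'Path' key below is assumed from the standard rclone lsjson output):
+#
+#     assert isinstance(r, list)
+#     assert all('Path' in item for item in r)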